Skip to content
Snippets Groups Projects
pi-monte-carlo.py 34.6 KiB
Newer Older
  • Learn to ignore specific revisions
  • Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    #!/usr/bin/env python3
    
    #
    # Pi-by-MonteCarlo using PyCUDA/PyOpenCL
    #
    # performs an estimation of Pi using Monte Carlo method
    # a large amount of iterations is divided and distributed to compute units
    # a lot of options are provided to perform scalabilty tests
    #
    # use -h for complete set of options
    #
    
    # CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    # Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
    #
    
    # Thanks to Andreas Klockner for PyCUDA:
    # http://mathema.tician.de/software/pycuda
    # Thanks to Andreas Klockner for PyOpenCL:
    # http://mathema.tician.de/software/pyopencl
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
    # 2013-01-01 : problems with launch timeout
    # http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
    # Option "Interactive" "0" in /etc/X11/xorg.conf
    
    # Common tools
    import numpy
    import sys
    import getopt
    import time
    import itertools
    from socket import gethostname
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    class PenStacle:
        """Pentacle of Statistics from data"""
    
    
        Avg = 0
        Med = 0
        Std = 0
        Min = 0
        Max = 0
    
        def __init__(self, Data):
            self.Avg = numpy.average(Data)
            self.Med = numpy.median(Data)
            self.Std = numpy.std(Data)
            self.Max = numpy.max(Data)
            self.Min = numpy.min(Data)
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        def display(self):
    
            print("%s %s %s %s %s" % (self.Avg, self.Med, self.Std, self.Min, self.Max))
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
    class Experience:
        """Metrology for experiences"""
    
    
        DeviceStyle = ""
        DeviceId = 0
        AvgD = 0
        MedD = 0
        StdD = 0
        MinD = 0
        MaxD = 0
        AvgR = 0
        MedR = 0
        StdR = 0
        MinR = 0
        MaxR = 0
    
        def __init__(self, DeviceStyle, DeviceId, Iterations):
            self.DeviceStyle = DeviceStyle
            self.DeviceId = DeviceId
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            self.Iterations
    
    
        def Metrology(self, Data):
            Duration = PenStacle(Data)
            Rate = PenStacle(Iterations / Data)
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            print("Duration %s" % Duration)
            print("Rate %s" % Rate)
    
    
    def DictionariesAPI():
    
        Marsaglia = {"CONG": 0, "SHR3": 1, "MWC": 2, "KISS": 3}
        Computing = {"INT32": 0, "INT64": 1, "FP32": 2, "FP64": 3}
        Test = {True: 1, False: 0}
        return (Marsaglia, Computing, Test)
    
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    # find prime factors of a number
    # Get for WWW :
    # http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
    def PrimeFactors(x):
    
    
        factorlist = numpy.array([]).astype("uint32")
        loop = 2
        while loop <= x:
            if x % loop == 0:
                x /= loop
                factorlist = numpy.append(factorlist, [loop])
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            else:
    
                loop += 1
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        return factorlist
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    # Try to find the best thread number in Hybrid approach (Blocks&Threads)
    # output is thread number
    def BestThreadsNumber(jobs):
    
        factors = PrimeFactors(jobs)
        matrix = numpy.append([factors], [factors[::-1]], axis=0)
        threads = 1
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        for factor in matrix.transpose().ravel():
    
            threads = threads * factor
            if threads * threads > jobs or threads > 512:
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
                break
    
        return int(threads)
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
    
    # Predicted Amdahl Law (Reduced with s=1-p)
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    def AmdahlR(N, T1, p):
    
        return T1 * (1 - p + p / N)
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
    # Predicted Amdahl Law
    def Amdahl(N, T1, s, p):
    
        return T1 * (s + p / N)
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
    # Predicted Mylq Law with first order
    
    def Mylq(N, T1, s, c, p):
        return T1 * (s + p / N) + c * N
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
    # Predicted Mylq Law with second order
    
    def Mylq2(N, T1, s, c1, c2, p):
        return T1 * (s + p / N) + c1 * N + c2 * N * N
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
    def KernelCodeCuda():
    
        KERNEL_CODE_CUDA = """
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    #define TCONG 0
    #define TSHR3 1
    #define TMWC 2
    #define TKISS 3
    
    #define TINT32 0
    #define TINT64 1
    #define TFP32 2
    #define TFP64 3
    
    #define IFTHEN 1
    
    // Marsaglia RNG very simple implementation
    
    #define znew  ((z=36969*(z&65535)+(z>>16))<<16)
    #define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
    #define MWC   (znew+wnew)
    #define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
    #define CONG  (jcong=69069*jcong+1234567)
    #define KISS  ((MWC^CONG)+SHR3)
    
    #define MWCfp MWC * 2.328306435454494e-10f
    #define KISSfp KISS * 2.328306435454494e-10f
    #define SHR3fp SHR3 * 2.328306435454494e-10f
    #define CONGfp CONG * 2.328306435454494e-10f
    
    __device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
    {
    
    #if TRNG == TCONG
       uint jcong=seed_z+work;
    #elif TRNG == TSHR3
       uint jsr=seed_w+work;
    #elif TRNG == TMWC
       uint z=seed_z+work;
       uint w=seed_w+work;
    #elif TRNG == TKISS
       uint jcong=seed_z+work;
       uint jsr=seed_w+work;
       uint z=seed_z-work;
       uint w=seed_w-work;
    #endif
    
       ulong total=0;
    
       for (ulong i=0;i<iterations;i++) {
    
    #if TYPE == TINT32
        #define THEONE 1073741824
        #if TRNG == TCONG
            uint x=CONG>>17 ;
            uint y=CONG>>17 ;
        #elif TRNG == TSHR3
            uint x=SHR3>>17 ;
            uint y=SHR3>>17 ;
        #elif TRNG == TMWC
            uint x=MWC>>17 ;
            uint y=MWC>>17 ;
        #elif TRNG == TKISS
            uint x=KISS>>17 ;
            uint y=KISS>>17 ;
        #endif
    #elif TYPE == TINT64
        #define THEONE 4611686018427387904
        #if TRNG == TCONG
            ulong x=(ulong)(CONG>>1) ;
            ulong y=(ulong)(CONG>>1) ;
        #elif TRNG == TSHR3
            ulong x=(ulong)(SHR3>>1) ;
            ulong y=(ulong)(SHR3>>1) ;
        #elif TRNG == TMWC
            ulong x=(ulong)(MWC>>1) ;
            ulong y=(ulong)(MWC>>1) ;
        #elif TRNG == TKISS
            ulong x=(ulong)(KISS>>1) ;
            ulong y=(ulong)(KISS>>1) ;
        #endif
    #elif TYPE == TFP32
        #define THEONE 1.0f
        #if TRNG == TCONG
            float x=CONGfp ;
            float y=CONGfp ;
        #elif TRNG == TSHR3
            float x=SHR3fp ;
            float y=SHR3fp ;
        #elif TRNG == TMWC
            float x=MWCfp ;
            float y=MWCfp ;
        #elif TRNG == TKISS
          float x=KISSfp ;
          float y=KISSfp ;
        #endif
    #elif TYPE == TFP64
        #define THEONE 1.0f
        #if TRNG == TCONG
            double x=(double)CONGfp ;
            double y=(double)CONGfp ;
        #elif TRNG == TSHR3
            double x=(double)SHR3fp ;
            double y=(double)SHR3fp ;
        #elif TRNG == TMWC
            double x=(double)MWCfp ;
            double y=(double)MWCfp ;
        #elif TRNG == TKISS
            double x=(double)KISSfp ;
            double y=(double)KISSfp ;
        #endif
    #endif
    
    #if TEST == IFTHEN
          if ((x*x+y*y) <=THEONE) {
             total+=1;
          }
    #else
          ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
          total+=inside;
    #endif
       }
    
       return(total);
    }
    
    __global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
    {
       ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
       s[blockIdx.x]=total;
       __syncthreads();
    
    }
    
    __global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
    {
       ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
       s[threadIdx.x]=total;
       __syncthreads();
    
    }
    
    __global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
    {
       ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
       s[blockDim.x*blockIdx.x+threadIdx.x]=total;
       __syncthreads();
    }
    
    """
    
        return KERNEL_CODE_CUDA
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
    def KernelCodeOpenCL():
    
        KERNEL_CODE_OPENCL = """
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    #define TCONG 0
    #define TSHR3 1
    #define TMWC 2
    #define TKISS 3
    
    #define TINT32 0
    #define TINT64 1
    #define TFP32 2
    #define TFP64 3
    
    #define IFTHEN 1
    
    // Marsaglia RNG very simple implementation
    #define znew  ((z=36969*(z&65535)+(z>>16))<<16)
    #define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
    
    #define MWC   (znew+wnew)
    #define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
    #define CONG  (jcong=69069*jcong+1234567)
    #define KISS  ((MWC^CONG)+SHR3)
    
    #define MWCfp MWC * 2.328306435454494e-10f
    #define KISSfp KISS * 2.328306435454494e-10f
    #define CONGfp CONG * 2.328306435454494e-10f
    #define SHR3fp SHR3 * 2.328306435454494e-10f
    
    ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
    {
    
    #if TRNG == TCONG
       uint jcong=seed_z+work;
    #elif TRNG == TSHR3
       uint jsr=seed_w+work;
    #elif TRNG == TMWC
       uint z=seed_z+work;
       uint w=seed_w+work;
    #elif TRNG == TKISS
       uint jcong=seed_z+work;
       uint jsr=seed_w+work;
       uint z=seed_z-work;
       uint w=seed_w-work;
    #endif
    
       ulong total=0;
    
       for (ulong i=0;i<iterations;i++) {
    
    #if TYPE == TINT32
        #define THEONE 1073741824
        #if TRNG == TCONG
            uint x=CONG>>17 ;
            uint y=CONG>>17 ;
        #elif TRNG == TSHR3
            uint x=SHR3>>17 ;
            uint y=SHR3>>17 ;
        #elif TRNG == TMWC
            uint x=MWC>>17 ;
            uint y=MWC>>17 ;
        #elif TRNG == TKISS
            uint x=KISS>>17 ;
            uint y=KISS>>17 ;
        #endif
    #elif TYPE == TINT64
        #define THEONE 4611686018427387904
        #if TRNG == TCONG
            ulong x=(ulong)(CONG>>1) ;
            ulong y=(ulong)(CONG>>1) ;
        #elif TRNG == TSHR3
            ulong x=(ulong)(SHR3>>1) ;
            ulong y=(ulong)(SHR3>>1) ;
        #elif TRNG == TMWC
            ulong x=(ulong)(MWC>>1) ;
            ulong y=(ulong)(MWC>>1) ;
        #elif TRNG == TKISS
            ulong x=(ulong)(KISS>>1) ;
            ulong y=(ulong)(KISS>>1) ;
        #endif
    #elif TYPE == TFP32
        #define THEONE 1.0f
        #if TRNG == TCONG
            float x=CONGfp ;
            float y=CONGfp ;
        #elif TRNG == TSHR3
            float x=SHR3fp ;
            float y=SHR3fp ;
        #elif TRNG == TMWC
            float x=MWCfp ;
            float y=MWCfp ;
        #elif TRNG == TKISS
          float x=KISSfp ;
          float y=KISSfp ;
        #endif
    #elif TYPE == TFP64
    #pragma OPENCL EXTENSION cl_khr_fp64: enable
        #define THEONE 1.0f
        #if TRNG == TCONG
            double x=(double)CONGfp ;
            double y=(double)CONGfp ;
        #elif TRNG == TSHR3
            double x=(double)SHR3fp ;
            double y=(double)SHR3fp ;
        #elif TRNG == TMWC
            double x=(double)MWCfp ;
            double y=(double)MWCfp ;
        #elif TRNG == TKISS
            double x=(double)KISSfp ;
            double y=(double)KISSfp ;
        #endif
    #endif
    
    #if TEST == IFTHEN
          if ((x*x+y*y) <= THEONE) {
             total+=1;
          }
    #else
          ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
          total+=inside;
    #endif
       }
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
       return(total);
    }
    
    
    __kernel void MainLoopGlobal(
        __global ulong *s,ulong iterations,uint seed_w,uint seed_z)
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    {
       ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
       barrier(CLK_GLOBAL_MEM_FENCE);
    
       s[get_global_id(0)]=total;
    
    __kernel void MainLoopLocal(
        __global ulong *s,ulong iterations,uint seed_w,uint seed_z)
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    {
       ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
       barrier(CLK_LOCAL_MEM_FENCE);
       s[get_local_id(0)]=total;
    }
    
    
    __kernel void MainLoopHybrid(
        __global ulong *s,ulong iterations,uint seed_w,uint seed_z)
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    {
       ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
       barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);
       s[get_global_id(0)]=total;
    }
    
    """
    
        return KERNEL_CODE_OPENCL
    
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    def MetropolisCuda(InputCU):
    
    
        print("Inside ", InputCU)
    
        iterations = InputCU["Iterations"]
        steps = InputCU["Steps"]
        blocks = InputCU["Blocks"]
        threads = InputCU["Threads"]
        Device = InputCU["Device"]
        RNG = InputCU["RNG"]
        ValueType = InputCU["ValueType"]
        Seeds = InputCU["Seeds"]
    
        Marsaglia, Computing, Test = DictionariesAPI()
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        try:
            # For PyCUDA import
            import pycuda.driver as cuda
            from pycuda.compiler import SourceModule
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            cuda.init()
            for Id in range(cuda.Device.count()):
    
                if Id == Device:
                    XPU = cuda.Device(Id)
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
                    print("GPU selected %s" % XPU.name())
            print
    
        except ImportError:
            print("Platform does not seem to support CUDA")
    
    
        circle = numpy.zeros(blocks * threads).astype(numpy.uint64)
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        circleCU = cuda.InOut(circle)
    
        # circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize)
        # cuda.memcpy_htod(circleCU, circle)
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
    
        Context = XPU.make_context()
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
        try:
    
            mod = SourceModule(
                KernelCodeCuda(),
                options=[
                    "--compiler-options",
                    "-DTRNG=%i -DTYPE=%s" % (Marsaglia[RNG], Computing[ValueType]),
                ],
            )
            # mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True)
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            # Needed to set the compiler via ccbin for CUDA9 implementation
    
            # mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.9','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True)  # noqa: E501
        except Exception:
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            print("Compilation seems to break")
    
    
        MetropolisBlocksCU = mod.get_function("MainLoopBlocks")  # noqa: F841
        MetropolisThreadsCU = mod.get_function("MainLoopThreads")  # noqa: F841
        MetropolisHybridCU = mod.get_function("MainLoopHybrid")
    
        MyDuration = numpy.zeros(steps)
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
    
        jobs = blocks * threads
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
    
        iterationsCU = numpy.uint64(iterations / jobs)
        if iterations % jobs != 0:
            iterationsCU += numpy.uint64(1)
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
        for i in range(steps):
    
            start_time = time.time()
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
            try:
    
                MetropolisHybridCU(
                    circleCU,
                    numpy.uint64(iterationsCU),
                    numpy.uint32(Seeds[0]),
                    numpy.uint32(Seeds[1]),
                    grid=(blocks, 1),
                    block=(threads, 1, 1),
                )
            except Exception:
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
                print("Crash during CUDA call")
    
    
            elapsed = time.time() - start_time
            print(
                "(Blocks/Threads)=(%i,%i) method done in %.2f s..."
                % (blocks, threads, elapsed)
            )
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
    
            MyDuration[i] = elapsed
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
    
        OutputCU = {
            "Inside": sum(circle),
            "NewIterations": numpy.uint64(iterationsCU * jobs),
            "Duration": MyDuration,
        }
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        print(OutputCU)
        Context.pop()
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        Context.detach()
    
    
        return OutputCU
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
    def MetropolisOpenCL(InputCL):
    
        import pyopencl as cl
    
    
        iterations = InputCL["Iterations"]
        steps = InputCL["Steps"]
        blocks = InputCL["Blocks"]
        threads = InputCL["Threads"]
        Device = InputCL["Device"]
        RNG = InputCL["RNG"]
        ValueType = InputCL["ValueType"]
        TestType = InputCL["IfThen"]
        Seeds = InputCL["Seeds"]
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
    
        Marsaglia, Computing, Test = DictionariesAPI()
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
        # Initialisation des variables en les CASTant correctement
    
        Id = 0
        HasXPU = False
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        for platform in cl.get_platforms():
            for device in platform.get_devices():
    
                if Id == Device:
                    XPU = device
                    print("CPU/GPU selected: ", device.name.lstrip())
                    HasXPU = True
                Id += 1
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
                # print(Id)
    
    
        if not HasXPU:
            print("No XPU #%i found in all of %i devices, sorry..." % (Device, Id - 1))
            sys.exit()
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
        # Je cree le contexte et la queue pour son execution
        try:
            ctx = cl.Context(devices=[XPU])
    
            queue = cl.CommandQueue(
                ctx, properties=cl.command_queue_properties.PROFILING_ENABLE
            )
        except Exception:
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            print("Crash during context creation")
    
        # Je recupere les flag possibles pour les buffers
        mf = cl.mem_flags
    
    
        circle = numpy.zeros(blocks * threads).astype(numpy.uint64)
        circleCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=circle)
    
        MetropolisCL = cl.Program(ctx, KernelCodeOpenCL()).build(
            options="-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s -DTEST=%s"
            % (Marsaglia[RNG], Computing[ValueType], Test[TestType])
        )
    
        MyDuration = numpy.zeros(steps)
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
    
        jobs = blocks * threads
    
        iterationsCL = numpy.uint64(iterations / jobs)
        if iterations % jobs != 0:
            iterationsCL += 1
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
        for i in range(steps):
    
            start_time = time.time()
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            if threads == 1:
    
                CLLaunch = MetropolisCL.MainLoopGlobal(
                    queue,
                    (blocks,),
                    None,
                    circleCL,
                    numpy.uint64(iterationsCL),
                    numpy.uint32(Seeds[0]),
                    numpy.uint32(Seeds[1]),
                )
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            else:
    
                CLLaunch = MetropolisCL.MainLoopHybrid(
                    queue,
                    (jobs,),
                    (threads,),
                    circleCL,
                    numpy.uint64(iterationsCL),
                    numpy.uint32(Seeds[0]),
                    numpy.uint32(Seeds[1]),
                )
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
            CLLaunch.wait()
            cl.enqueue_copy(queue, circle, circleCL).wait()
    
    
            elapsed = time.time() - start_time
            print(
                "(Blocks/Threads)=(%i,%i) method done in %.2f s..."
                % (blocks, threads, elapsed)
            )
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            # Elapsed method based on CLLaunch doesn't work for Beignet OpenCL
            # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
    
            # print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
    
            MyDuration[i] = elapsed
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
            # MyPi[i]=numpy.median(AllPi)
            # print MyPi[i],numpy.std(AllPi),MyDuration[i]
    
        circleCL.release()
    
    
        OutputCL = {
            "Inside": sum(circle),
            "NewIterations": numpy.uint64(iterationsCL * jobs),
            "Duration": MyDuration,
        }
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # print(OutputCL)
    
        return OutputCL
    
    def FitAndPrint(N, D, Curves):
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
        from scipy.optimize import curve_fit
        import matplotlib.pyplot as plt
    
        try:
            coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
    
    
            D_Amdahl = Amdahl(N, coeffs_Amdahl[0], coeffs_Amdahl[1], coeffs_Amdahl[2])
            coeffs_Amdahl[1] = coeffs_Amdahl[1] * coeffs_Amdahl[0] / D[0]
            coeffs_Amdahl[2] = coeffs_Amdahl[2] * coeffs_Amdahl[0] / D[0]
            coeffs_Amdahl[0] = D[0]
            print(
                "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)"
                % (coeffs_Amdahl[0], coeffs_Amdahl[1], coeffs_Amdahl[2])
            )
        except Exception:
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            print("Impossible to fit for Amdahl law : only %i elements" % len(D))
    
        try:
            coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
    
    
            # D_AmdahlR = AmdahlR(N, coeffs_AmdahlR[0], coeffs_AmdahlR[1])
            coeffs_AmdahlR[1] = coeffs_AmdahlR[1] * coeffs_AmdahlR[0] / D[0]
            coeffs_AmdahlR[0] = D[0]
            print(
                "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)"
                % (coeffs_AmdahlR[0], 1 - coeffs_AmdahlR[1], coeffs_AmdahlR[1])
            )
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
    
        except Exception:
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D))
    
        try:
            coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
    
    
            coeffs_Mylq[1] = coeffs_Mylq[1] * coeffs_Mylq[0] / D[0]
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
    
            coeffs_Mylq[3] = coeffs_Mylq[3] * coeffs_Mylq[0] / D[0]
            coeffs_Mylq[0] = D[0]
            print(
                "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N"
                % (coeffs_Mylq[0], coeffs_Mylq[1], coeffs_Mylq[3], coeffs_Mylq[2])
            )
            D_Mylq = Mylq(N, coeffs_Mylq[0], coeffs_Mylq[1], coeffs_Mylq[2],
                    coeffs_Mylq[3])
        except Exception:
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            print("Impossible to fit for Mylq law : only %i elements" % len(D))
    
        try:
            coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
    
    
            coeffs_Mylq2[1] = coeffs_Mylq2[1] * coeffs_Mylq2[0] / D[0]
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
            # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
    
            coeffs_Mylq2[4] = coeffs_Mylq2[4] * coeffs_Mylq2[0] / D[0]
            coeffs_Mylq2[0] = D[0]
            print(
                "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2"
                % (
                    coeffs_Mylq2[0],
                    coeffs_Mylq2[1],
                    coeffs_Mylq2[4],
                    coeffs_Mylq2[2],
                    coeffs_Mylq2[3],
                )
            )
    
        except Exception:
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D))
    
        if Curves:
            plt.xlabel("Number of Threads/work Items")
            plt.ylabel("Total Elapsed Time")
    
    
            (Experience,) = plt.plot(N, D, "ro")
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        try:
    
            (pAmdahl,) = plt.plot(N, D_Amdahl, label="Loi de Amdahl")
            (pMylq,) = plt.plot(N, D_Mylq, label="Loi de Mylq")
        except Exception:
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            print("Fit curves seem not to be available")
    
        plt.legend()
        plt.show()
    
    
    
    if __name__ == "__main__":
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # Set defaults values
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # GPU style can be Cuda (Nvidia implementation) or OpenCL
    
        GpuStyle = "OpenCL"
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # Iterations is integer
    
        Iterations = 1000000000
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # BlocksBlocks in first number of Blocks to explore
    
        BlocksBegin = 1024
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # BlocksEnd is last number of Blocks to explore
    
        BlocksEnd = 1024
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # BlocksStep is the step of Blocks to explore
    
        BlocksStep = 1
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # ThreadsBlocks in first number of Blocks to explore
    
        ThreadsBegin = 1
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # ThreadsEnd is last number of Blocks to explore
    
        ThreadsEnd = 1
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # ThreadsStep is the step of Blocks to explore
    
        ThreadsStep = 1
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # Redo is the times to redo the test to improve metrology
    
        Redo = 1
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # OutMetrology is method for duration estimation : False is GPU inside
    
        OutMetrology = False
        Metrology = "InMetro"
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # Curves is True to print the curves
    
        Curves = False
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # Fit is True to print the curves
    
        Fit = False
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # Inside based on If
    
        IfThen = False
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # Marsaglia RNG
    
        RNG = "MWC"
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # Value type : INT32, INT64, FP32, FP64
    
        ValueType = "FP32"
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        # Seeds for RNG
    
        Seeds = 110271, 101008
    
        HowToUse = "%s -o (Out of Core Metrology) -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>"  # noqa: E501
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
        try:
    
            opts, args = getopt.getopt(
                sys.argv[1:],
                "hockg:i:b:e:s:f:l:t:r:d:m:v:",
                [
                    "gpustyle=",
                    "iterations=",
                    "blocksBegin=",
                    "blocksEnd=",
                    "blocksStep=",
                    "threadsFirst=",
                    "threadsLast=",
                    "threadssTep=",
                    "redo=",
                    "device=",
                    "marsaglia=",
                    "valuetype=",
                ],
            )
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        except getopt.GetoptError:
            print(HowToUse % sys.argv[0])
            sys.exit(2)
    
        # List of Devices
    
        Devices = []
        Alu = {}
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        for opt, arg in opts:
    
            if opt == "-h":
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
                print(HowToUse % sys.argv[0])
    
                print("\nInformations about devices detected under OpenCL API:")
                # For PyOpenCL import
                try:
                    import pyopencl as cl
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
                    for platform in cl.get_platforms():
                        for device in platform.get_devices():
    
                            # deviceType=cl.device_type.to_string(device.type)
                            deviceType = "xPU"
                            print(
                                "Device #%i from %s of type %s : %s"
                                % (
                                    Id,
                                    platform.vendor.lstrip(),
                                    deviceType,
                                    device.name.lstrip(),
                                )
                            )
                            Id = Id + 1
    
                except Exception:
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
                    print("Your platform does not seem to support OpenCL")
    
                print("\nInformations about devices detected under CUDA API:")
                # For PyCUDA import
                try:
                    import pycuda.driver as cuda
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
                    cuda.init()
                    for Id in range(cuda.Device.count()):
    
                        device = cuda.Device(Id)
                        print("Device #%i of type GPU : %s" % (Id, device.name()))
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
                    print
    
                except Exception:
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
                    print("Your platform does not seem to support CUDA")
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
                sys.exit()
    
    
            elif opt == "-o":
                OutMetrology = True
                Metrology = "OutMetro"
            elif opt == "-c":
                Curves = True
            elif opt == "-k":
                IfThen = True
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            elif opt in ("-d", "--device"):
                Devices.append(int(arg))
            elif opt in ("-g", "--gpustyle"):
                GpuStyle = arg
            elif opt in ("-m", "--marsaglia"):
                RNG = arg
            elif opt in ("-v", "--valuetype"):
                ValueType = arg
            elif opt in ("-i", "--iterations"):
                Iterations = numpy.uint64(arg)
            elif opt in ("-b", "--blocksbegin"):
                BlocksBegin = int(arg)
    
                BlocksEnd = BlocksBegin
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            elif opt in ("-e", "--blocksend"):
                BlocksEnd = int(arg)
            elif opt in ("-s", "--blocksstep"):
                BlocksStep = int(arg)
            elif opt in ("-f", "--threadsfirst"):
                ThreadsBegin = int(arg)
                ThreadsEnd = ThreadsBegin
            elif opt in ("-l", "--threadslast"):
                ThreadsEnd = int(arg)
            elif opt in ("-t", "--threadsstep"):
                ThreadsStep = int(arg)
            elif opt in ("-r", "--redo"):
                Redo = int(arg)
    
        # If no device has been specified, take the first one!
    
        if len(Devices) == 0:
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            Devices.append(0)
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
        print("Devices Identification : %s" % Devices)
        print("GpuStyle used : %s" % GpuStyle)
        print("Iterations : %s" % Iterations)
        print("Number of Blocks on begin : %s" % BlocksBegin)
        print("Number of Blocks on end : %s" % BlocksEnd)
        print("Step on Blocks : %s" % BlocksStep)
        print("Number of Threads on begin : %s" % ThreadsBegin)
        print("Number of Threads on end : %s" % ThreadsEnd)
        print("Step on Threads : %s" % ThreadsStep)
        print("Number of redo : %s" % Redo)
        print("Metrology done out of XPU : %r" % OutMetrology)
        print("Type of Marsaglia RNG used : %s" % RNG)
        print("Type of variable : %s" % ValueType)
    
    
        if GpuStyle == "CUDA":
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            try:
                # For PyCUDA import
                import pycuda.driver as cuda
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
                cuda.init()
                for Id in range(cuda.Device.count()):
    
                    device = cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id, device.name()))
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
                    if Id in Devices:
    
                        Alu[Id] = "GPU"
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            except ImportError:
                print("Platform does not seem to support CUDA")
    
    
        if GpuStyle == "OpenCL":
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            try:
                # For PyOpenCL import
                import pyopencl as cl
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
    
                        # deviceType=cl.device_type.to_string(device.type)
                        deviceType = "xPU"
                        print(
                            "Device #%i from %s of type %s : %s"
                            % (
                                Id,
                                platform.vendor.lstrip().rstrip(),
                                deviceType,
                                device.name.lstrip().rstrip(),
                            )
                        )
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
    
                        if Id in Devices:
    
                            # Set the Alu as detected Device Type
                            Alu[Id] = deviceType
                        Id = Id + 1
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            except ImportError:
                print("Platform does not seem to support OpenCL")
    
    
        # print(Devices,Alu)
    
        BlocksList = range(BlocksBegin, BlocksEnd + BlocksStep, BlocksStep)
        ThreadsList = range(ThreadsBegin, ThreadsEnd + ThreadsStep, ThreadsStep)
    
        ExploredJobs = numpy.array([]).astype(numpy.uint32)
        ExploredBlocks = numpy.array([]).astype(numpy.uint32)
        ExploredThreads = numpy.array([]).astype(numpy.uint32)
        avgD = numpy.array([]).astype(numpy.float32)
        medD = numpy.array([]).astype(numpy.float32)
        stdD = numpy.array([]).astype(numpy.float32)
        minD = numpy.array([]).astype(numpy.float32)
        maxD = numpy.array([]).astype(numpy.float32)
        avgR = numpy.array([]).astype(numpy.float32)
        medR = numpy.array([]).astype(numpy.float32)
        stdR = numpy.array([]).astype(numpy.float32)
        minR = numpy.array([]).astype(numpy.float32)
        maxR = numpy.array([]).astype(numpy.float32)
    
        for Blocks, Threads in itertools.product(BlocksList, ThreadsList):
    
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
            # print Blocks,Threads
    
            circle = numpy.zeros(Blocks * Threads).astype(numpy.uint64)
            ExploredJobs = numpy.append(ExploredJobs, Blocks * Threads)
            ExploredBlocks = numpy.append(ExploredBlocks, Blocks)
            ExploredThreads = numpy.append(ExploredThreads, Threads)
    
            if OutMetrology:
                DurationItem = numpy.array([]).astype(numpy.float32)
                Duration = numpy.array([]).astype(numpy.float32)
                Rate = numpy.array([]).astype(numpy.float32)
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
                for i in range(Redo):
    
                    start = time.time()
                    if GpuStyle == "CUDA":
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
                        try:
    
                            InputCU = {}
                            InputCU["Iterations"] = Iterations
                            InputCU["Steps"] = 1
                            InputCU["Blocks"] = Blocks
                            InputCU["Threads"] = Threads
                            InputCU["Device"] = Devices[0]
                            InputCU["RNG"] = RNG
                            InputCU["Seeds"] = Seeds
                            InputCU["ValueType"] = ValueType
                            InputCU["IfThen"] = IfThen
                            OutputCU = MetropolisCuda(InputCU)
                            Inside = OutputCU["Circle"]
                            NewIterations = OutputCU["NewIterations"]
                            Duration = OutputCU["Duration"]
                        except Exception:
                            print(
                                "Problem with (%i,%i) // computations on Cuda"
                                % (Blocks, Threads)
                            )
                    elif GpuStyle == "OpenCL":
    
    Emmanuel QUEMENER's avatar
    Emmanuel QUEMENER committed
                        try:
    
                            InputCL = {}