#!/usr/bin/env python3

#
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
#
# performs an estimation of Pi using Monte Carlo method
# a large amount of iterations is divided and distributed to compute units
# a lot of options are provided to perform scalabilty tests
#
# use -h for complete set of options
#
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
#

# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
# Thanks to Andreas Klockner for PyOpenCL:
# http://mathema.tician.de/software/pyopencl
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Common tools
import numpy
import sys
import getopt
import time
import itertools
from socket import gethostname


class PenStacle:
    """Pentacle of Statistics from data"""

    Avg = 0
    Med = 0
    Std = 0
    Min = 0
    Max = 0

    def __init__(self, Data):
        self.Avg = numpy.average(Data)
        self.Med = numpy.median(Data)
        self.Std = numpy.std(Data)
        self.Max = numpy.max(Data)
        self.Min = numpy.min(Data)

    def display(self):
        print("%s %s %s %s %s" % (self.Avg, self.Med, self.Std, self.Min, self.Max))


class Experience:
    """Metrology for experiences"""

    DeviceStyle = ""
    DeviceId = 0
    AvgD = 0
    MedD = 0
    StdD = 0
    MinD = 0
    MaxD = 0
    AvgR = 0
    MedR = 0
    StdR = 0
    MinR = 0
    MaxR = 0

    def __init__(self, DeviceStyle, DeviceId, Iterations):
        self.DeviceStyle = DeviceStyle
        self.DeviceId = DeviceId
        self.Iterations

    def Metrology(self, Data):
        Duration = PenStacle(Data)
        Rate = PenStacle(Iterations / Data)
        print("Duration %s" % Duration)
        print("Rate %s" % Rate)


def DictionariesAPI():
    Marsaglia = {"CONG": 0, "SHR3": 1, "MWC": 2, "KISS": 3}
    Computing = {"INT32": 0, "INT64": 1, "FP32": 2, "FP64": 3}
    Test = {True: 1, False: 0}
    return (Marsaglia, Computing, Test)


# find prime factors of a number
# Get for WWW :
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):

    factorlist = numpy.array([]).astype("uint32")
    loop = 2
    while loop <= x:
        if x % loop == 0:
            x /= loop
            factorlist = numpy.append(factorlist, [loop])
        else:
            loop += 1
    return factorlist


# Try to find the best thread number in Hybrid approach (Blocks&Threads)
# output is thread number
def BestThreadsNumber(jobs):
    factors = PrimeFactors(jobs)
    matrix = numpy.append([factors], [factors[::-1]], axis=0)
    threads = 1
    for factor in matrix.transpose().ravel():
        threads = threads * factor
        if threads * threads > jobs or threads > 512:
            break
    return int(threads)


# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
    return T1 * (1 - p + p / N)


# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
    return T1 * (s + p / N)


# Predicted Mylq Law with first order
def Mylq(N, T1, s, c, p):
    return T1 * (s + p / N) + c * N


# Predicted Mylq Law with second order
def Mylq2(N, T1, s, c1, c2, p):
    return T1 * (s + p / N) + c1 * N + c2 * N * N


def KernelCodeCuda():
    KERNEL_CODE_CUDA = """
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

#define IFTHEN 1

// Marsaglia RNG very simple implementation

#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f

__device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
{

#if TRNG == TCONG
   uint jcong=seed_z+work;
#elif TRNG == TSHR3
   uint jsr=seed_w+work;
#elif TRNG == TMWC
   uint z=seed_z+work;
   uint w=seed_w+work;
#elif TRNG == TKISS
   uint jcong=seed_z+work;
   uint jsr=seed_w+work;
   uint z=seed_z-work;
   uint w=seed_w-work;
#endif

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
    #define THEONE 1073741824
    #if TRNG == TCONG
        uint x=CONG>>17 ;
        uint y=CONG>>17 ;
    #elif TRNG == TSHR3
        uint x=SHR3>>17 ;
        uint y=SHR3>>17 ;
    #elif TRNG == TMWC
        uint x=MWC>>17 ;
        uint y=MWC>>17 ;
    #elif TRNG == TKISS
        uint x=KISS>>17 ;
        uint y=KISS>>17 ;
    #endif
#elif TYPE == TINT64
    #define THEONE 4611686018427387904
    #if TRNG == TCONG
        ulong x=(ulong)(CONG>>1) ;
        ulong y=(ulong)(CONG>>1) ;
    #elif TRNG == TSHR3
        ulong x=(ulong)(SHR3>>1) ;
        ulong y=(ulong)(SHR3>>1) ;
    #elif TRNG == TMWC
        ulong x=(ulong)(MWC>>1) ;
        ulong y=(ulong)(MWC>>1) ;
    #elif TRNG == TKISS
        ulong x=(ulong)(KISS>>1) ;
        ulong y=(ulong)(KISS>>1) ;
    #endif
#elif TYPE == TFP32
    #define THEONE 1.0f
    #if TRNG == TCONG
        float x=CONGfp ;
        float y=CONGfp ;
    #elif TRNG == TSHR3
        float x=SHR3fp ;
        float y=SHR3fp ;
    #elif TRNG == TMWC
        float x=MWCfp ;
        float y=MWCfp ;
    #elif TRNG == TKISS
      float x=KISSfp ;
      float y=KISSfp ;
    #endif
#elif TYPE == TFP64
    #define THEONE 1.0f
    #if TRNG == TCONG
        double x=(double)CONGfp ;
        double y=(double)CONGfp ;
    #elif TRNG == TSHR3
        double x=(double)SHR3fp ;
        double y=(double)SHR3fp ;
    #elif TRNG == TMWC
        double x=(double)MWCfp ;
        double y=(double)MWCfp ;
    #elif TRNG == TKISS
        double x=(double)KISSfp ;
        double y=(double)KISSfp ;
    #endif
#endif

#if TEST == IFTHEN
      if ((x*x+y*y) <=THEONE) {
         total+=1;
      }
#else
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
      total+=inside;
#endif
   }

   return(total);
}

__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
   s[blockIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
   s[threadIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
   __syncthreads();
}

"""
    return KERNEL_CODE_CUDA


def KernelCodeOpenCL():
    KERNEL_CODE_OPENCL = """
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

#define IFTHEN 1

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)

#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f

ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
{

#if TRNG == TCONG
   uint jcong=seed_z+work;
#elif TRNG == TSHR3
   uint jsr=seed_w+work;
#elif TRNG == TMWC
   uint z=seed_z+work;
   uint w=seed_w+work;
#elif TRNG == TKISS
   uint jcong=seed_z+work;
   uint jsr=seed_w+work;
   uint z=seed_z-work;
   uint w=seed_w-work;
#endif

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
    #define THEONE 1073741824
    #if TRNG == TCONG
        uint x=CONG>>17 ;
        uint y=CONG>>17 ;
    #elif TRNG == TSHR3
        uint x=SHR3>>17 ;
        uint y=SHR3>>17 ;
    #elif TRNG == TMWC
        uint x=MWC>>17 ;
        uint y=MWC>>17 ;
    #elif TRNG == TKISS
        uint x=KISS>>17 ;
        uint y=KISS>>17 ;
    #endif
#elif TYPE == TINT64
    #define THEONE 4611686018427387904
    #if TRNG == TCONG
        ulong x=(ulong)(CONG>>1) ;
        ulong y=(ulong)(CONG>>1) ;
    #elif TRNG == TSHR3
        ulong x=(ulong)(SHR3>>1) ;
        ulong y=(ulong)(SHR3>>1) ;
    #elif TRNG == TMWC
        ulong x=(ulong)(MWC>>1) ;
        ulong y=(ulong)(MWC>>1) ;
    #elif TRNG == TKISS
        ulong x=(ulong)(KISS>>1) ;
        ulong y=(ulong)(KISS>>1) ;
    #endif
#elif TYPE == TFP32
    #define THEONE 1.0f
    #if TRNG == TCONG
        float x=CONGfp ;
        float y=CONGfp ;
    #elif TRNG == TSHR3
        float x=SHR3fp ;
        float y=SHR3fp ;
    #elif TRNG == TMWC
        float x=MWCfp ;
        float y=MWCfp ;
    #elif TRNG == TKISS
      float x=KISSfp ;
      float y=KISSfp ;
    #endif
#elif TYPE == TFP64
#pragma OPENCL EXTENSION cl_khr_fp64: enable
    #define THEONE 1.0f
    #if TRNG == TCONG
        double x=(double)CONGfp ;
        double y=(double)CONGfp ;
    #elif TRNG == TSHR3
        double x=(double)SHR3fp ;
        double y=(double)SHR3fp ;
    #elif TRNG == TMWC
        double x=(double)MWCfp ;
        double y=(double)MWCfp ;
    #elif TRNG == TKISS
        double x=(double)KISSfp ;
        double y=(double)KISSfp ;
    #endif
#endif

#if TEST == IFTHEN
      if ((x*x+y*y) <= THEONE) {
         total+=1;
      }
#else
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
      total+=inside;
#endif
   }

   return(total);
}

__kernel void MainLoopGlobal(
    __global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
   barrier(CLK_GLOBAL_MEM_FENCE);
   s[get_global_id(0)]=total;
}

__kernel void MainLoopLocal(
    __global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
   barrier(CLK_LOCAL_MEM_FENCE);
   s[get_local_id(0)]=total;
}

__kernel void MainLoopHybrid(
    __global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
   barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);
   s[get_global_id(0)]=total;
}

"""
    return KERNEL_CODE_OPENCL


def MetropolisCuda(InputCU):

    print("Inside ", InputCU)

    iterations = InputCU["Iterations"]
    steps = InputCU["Steps"]
    blocks = InputCU["Blocks"]
    threads = InputCU["Threads"]
    Device = InputCU["Device"]
    RNG = InputCU["RNG"]
    ValueType = InputCU["ValueType"]
    Seeds = InputCU["Seeds"]

    Marsaglia, Computing, Test = DictionariesAPI()

    try:
        # For PyCUDA import
        import pycuda.driver as cuda
        from pycuda.compiler import SourceModule

        cuda.init()
        for Id in range(cuda.Device.count()):
            if Id == Device:
                XPU = cuda.Device(Id)
                print("GPU selected %s" % XPU.name())
        print

    except ImportError:
        print("Platform does not seem to support CUDA")

    circle = numpy.zeros(blocks * threads).astype(numpy.uint64)
    circleCU = cuda.InOut(circle)
    # circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize)
    # cuda.memcpy_htod(circleCU, circle)

    Context = XPU.make_context()

    try:
        mod = SourceModule(
            KernelCodeCuda(),
            options=[
                "--compiler-options",
                "-DTRNG=%i -DTYPE=%s" % (Marsaglia[RNG], Computing[ValueType]),
            ],
        )
        # mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True)
        # Needed to set the compiler via ccbin for CUDA9 implementation
        # mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.9','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True)  # noqa: E501
    except Exception:
        print("Compilation seems to break")

    MetropolisBlocksCU = mod.get_function("MainLoopBlocks")  # noqa: F841
    MetropolisThreadsCU = mod.get_function("MainLoopThreads")  # noqa: F841
    MetropolisHybridCU = mod.get_function("MainLoopHybrid")

    MyDuration = numpy.zeros(steps)

    jobs = blocks * threads

    iterationsCU = numpy.uint64(iterations / jobs)
    if iterations % jobs != 0:
        iterationsCU += numpy.uint64(1)

    for i in range(steps):
        start_time = time.time()

        try:
            MetropolisHybridCU(
                circleCU,
                numpy.uint64(iterationsCU),
                numpy.uint32(Seeds[0]),
                numpy.uint32(Seeds[1]),
                grid=(blocks, 1),
                block=(threads, 1, 1),
            )
        except Exception:
            print("Crash during CUDA call")

        elapsed = time.time() - start_time
        print(
            "(Blocks/Threads)=(%i,%i) method done in %.2f s..."
            % (blocks, threads, elapsed)
        )

        MyDuration[i] = elapsed

    OutputCU = {
        "Inside": sum(circle),
        "NewIterations": numpy.uint64(iterationsCU * jobs),
        "Duration": MyDuration,
    }
    print(OutputCU)
    Context.pop()

    Context.detach()

    return OutputCU


def MetropolisOpenCL(InputCL):

    import pyopencl as cl

    iterations = InputCL["Iterations"]
    steps = InputCL["Steps"]
    blocks = InputCL["Blocks"]
    threads = InputCL["Threads"]
    Device = InputCL["Device"]
    RNG = InputCL["RNG"]
    ValueType = InputCL["ValueType"]
    TestType = InputCL["IfThen"]
    Seeds = InputCL["Seeds"]

    Marsaglia, Computing, Test = DictionariesAPI()

    # Initialisation des variables en les CASTant correctement
    Id = 0
    HasXPU = False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id == Device:
                XPU = device
                print("CPU/GPU selected: ", device.name.lstrip())
                HasXPU = True
            Id += 1
            # print(Id)

    if not HasXPU:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device, Id - 1))
        sys.exit()

    # Je cree le contexte et la queue pour son execution
    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(
            ctx, properties=cl.command_queue_properties.PROFILING_ENABLE
        )
    except Exception:
        print("Crash during context creation")

    # Je recupere les flag possibles pour les buffers
    mf = cl.mem_flags

    circle = numpy.zeros(blocks * threads).astype(numpy.uint64)
    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=circle)

    MetropolisCL = cl.Program(ctx, KernelCodeOpenCL()).build(
        options="-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s -DTEST=%s"
        % (Marsaglia[RNG], Computing[ValueType], Test[TestType])
    )

    MyDuration = numpy.zeros(steps)

    jobs = blocks * threads

    iterationsCL = numpy.uint64(iterations / jobs)
    if iterations % jobs != 0:
        iterationsCL += 1

    for i in range(steps):
        start_time = time.time()
        if threads == 1:
            CLLaunch = MetropolisCL.MainLoopGlobal(
                queue,
                (blocks,),
                None,
                circleCL,
                numpy.uint64(iterationsCL),
                numpy.uint32(Seeds[0]),
                numpy.uint32(Seeds[1]),
            )
        else:
            CLLaunch = MetropolisCL.MainLoopHybrid(
                queue,
                (jobs,),
                (threads,),
                circleCL,
                numpy.uint64(iterationsCL),
                numpy.uint32(Seeds[0]),
                numpy.uint32(Seeds[1]),
            )

        CLLaunch.wait()
        cl.enqueue_copy(queue, circle, circleCL).wait()

        elapsed = time.time() - start_time
        print(
            "(Blocks/Threads)=(%i,%i) method done in %.2f s..."
            % (blocks, threads, elapsed)
        )

        # Elapsed method based on CLLaunch doesn't work for Beignet OpenCL
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

        # print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
        MyDuration[i] = elapsed
        # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
        # MyPi[i]=numpy.median(AllPi)
        # print MyPi[i],numpy.std(AllPi),MyDuration[i]

    circleCL.release()

    OutputCL = {
        "Inside": sum(circle),
        "NewIterations": numpy.uint64(iterationsCL * jobs),
        "Duration": MyDuration,
    }
    # print(OutputCL)
    return OutputCL


def FitAndPrint(N, D, Curves):

    from scipy.optimize import curve_fit
    import matplotlib.pyplot as plt

    try:
        coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

        D_Amdahl = Amdahl(N, coeffs_Amdahl[0], coeffs_Amdahl[1], coeffs_Amdahl[2])
        coeffs_Amdahl[1] = coeffs_Amdahl[1] * coeffs_Amdahl[0] / D[0]
        coeffs_Amdahl[2] = coeffs_Amdahl[2] * coeffs_Amdahl[0] / D[0]
        coeffs_Amdahl[0] = D[0]
        print(
            "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)"
            % (coeffs_Amdahl[0], coeffs_Amdahl[1], coeffs_Amdahl[2])
        )
    except Exception:
        print("Impossible to fit for Amdahl law : only %i elements" % len(D))

    try:
        coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

        # D_AmdahlR = AmdahlR(N, coeffs_AmdahlR[0], coeffs_AmdahlR[1])
        coeffs_AmdahlR[1] = coeffs_AmdahlR[1] * coeffs_AmdahlR[0] / D[0]
        coeffs_AmdahlR[0] = D[0]
        print(
            "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)"
            % (coeffs_AmdahlR[0], 1 - coeffs_AmdahlR[1], coeffs_AmdahlR[1])
        )

    except Exception:
        print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D))

    try:
        coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

        coeffs_Mylq[1] = coeffs_Mylq[1] * coeffs_Mylq[0] / D[0]
        # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
        coeffs_Mylq[3] = coeffs_Mylq[3] * coeffs_Mylq[0] / D[0]
        coeffs_Mylq[0] = D[0]
        print(
            "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N"
            % (coeffs_Mylq[0], coeffs_Mylq[1], coeffs_Mylq[3], coeffs_Mylq[2])
        )
        D_Mylq = Mylq(N, coeffs_Mylq[0], coeffs_Mylq[1], coeffs_Mylq[2],
                coeffs_Mylq[3])
    except Exception:
        print("Impossible to fit for Mylq law : only %i elements" % len(D))

    try:
        coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

        coeffs_Mylq2[1] = coeffs_Mylq2[1] * coeffs_Mylq2[0] / D[0]
        # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
        # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
        coeffs_Mylq2[4] = coeffs_Mylq2[4] * coeffs_Mylq2[0] / D[0]
        coeffs_Mylq2[0] = D[0]
        print(
            "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2"
            % (
                coeffs_Mylq2[0],
                coeffs_Mylq2[1],
                coeffs_Mylq2[4],
                coeffs_Mylq2[2],
                coeffs_Mylq2[3],
            )
        )

    except Exception:
        print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D))

    if Curves:
        plt.xlabel("Number of Threads/work Items")
        plt.ylabel("Total Elapsed Time")

        (Experience,) = plt.plot(N, D, "ro")
    try:
        (pAmdahl,) = plt.plot(N, D_Amdahl, label="Loi de Amdahl")
        (pMylq,) = plt.plot(N, D_Mylq, label="Loi de Mylq")
    except Exception:
        print("Fit curves seem not to be available")

    plt.legend()
    plt.show()


if __name__ == "__main__":

    # Set defaults values

    # GPU style can be Cuda (Nvidia implementation) or OpenCL
    GpuStyle = "OpenCL"
    # Iterations is integer
    Iterations = 1000000000
    # BlocksBlocks in first number of Blocks to explore
    BlocksBegin = 1024
    # BlocksEnd is last number of Blocks to explore
    BlocksEnd = 1024
    # BlocksStep is the step of Blocks to explore
    BlocksStep = 1
    # ThreadsBlocks in first number of Blocks to explore
    ThreadsBegin = 1
    # ThreadsEnd is last number of Blocks to explore
    ThreadsEnd = 1
    # ThreadsStep is the step of Blocks to explore
    ThreadsStep = 1
    # Redo is the times to redo the test to improve metrology
    Redo = 1
    # OutMetrology is method for duration estimation : False is GPU inside
    OutMetrology = False
    Metrology = "InMetro"
    # Curves is True to print the curves
    Curves = False
    # Fit is True to print the curves
    Fit = False
    # Inside based on If
    IfThen = False
    # Marsaglia RNG
    RNG = "MWC"
    # Value type : INT32, INT64, FP32, FP64
    ValueType = "FP32"
    # Seeds for RNG
    Seeds = 110271, 101008

    HowToUse = "%s -o (Out of Core Metrology) -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>"  # noqa: E501

    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            "hockg:i:b:e:s:f:l:t:r:d:m:v:",
            [
                "gpustyle=",
                "iterations=",
                "blocksBegin=",
                "blocksEnd=",
                "blocksStep=",
                "threadsFirst=",
                "threadsLast=",
                "threadssTep=",
                "redo=",
                "device=",
                "marsaglia=",
                "valuetype=",
            ],
        )
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices = []
    Alu = {}

    for opt, arg in opts:
        if opt == "-h":
            print(HowToUse % sys.argv[0])

            print("\nInformations about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl

                Id = 0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        # deviceType=cl.device_type.to_string(device.type)
                        deviceType = "xPU"
                        print(
                            "Device #%i from %s of type %s : %s"
                            % (
                                Id,
                                platform.vendor.lstrip(),
                                deviceType,
                                device.name.lstrip(),
                            )
                        )
                        Id = Id + 1

            except Exception:
                print("Your platform does not seem to support OpenCL")

            print("\nInformations about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda

                cuda.init()
                for Id in range(cuda.Device.count()):
                    device = cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id, device.name()))
                print
            except Exception:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt == "-o":
            OutMetrology = True
            Metrology = "OutMetro"
        elif opt == "-c":
            Curves = True
        elif opt == "-k":
            IfThen = True
        elif opt in ("-d", "--device"):
            Devices.append(int(arg))
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-m", "--marsaglia"):
            RNG = arg
        elif opt in ("-v", "--valuetype"):
            ValueType = arg
        elif opt in ("-i", "--iterations"):
            Iterations = numpy.uint64(arg)
        elif opt in ("-b", "--blocksbegin"):
            BlocksBegin = int(arg)
            BlocksEnd = BlocksBegin
        elif opt in ("-e", "--blocksend"):
            BlocksEnd = int(arg)
        elif opt in ("-s", "--blocksstep"):
            BlocksStep = int(arg)
        elif opt in ("-f", "--threadsfirst"):
            ThreadsBegin = int(arg)
            ThreadsEnd = ThreadsBegin
        elif opt in ("-l", "--threadslast"):
            ThreadsEnd = int(arg)
        elif opt in ("-t", "--threadsstep"):
            ThreadsStep = int(arg)
        elif opt in ("-r", "--redo"):
            Redo = int(arg)

    # If no device has been specified, take the first one!
    if len(Devices) == 0:
        Devices.append(0)

    print("Devices Identification : %s" % Devices)
    print("GpuStyle used : %s" % GpuStyle)
    print("Iterations : %s" % Iterations)
    print("Number of Blocks on begin : %s" % BlocksBegin)
    print("Number of Blocks on end : %s" % BlocksEnd)
    print("Step on Blocks : %s" % BlocksStep)
    print("Number of Threads on begin : %s" % ThreadsBegin)
    print("Number of Threads on end : %s" % ThreadsEnd)
    print("Step on Threads : %s" % ThreadsStep)
    print("Number of redo : %s" % Redo)
    print("Metrology done out of XPU : %r" % OutMetrology)
    print("Type of Marsaglia RNG used : %s" % RNG)
    print("Type of variable : %s" % ValueType)

    if GpuStyle == "CUDA":
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device = cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id, device.name()))
                if Id in Devices:
                    Alu[Id] = "GPU"

        except ImportError:
            print("Platform does not seem to support CUDA")

    if GpuStyle == "OpenCL":
        try:
            # For PyOpenCL import
            import pyopencl as cl

            Id = 0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    # deviceType=cl.device_type.to_string(device.type)
                    deviceType = "xPU"
                    print(
                        "Device #%i from %s of type %s : %s"
                        % (
                            Id,
                            platform.vendor.lstrip().rstrip(),
                            deviceType,
                            device.name.lstrip().rstrip(),
                        )
                    )

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id] = deviceType
                    Id = Id + 1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    # print(Devices,Alu)

    BlocksList = range(BlocksBegin, BlocksEnd + BlocksStep, BlocksStep)
    ThreadsList = range(ThreadsBegin, ThreadsEnd + ThreadsStep, ThreadsStep)

    ExploredJobs = numpy.array([]).astype(numpy.uint32)
    ExploredBlocks = numpy.array([]).astype(numpy.uint32)
    ExploredThreads = numpy.array([]).astype(numpy.uint32)
    avgD = numpy.array([]).astype(numpy.float32)
    medD = numpy.array([]).astype(numpy.float32)
    stdD = numpy.array([]).astype(numpy.float32)
    minD = numpy.array([]).astype(numpy.float32)
    maxD = numpy.array([]).astype(numpy.float32)
    avgR = numpy.array([]).astype(numpy.float32)
    medR = numpy.array([]).astype(numpy.float32)
    stdR = numpy.array([]).astype(numpy.float32)
    minR = numpy.array([]).astype(numpy.float32)
    maxR = numpy.array([]).astype(numpy.float32)

    for Blocks, Threads in itertools.product(BlocksList, ThreadsList):

        # print Blocks,Threads
        circle = numpy.zeros(Blocks * Threads).astype(numpy.uint64)
        ExploredJobs = numpy.append(ExploredJobs, Blocks * Threads)
        ExploredBlocks = numpy.append(ExploredBlocks, Blocks)
        ExploredThreads = numpy.append(ExploredThreads, Threads)

        if OutMetrology:
            DurationItem = numpy.array([]).astype(numpy.float32)
            Duration = numpy.array([]).astype(numpy.float32)
            Rate = numpy.array([]).astype(numpy.float32)
            for i in range(Redo):
                start = time.time()
                if GpuStyle == "CUDA":
                    try:
                        InputCU = {}
                        InputCU["Iterations"] = Iterations
                        InputCU["Steps"] = 1
                        InputCU["Blocks"] = Blocks
                        InputCU["Threads"] = Threads
                        InputCU["Device"] = Devices[0]
                        InputCU["RNG"] = RNG
                        InputCU["Seeds"] = Seeds
                        InputCU["ValueType"] = ValueType
                        InputCU["IfThen"] = IfThen
                        OutputCU = MetropolisCuda(InputCU)
                        Inside = OutputCU["Circle"]
                        NewIterations = OutputCU["NewIterations"]
                        Duration = OutputCU["Duration"]
                    except Exception:
                        print(
                            "Problem with (%i,%i) // computations on Cuda"
                            % (Blocks, Threads)
                        )
                elif GpuStyle == "OpenCL":
                    try:
                        InputCL = {}
                        InputCL["Iterations"] = Iterations
                        InputCL["Steps"] = 1
                        InputCL["Blocks"] = Blocks
                        InputCL["Threads"] = Threads
                        InputCL["Device"] = Devices[0]
                        InputCL["RNG"] = RNG
                        InputCL["Seeds"] = Seeds
                        InputCL["ValueType"] = ValueType
                        InputCL["IfThen"] = IfThen
                        OutputCL = MetropolisOpenCL(InputCL)
                        Inside = OutputCL["Circle"]
                        NewIterations = OutputCL["NewIterations"]
                        Duration = OutputCL["Duration"]
                    except Exception:
                        print(
                            "Problem with (%i,%i) // computations on OpenCL"
                            % (Blocks, Threads)
                        )
                Duration = numpy.append(Duration, time.time() - start)
                Rate = numpy.append(Rate, NewIterations / Duration[-1])
        else:
            if GpuStyle == "CUDA":
                try:
                    InputCU = {}
                    InputCU["Iterations"] = Iterations
                    InputCU["Steps"] = Redo
                    InputCU["Blocks"] = Blocks
                    InputCU["Threads"] = Threads
                    InputCU["Device"] = Devices[0]
                    InputCU["RNG"] = RNG
                    InputCU["Seeds"] = Seeds
                    InputCU["ValueType"] = ValueType
                    InputCU["IfThen"] = IfThen
                    OutputCU = MetropolisCuda(InputCU)
                    Inside = OutputCU["Inside"]
                    NewIterations = OutputCU["NewIterations"]
                    Duration = OutputCU["Duration"]
                    pycuda.context.pop()  # noqa: F821
                except Exception:
                    print(
                        "Problem with (%i,%i) // computations on Cuda"
                        % (Blocks, Threads)
                    )
            elif GpuStyle == "OpenCL":
                try:
                    InputCL = {}
                    InputCL["Iterations"] = Iterations
                    InputCL["Steps"] = Redo
                    InputCL["Blocks"] = Blocks
                    InputCL["Threads"] = Threads
                    InputCL["Device"] = Devices[0]
                    InputCL["RNG"] = RNG
                    InputCL["Seeds"] = Seeds
                    InputCL["ValueType"] = ValueType
                    InputCL["IfThen"] = IfThen
                    OutputCL = MetropolisOpenCL(InputCL)
                    Inside = OutputCL["Inside"]
                    NewIterations = OutputCL["NewIterations"]
                    Duration = OutputCL["Duration"]
                except Exception:
                    print(
                        "Problem with (%i,%i) // computations on OpenCL"
                        % (Blocks, Threads)
                    )
            Rate = NewIterations / Duration[-1]
            print(
                "Itops %i\nLogItops %.2f "
                % (int(Rate), numpy.log(Rate) / numpy.log(10))
            )
            print("Pi estimation %.8f" % (4.0 / NewIterations * Inside))

        avgD = numpy.append(avgD, numpy.average(Duration))
        medD = numpy.append(medD, numpy.median(Duration))
        stdD = numpy.append(stdD, numpy.std(Duration))
        minD = numpy.append(minD, numpy.min(Duration))
        maxD = numpy.append(maxD, numpy.max(Duration))
        avgR = numpy.append(avgR, numpy.average(Rate))
        medR = numpy.append(medR, numpy.median(Rate))
        stdR = numpy.append(stdR, numpy.std(Rate))
        minR = numpy.append(minR, numpy.min(Rate))
        maxR = numpy.append(maxR, numpy.max(Rate))

        print(
            "%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i"
            % (
                avgD[-1],
                medD[-1],
                stdD[-1],
                minD[-1],
                maxD[-1],
                avgR[-1],
                medR[-1],
                stdR[-1],
                minR[-1],
                maxR[-1],
            )
        )

        numpy.savez(
            "Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s"
            % (
                ValueType,
                RNG,
                Alu[Devices[0]],
                GpuStyle,
                BlocksBegin,
                BlocksEnd,
                ThreadsBegin,
                ThreadsEnd,
                Iterations,
                Devices[0],
                Metrology,
                gethostname(),
            ),
            (
                ExploredBlocks,
                ExploredThreads,
                avgD,
                medD,
                stdD,
                minD,
                maxD,
                avgR,
                medR,
                stdR,
                minR,
                maxR,
            ),
        )
        ToSave = [
            ExploredBlocks,
            ExploredThreads,
            avgD,
            medD,
            stdD,
            minD,
            maxD,
            avgR,
            medR,
            stdR,
            minR,
            maxR,
        ]
        numpy.savetxt(
            "Pi_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s"
            % (
                ValueType,
                RNG,
                Alu[Devices[0]],
                GpuStyle,
                BlocksBegin,
                BlocksEnd,
                ThreadsBegin,
                ThreadsEnd,
                Iterations,
                Devices[0],
                Metrology,
                gethostname(),
            ),
            numpy.transpose(ToSave),
            fmt="%i %i %e %e %e %e %e %i %i %i %i %i",
        )

    if Fit:
        FitAndPrint(ExploredJobs, median, Curves)  # noqa: F821, E501 # FIXME: undefined var 'median'