pi-monte-carlo.py

#!/usr/bin/env python3

#
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
#
# performs an estimation of Pi using Monte Carlo method
# a large amount of iterations is divided and distributed to compute units
# a lot of options are provided to perform scalabilty tests
#
# use -h for complete set of options
#
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
#

# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
# Thanks to Andreas Klockner for PyOpenCL:
# http://mathema.tician.de/software/pyopencl
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Common tools
import numpy
import sys
import getopt
import time
import itertools
from socket import gethostname


class PenStacle:
    """Pentacle of Statistics from data"""

    Avg = 0
    Med = 0
    Std = 0
    Min = 0
    Max = 0

    def __init__(self, Data):
        self.Avg = numpy.average(Data)
        self.Med = numpy.median(Data)
        self.Std = numpy.std(Data)
        self.Max = numpy.max(Data)
        self.Min = numpy.min(Data)

    def display(self):
        print("%s %s %s %s %s" % (self.Avg, self.Med, self.Std, self.Min, self.Max))


class Experience:
    """Metrology for experiences"""

    DeviceStyle = ""
    DeviceId = 0
    AvgD = 0
    MedD = 0
    StdD = 0
    MinD = 0
    MaxD = 0
    AvgR = 0
    MedR = 0
    StdR = 0
    MinR = 0
    MaxR = 0

    def __init__(self, DeviceStyle, DeviceId, Iterations):
        self.DeviceStyle = DeviceStyle
        self.DeviceId = DeviceId
        self.Iterations

    def Metrology(self, Data):
        Duration = PenStacle(Data)
        Rate = PenStacle(Iterations / Data)
        print("Duration %s" % Duration)
        print("Rate %s" % Rate)


def DictionariesAPI():
    Marsaglia = {"CONG": 0, "SHR3": 1, "MWC": 2, "KISS": 3}
    Computing = {"INT32": 0, "INT64": 1, "FP32": 2, "FP64": 3}
    Test = {True: 1, False: 0}
    return (Marsaglia, Computing, Test)


# find prime factors of a number
# Get for WWW :
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):

    factorlist = numpy.array([]).astype("uint32")
    loop = 2
    while loop <= x:
        if x % loop == 0:
            x /= loop
            factorlist = numpy.append(factorlist, [loop])
        else:
            loop += 1
    return factorlist


# Try to find the best thread number in Hybrid approach (Blocks&Threads)
# output is thread number
def BestThreadsNumber(jobs):
    factors = PrimeFactors(jobs)
    matrix = numpy.append([factors], [factors[::-1]], axis=0)
    threads = 1
    for factor in matrix.transpose().ravel():
        threads = threads * factor
        if threads * threads > jobs or threads > 512:
            break
    return int(threads)


# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
    return T1 * (1 - p + p / N)


# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
    return T1 * (s + p / N)


# Predicted Mylq Law with first order
def Mylq(N, T1, s, c, p):
    return T1 * (s + p / N) + c * N


# Predicted Mylq Law with second order
def Mylq2(N, T1, s, c1, c2, p):
    return T1 * (s + p / N) + c1 * N + c2 * N * N


def KernelCodeCuda():
    KERNEL_CODE_CUDA = """
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

#define IFTHEN 1

// Marsaglia RNG very simple implementation

#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f

__device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
{

#if TRNG == TCONG
   uint jcong=seed_z+work;
#elif TRNG == TSHR3
   uint jsr=seed_w+work;
#elif TRNG == TMWC
   uint z=seed_z+work;
   uint w=seed_w+work;
#elif TRNG == TKISS
   uint jcong=seed_z+work;
   uint jsr=seed_w+work;
   uint z=seed_z-work;
   uint w=seed_w-work;
#endif

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
    #define THEONE 1073741824
    #if TRNG == TCONG
        uint x=CONG>>17 ;
        uint y=CONG>>17 ;
    #elif TRNG == TSHR3
        uint x=SHR3>>17 ;
        uint y=SHR3>>17 ;
    #elif TRNG == TMWC
        uint x=MWC>>17 ;
        uint y=MWC>>17 ;
    #elif TRNG == TKISS
        uint x=KISS>>17 ;