diff --git a/src/cl/pyopencl-ranluxcl.cl b/src/cl/pyopencl-ranluxcl.cl index aab02bb75c18368d9024926e051eef5e1984d3fe..a40cadf1e3f5e28a390eb796d7ac7aaeda47a48a 100644 --- a/src/cl/pyopencl-ranluxcl.cl +++ b/src/cl/pyopencl-ranluxcl.cl @@ -10,8 +10,8 @@ which should perfectly replicate the numbers generated by the original Fortran ***** QUICK USAGE DESCRIPTION ************************************************** -1. Create an OpenCL buffer with room for at least 28 32-bit variables -(112 byte). I.e., in C/C++: size_t buffSize = numWorkitems * 112; +1. Create an OpenCL buffer with room for at least 28 32-bit variables (112 byte) +per work-item. I.e., in C/C++: size_t buffSize = numWorkitems * 112; 2. Pass the buffer and an unsigned integer seed <ins> to a kernel that launches the ranluxcl_initialization function. The seed <ins> can be any unsigned 32-bit @@ -33,8 +33,7 @@ points not included, then uploads the seeds again: #include "ranluxcl.cl" kernel void Kernel_Example(global ranluxcl_state_t *ranluxcltab) { - //ranluxclstate is a struct of 7 float4 variables - //storing the state of the generator. + //ranluxclstate stores the state of the generator. ranluxcl_state_t ranluxclstate; //Download state into ranluxclstate struct. @@ -354,7 +353,7 @@ void ranluxcl_download_seed(ranluxcl_state_t *rst, void ranluxcl_upload_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab) { - ranluxcltab[RANLUXCL_MYID] = (*rst); + ranluxcltab[RANLUXCL_MYID] = (*rst); } /* @@ -581,90 +580,24 @@ void ranluxcl_synchronize(ranluxcl_state_t *rst) ranluxcl32(rst); } -void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab) +/* + * Uses a 64-bit xorshift PRNG by George Marsaglia to initialize the generator. + * + * This function can be used instead of ranluxcl_initialization if manual + * control of the seed of each generator is desired. x must be unique for each + * time this function is called, and *ranluxcltab should point to the specific + * entry in the table to be initialized. Compare this to ranluxcl_initialization + * where ins needs only be unique for each NDRange, and *ranluxcltab points + * to the base address of the table for the entire NDRange. Also note that + * depending on what you are doing the ranluxcl_upload_seed and + * ranluxcl_download_seed functions may not do what you want, so make sure + * you know what you are doing! + */ + +void ranluxcl_init(ulong x, global ranluxcl_state_t *ranluxcltab) { ranluxcl_state_t rst; - #ifdef RANLUXCL_USE_LEGACY_INITIALIZATION - //Using legacy initialization from original Fortan 77 implementation - - //ins is scaled so that if the user makes another call somewhere else - //with ins + 1 there should be no overlap. Also adding one - //allows us to use ins = 0. - int k, maxWorkitems; - - #ifdef RANLUXCL_MAXWORKITEMS - maxWorkitems = RANLUXCL_MAXWORKITEMS; - #else - maxWorkitems = RANLUXCL_NUMWORKITEMS; - #endif //RANLUXCL_MAXWORKITEMS - - int scaledins = ins * maxWorkitems + 1; - - int js = scaledins + RANLUXCL_MYID; - - //Make sure js is not too small (should really be an error) - if(js < 1) - js = 1; - - #define IC 2147483563 - #define ITWO24 16777216 - - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s01=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s02=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s03=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s04=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s05=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s06=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s07=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s08=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s09=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s10=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s11=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s12=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s13=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s14=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s15=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s16=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s17=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s18=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s19=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s20=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s21=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s22=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s23=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; - rst.s24=(js%ITWO24)*RANLUXCL_TWOM24; - - #undef IC - #undef ITWO24 - - #else //RANLUXCL_USE_LEGACY_INITIALIZATION - //Using default initialization - #define RANLUXCL_POW2_24 16777216 #define RANLUXCL_56 0x00FFFFFFFFFFFFFF #define RANLUXCL_48 0x0000FFFFFFFFFFFF @@ -674,13 +607,7 @@ void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab) #define RANLUXCL_16 0x000000000000FFFF #define RANLUXCL_8 0x00000000000000FF - //We scale ins by (2^32)-1. As long as we never use more than (2^32)-1 - //work-items per NDRange the initial states should never be the same. - //We use a simple 64-bit xorshift RNG by George Marsaglia, which has a - //period of 2^64 - 1. - ulong x1, x2, x3; - ulong x = (ulong)RANLUXCL_MYID + (ulong)ins * ((ulong)UINT_MAX + 1); //Logical shifts used so that all 64 bits of output are used (24 bits //per float), to be certain that all initial states are different. @@ -753,7 +680,128 @@ void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab) #undef RANLUXCL_16 #undef RANLUXCL_8 - #endif //RANLUXCL_USE_LEGACY_INITIALIZATION + rst.in24 = 0; + rst.stepnr = 0; + rst.carry = 0.0f; + if(rst.s24 == 0.0f) + rst.carry = RANLUXCL_TWOM24; + + #ifndef RANLUXCL_NO_WARMUP + //Warming up the generator, ensuring there are no initial correlations. + //16 is a "magic number". It is the number of times we must generate + //a batch of 24 numbers to ensure complete decorrelation, however it + //seems like it is necessary to double this for the special case when + //the generator is initialized to all zeros. + for(int i=0; i<16 * 2; i++){ + ranluxcl_os(rst.s09, rst.s10, &(rst.s24), &(rst.carry)); + ranluxcl_os(rst.s08, rst.s09, &(rst.s23), &(rst.carry)); + ranluxcl_os(rst.s07, rst.s08, &(rst.s22), &(rst.carry)); + ranluxcl_os(rst.s06, rst.s07, &(rst.s21), &(rst.carry)); + ranluxcl_os(rst.s05, rst.s06, &(rst.s20), &(rst.carry)); + ranluxcl_os(rst.s04, rst.s05, &(rst.s19), &(rst.carry)); + ranluxcl_os(rst.s03, rst.s04, &(rst.s18), &(rst.carry)); + ranluxcl_os(rst.s02, rst.s03, &(rst.s17), &(rst.carry)); + ranluxcl_os(rst.s01, rst.s02, &(rst.s16), &(rst.carry)); + ranluxcl_os(rst.s24, rst.s01, &(rst.s15), &(rst.carry)); + ranluxcl_os(rst.s23, rst.s24, &(rst.s14), &(rst.carry)); + ranluxcl_os(rst.s22, rst.s23, &(rst.s13), &(rst.carry)); + ranluxcl_os(rst.s21, rst.s22, &(rst.s12), &(rst.carry)); + ranluxcl_os(rst.s20, rst.s21, &(rst.s11), &(rst.carry)); + ranluxcl_os(rst.s19, rst.s20, &(rst.s10), &(rst.carry)); + ranluxcl_os(rst.s18, rst.s19, &(rst.s09), &(rst.carry)); + ranluxcl_os(rst.s17, rst.s18, &(rst.s08), &(rst.carry)); + ranluxcl_os(rst.s16, rst.s17, &(rst.s07), &(rst.carry)); + ranluxcl_os(rst.s15, rst.s16, &(rst.s06), &(rst.carry)); + ranluxcl_os(rst.s14, rst.s15, &(rst.s05), &(rst.carry)); + ranluxcl_os(rst.s13, rst.s14, &(rst.s04), &(rst.carry)); + ranluxcl_os(rst.s12, rst.s13, &(rst.s03), &(rst.carry)); + ranluxcl_os(rst.s11, rst.s12, &(rst.s02), &(rst.carry)); + ranluxcl_os(rst.s10, rst.s11, &(rst.s01), &(rst.carry)); + } + #endif //RANLUXCL_NO_WARMUP + + //Upload the state + *ranluxcltab = rst; +} + +void ranluxcl_init_legacy(uint ins, global ranluxcl_state_t *ranluxcltab) +{ + //Using legacy initialization from original Fortan 77 implementation + + //ins is scaled so that if the user makes another call somewhere else + //with ins + 1 there should be no overlap. Also adding one + //allows us to use ins = 0. + int k, maxWorkitems; + ranluxcl_state_t rst; + + #ifdef RANLUXCL_MAXWORKITEMS + maxWorkitems = RANLUXCL_MAXWORKITEMS; + #else + maxWorkitems = RANLUXCL_NUMWORKITEMS; + #endif //RANLUXCL_MAXWORKITEMS + + int scaledins = ins * maxWorkitems + 1; + + int js = scaledins + RANLUXCL_MYID; + + //Make sure js is not too small (should really be an error) + if(js < 1) + js = 1; + + #define IC 2147483563 + #define ITWO24 16777216 + + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s01=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s02=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s03=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s04=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s05=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s06=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s07=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s08=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s09=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s10=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s11=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s12=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s13=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s14=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s15=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s16=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s17=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s18=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s19=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s20=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s21=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s22=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s23=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s24=(js%ITWO24)*RANLUXCL_TWOM24; + + #undef IC + #undef ITWO24 rst.in24 = 0; rst.stepnr = 0; @@ -797,6 +845,22 @@ void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab) ranluxcl_upload_seed(&rst, ranluxcltab); } +void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab) +{ + #ifdef RANLUXCL_USE_LEGACY_INITIALIZATION + ranluxcl_init_legacy(ins, ranluxcltab); + + #else // Not RANLUXCL_USE_LEGACY_INITIALIZATION + + // We scale ins by 2^32. As long as we never use more than (2^32)-1 + // work-items per NDRange the initial states should never be the same. + + ulong x = (ulong)RANLUXCL_MYID + (ulong)ins * ((ulong)UINT_MAX + 1); + ranluxcl_init(x, ranluxcltab + RANLUXCL_MYID); + + #endif // RANLUXCL_USE_LEGACY_INITIALIZATION +} + float4 ranluxcl32norm(ranluxcl_state_t *rst) { //Returns a vector where each component is a normally