Skip to content
Snippets Groups Projects
Commit 7c83a521 authored by Andreas Klöckner's avatar Andreas Klöckner
Browse files

Update RANLUX.

parent 3f06ec73
No related branches found
No related tags found
No related merge requests found
......@@ -10,8 +10,8 @@ which should perfectly replicate the numbers generated by the original Fortran
***** QUICK USAGE DESCRIPTION **************************************************
1. Create an OpenCL buffer with room for at least 28 32-bit variables
(112 byte). I.e., in C/C++: size_t buffSize = numWorkitems * 112;
1. Create an OpenCL buffer with room for at least 28 32-bit variables (112 byte)
per work-item. I.e., in C/C++: size_t buffSize = numWorkitems * 112;
2. Pass the buffer and an unsigned integer seed <ins> to a kernel that launches
the ranluxcl_initialization function. The seed <ins> can be any unsigned 32-bit
......@@ -33,8 +33,7 @@ points not included, then uploads the seeds again:
#include "ranluxcl.cl"
kernel void Kernel_Example(global ranluxcl_state_t *ranluxcltab)
{
//ranluxclstate is a struct of 7 float4 variables
//storing the state of the generator.
//ranluxclstate stores the state of the generator.
ranluxcl_state_t ranluxclstate;
//Download state into ranluxclstate struct.
......@@ -354,7 +353,7 @@ void ranluxcl_download_seed(ranluxcl_state_t *rst,
void ranluxcl_upload_seed(ranluxcl_state_t *rst,
global ranluxcl_state_t *ranluxcltab)
{
ranluxcltab[RANLUXCL_MYID] = (*rst);
ranluxcltab[RANLUXCL_MYID] = (*rst);
}
/*
......@@ -581,90 +580,24 @@ void ranluxcl_synchronize(ranluxcl_state_t *rst)
ranluxcl32(rst);
}
void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
/*
* Uses a 64-bit xorshift PRNG by George Marsaglia to initialize the generator.
*
* This function can be used instead of ranluxcl_initialization if manual
* control of the seed of each generator is desired. x must be unique for each
* time this function is called, and *ranluxcltab should point to the specific
* entry in the table to be initialized. Compare this to ranluxcl_initialization
* where ins needs only be unique for each NDRange, and *ranluxcltab points
* to the base address of the table for the entire NDRange. Also note that
* depending on what you are doing the ranluxcl_upload_seed and
* ranluxcl_download_seed functions may not do what you want, so make sure
* you know what you are doing!
*/
void ranluxcl_init(ulong x, global ranluxcl_state_t *ranluxcltab)
{
ranluxcl_state_t rst;
#ifdef RANLUXCL_USE_LEGACY_INITIALIZATION
//Using legacy initialization from original Fortan 77 implementation
//ins is scaled so that if the user makes another call somewhere else
//with ins + 1 there should be no overlap. Also adding one
//allows us to use ins = 0.
int k, maxWorkitems;
#ifdef RANLUXCL_MAXWORKITEMS
maxWorkitems = RANLUXCL_MAXWORKITEMS;
#else
maxWorkitems = RANLUXCL_NUMWORKITEMS;
#endif //RANLUXCL_MAXWORKITEMS
int scaledins = ins * maxWorkitems + 1;
int js = scaledins + RANLUXCL_MYID;
//Make sure js is not too small (should really be an error)
if(js < 1)
js = 1;
#define IC 2147483563
#define ITWO24 16777216
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s01=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s02=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s03=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s04=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s05=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s06=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s07=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s08=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s09=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s10=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s11=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s12=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s13=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s14=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s15=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s16=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s17=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s18=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s19=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s20=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s21=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s22=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s23=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s24=(js%ITWO24)*RANLUXCL_TWOM24;
#undef IC
#undef ITWO24
#else //RANLUXCL_USE_LEGACY_INITIALIZATION
//Using default initialization
#define RANLUXCL_POW2_24 16777216
#define RANLUXCL_56 0x00FFFFFFFFFFFFFF
#define RANLUXCL_48 0x0000FFFFFFFFFFFF
......@@ -674,13 +607,7 @@ void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
#define RANLUXCL_16 0x000000000000FFFF
#define RANLUXCL_8 0x00000000000000FF
//We scale ins by (2^32)-1. As long as we never use more than (2^32)-1
//work-items per NDRange the initial states should never be the same.
//We use a simple 64-bit xorshift RNG by George Marsaglia, which has a
//period of 2^64 - 1.
ulong x1, x2, x3;
ulong x = (ulong)RANLUXCL_MYID + (ulong)ins * ((ulong)UINT_MAX + 1);
//Logical shifts used so that all 64 bits of output are used (24 bits
//per float), to be certain that all initial states are different.
......@@ -753,7 +680,128 @@ void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
#undef RANLUXCL_16
#undef RANLUXCL_8
#endif //RANLUXCL_USE_LEGACY_INITIALIZATION
rst.in24 = 0;
rst.stepnr = 0;
rst.carry = 0.0f;
if(rst.s24 == 0.0f)
rst.carry = RANLUXCL_TWOM24;
#ifndef RANLUXCL_NO_WARMUP
//Warming up the generator, ensuring there are no initial correlations.
//16 is a "magic number". It is the number of times we must generate
//a batch of 24 numbers to ensure complete decorrelation, however it
//seems like it is necessary to double this for the special case when
//the generator is initialized to all zeros.
for(int i=0; i<16 * 2; i++){
ranluxcl_os(rst.s09, rst.s10, &(rst.s24), &(rst.carry));
ranluxcl_os(rst.s08, rst.s09, &(rst.s23), &(rst.carry));
ranluxcl_os(rst.s07, rst.s08, &(rst.s22), &(rst.carry));
ranluxcl_os(rst.s06, rst.s07, &(rst.s21), &(rst.carry));
ranluxcl_os(rst.s05, rst.s06, &(rst.s20), &(rst.carry));
ranluxcl_os(rst.s04, rst.s05, &(rst.s19), &(rst.carry));
ranluxcl_os(rst.s03, rst.s04, &(rst.s18), &(rst.carry));
ranluxcl_os(rst.s02, rst.s03, &(rst.s17), &(rst.carry));
ranluxcl_os(rst.s01, rst.s02, &(rst.s16), &(rst.carry));
ranluxcl_os(rst.s24, rst.s01, &(rst.s15), &(rst.carry));
ranluxcl_os(rst.s23, rst.s24, &(rst.s14), &(rst.carry));
ranluxcl_os(rst.s22, rst.s23, &(rst.s13), &(rst.carry));
ranluxcl_os(rst.s21, rst.s22, &(rst.s12), &(rst.carry));
ranluxcl_os(rst.s20, rst.s21, &(rst.s11), &(rst.carry));
ranluxcl_os(rst.s19, rst.s20, &(rst.s10), &(rst.carry));
ranluxcl_os(rst.s18, rst.s19, &(rst.s09), &(rst.carry));
ranluxcl_os(rst.s17, rst.s18, &(rst.s08), &(rst.carry));
ranluxcl_os(rst.s16, rst.s17, &(rst.s07), &(rst.carry));
ranluxcl_os(rst.s15, rst.s16, &(rst.s06), &(rst.carry));
ranluxcl_os(rst.s14, rst.s15, &(rst.s05), &(rst.carry));
ranluxcl_os(rst.s13, rst.s14, &(rst.s04), &(rst.carry));
ranluxcl_os(rst.s12, rst.s13, &(rst.s03), &(rst.carry));
ranluxcl_os(rst.s11, rst.s12, &(rst.s02), &(rst.carry));
ranluxcl_os(rst.s10, rst.s11, &(rst.s01), &(rst.carry));
}
#endif //RANLUXCL_NO_WARMUP
//Upload the state
*ranluxcltab = rst;
}
void ranluxcl_init_legacy(uint ins, global ranluxcl_state_t *ranluxcltab)
{
//Using legacy initialization from original Fortan 77 implementation
//ins is scaled so that if the user makes another call somewhere else
//with ins + 1 there should be no overlap. Also adding one
//allows us to use ins = 0.
int k, maxWorkitems;
ranluxcl_state_t rst;
#ifdef RANLUXCL_MAXWORKITEMS
maxWorkitems = RANLUXCL_MAXWORKITEMS;
#else
maxWorkitems = RANLUXCL_NUMWORKITEMS;
#endif //RANLUXCL_MAXWORKITEMS
int scaledins = ins * maxWorkitems + 1;
int js = scaledins + RANLUXCL_MYID;
//Make sure js is not too small (should really be an error)
if(js < 1)
js = 1;
#define IC 2147483563
#define ITWO24 16777216
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s01=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s02=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s03=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s04=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s05=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s06=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s07=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s08=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s09=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s10=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s11=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s12=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s13=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s14=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s15=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s16=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s17=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s18=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s19=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s20=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s21=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s22=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s23=(js%ITWO24)*RANLUXCL_TWOM24;
k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s24=(js%ITWO24)*RANLUXCL_TWOM24;
#undef IC
#undef ITWO24
rst.in24 = 0;
rst.stepnr = 0;
......@@ -797,6 +845,22 @@ void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
ranluxcl_upload_seed(&rst, ranluxcltab);
}
void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
{
#ifdef RANLUXCL_USE_LEGACY_INITIALIZATION
ranluxcl_init_legacy(ins, ranluxcltab);
#else // Not RANLUXCL_USE_LEGACY_INITIALIZATION
// We scale ins by 2^32. As long as we never use more than (2^32)-1
// work-items per NDRange the initial states should never be the same.
ulong x = (ulong)RANLUXCL_MYID + (ulong)ins * ((ulong)UINT_MAX + 1);
ranluxcl_init(x, ranluxcltab + RANLUXCL_MYID);
#endif // RANLUXCL_USE_LEGACY_INITIALIZATION
}
float4 ranluxcl32norm(ranluxcl_state_t *rst)
{
//Returns a vector where each component is a normally
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment