diff --git a/src/cl/pyopencl-ranluxcl.cl b/src/cl/pyopencl-ranluxcl.cl
index aab02bb75c18368d9024926e051eef5e1984d3fe..a40cadf1e3f5e28a390eb796d7ac7aaeda47a48a 100644
--- a/src/cl/pyopencl-ranluxcl.cl
+++ b/src/cl/pyopencl-ranluxcl.cl
@@ -10,8 +10,8 @@ which should perfectly replicate the numbers generated by the original Fortran
 
 ***** QUICK USAGE DESCRIPTION **************************************************
 
-1. Create an OpenCL buffer with room for at least 28 32-bit variables
-(112 byte). I.e., in C/C++: size_t buffSize = numWorkitems * 112;
+1. Create an OpenCL buffer with room for at least 28 32-bit variables (112 byte)
+per work-item. I.e., in C/C++: size_t buffSize = numWorkitems * 112;
 
 2. Pass the buffer and an unsigned integer seed <ins> to a kernel that launches
 the ranluxcl_initialization function. The seed <ins> can be any unsigned 32-bit
@@ -33,8 +33,7 @@ points not included, then uploads the seeds again:
 	#include "ranluxcl.cl"
 	kernel void Kernel_Example(global ranluxcl_state_t *ranluxcltab)
 	{
-		//ranluxclstate is a struct of 7 float4 variables
-		//storing the state of the generator.
+		//ranluxclstate stores the state of the generator.
 		ranluxcl_state_t ranluxclstate;
 
 		//Download state into ranluxclstate struct.
@@ -354,7 +353,7 @@ void ranluxcl_download_seed(ranluxcl_state_t *rst,
 void ranluxcl_upload_seed(ranluxcl_state_t *rst,
 	global ranluxcl_state_t *ranluxcltab)
 {
-        ranluxcltab[RANLUXCL_MYID] = (*rst);
+	ranluxcltab[RANLUXCL_MYID] = (*rst);
 }
 
 /*
@@ -581,90 +580,24 @@ void ranluxcl_synchronize(ranluxcl_state_t *rst)
 		ranluxcl32(rst);
 }
 
-void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
+/*
+ * Uses a 64-bit xorshift PRNG by George Marsaglia to initialize the generator.
+ *
+ * This function can be used instead of ranluxcl_initialization if manual
+ * control of the seed of each generator is desired. x must be unique for each
+ * time this function is called, and *ranluxcltab should point to the specific
+ * entry in the table to be initialized. Compare this to ranluxcl_initialization
+ * where ins needs only be unique for each NDRange, and *ranluxcltab points
+ * to the base address of the table for the entire NDRange. Also note that
+ * depending on what you are doing the ranluxcl_upload_seed and
+ * ranluxcl_download_seed functions may not do what you want, so make sure
+ * you know what you are doing!
+ */
+
+void ranluxcl_init(ulong x, global ranluxcl_state_t *ranluxcltab)
 {
 	ranluxcl_state_t rst;
 
-	#ifdef RANLUXCL_USE_LEGACY_INITIALIZATION
-	//Using legacy initialization from original Fortan 77 implementation
-
-	//ins is scaled so that if the user makes another call somewhere else
-	//with ins + 1 there should be no overlap. Also adding one
-	//allows us to use ins = 0.
-	int k, maxWorkitems;
-
-	#ifdef RANLUXCL_MAXWORKITEMS
-	maxWorkitems = RANLUXCL_MAXWORKITEMS;
-	#else
-	maxWorkitems = RANLUXCL_NUMWORKITEMS;
-	#endif //RANLUXCL_MAXWORKITEMS
-
-	int scaledins = ins * maxWorkitems + 1;
-
-	int js = scaledins + RANLUXCL_MYID;
-
-	//Make sure js is not too small (should really be an error)
-	if(js < 1)
-		js = 1;
-
-	#define IC 2147483563
-	#define ITWO24 16777216
-
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s01=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s02=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s03=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s04=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s05=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s06=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s07=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s08=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s09=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s10=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s11=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s12=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s13=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s14=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s15=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s16=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s17=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s18=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s19=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s20=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s21=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s22=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s23=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
-		rst.s24=(js%ITWO24)*RANLUXCL_TWOM24;
-
-	#undef IC
-	#undef ITWO24
-
-	#else //RANLUXCL_USE_LEGACY_INITIALIZATION
-	//Using default initialization
-
 	#define RANLUXCL_POW2_24 16777216
 	#define RANLUXCL_56 0x00FFFFFFFFFFFFFF
 	#define RANLUXCL_48 0x0000FFFFFFFFFFFF
@@ -674,13 +607,7 @@ void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
 	#define RANLUXCL_16 0x000000000000FFFF
 	#define RANLUXCL_8  0x00000000000000FF
 
-	//We scale ins by (2^32)-1. As long as we never use more than (2^32)-1
-	//work-items per NDRange the initial states should never be the same.
-	//We use a simple 64-bit xorshift RNG by George Marsaglia, which has a
-	//period of 2^64 - 1.
-
 	ulong x1, x2, x3;
-	ulong x = (ulong)RANLUXCL_MYID + (ulong)ins * ((ulong)UINT_MAX + 1);
 
 	//Logical shifts used so that all 64 bits of output are used (24 bits
 	//per float), to be certain that all initial states are different.
@@ -753,7 +680,128 @@ void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
 	#undef RANLUXCL_16
 	#undef RANLUXCL_8
 
-	#endif //RANLUXCL_USE_LEGACY_INITIALIZATION
+	rst.in24 = 0;
+	rst.stepnr = 0;
+	rst.carry = 0.0f;
+	if(rst.s24 == 0.0f)
+		rst.carry = RANLUXCL_TWOM24;
+
+	#ifndef RANLUXCL_NO_WARMUP
+	//Warming up the generator, ensuring there are no initial correlations.
+	//16 is a "magic number". It is the number of times we must generate
+	//a batch of 24 numbers to ensure complete decorrelation, however it
+	//seems like it is necessary to double this for the special case when
+	//the generator is initialized to all zeros.
+	for(int i=0; i<16 * 2; i++){
+		ranluxcl_os(rst.s09, rst.s10, &(rst.s24), &(rst.carry));
+		ranluxcl_os(rst.s08, rst.s09, &(rst.s23), &(rst.carry));
+		ranluxcl_os(rst.s07, rst.s08, &(rst.s22), &(rst.carry));
+		ranluxcl_os(rst.s06, rst.s07, &(rst.s21), &(rst.carry));
+		ranluxcl_os(rst.s05, rst.s06, &(rst.s20), &(rst.carry));
+		ranluxcl_os(rst.s04, rst.s05, &(rst.s19), &(rst.carry));
+		ranluxcl_os(rst.s03, rst.s04, &(rst.s18), &(rst.carry));
+		ranluxcl_os(rst.s02, rst.s03, &(rst.s17), &(rst.carry));
+		ranluxcl_os(rst.s01, rst.s02, &(rst.s16), &(rst.carry));
+		ranluxcl_os(rst.s24, rst.s01, &(rst.s15), &(rst.carry));
+		ranluxcl_os(rst.s23, rst.s24, &(rst.s14), &(rst.carry));
+		ranluxcl_os(rst.s22, rst.s23, &(rst.s13), &(rst.carry));
+		ranluxcl_os(rst.s21, rst.s22, &(rst.s12), &(rst.carry));
+		ranluxcl_os(rst.s20, rst.s21, &(rst.s11), &(rst.carry));
+		ranluxcl_os(rst.s19, rst.s20, &(rst.s10), &(rst.carry));
+		ranluxcl_os(rst.s18, rst.s19, &(rst.s09), &(rst.carry));
+		ranluxcl_os(rst.s17, rst.s18, &(rst.s08), &(rst.carry));
+		ranluxcl_os(rst.s16, rst.s17, &(rst.s07), &(rst.carry));
+		ranluxcl_os(rst.s15, rst.s16, &(rst.s06), &(rst.carry));
+		ranluxcl_os(rst.s14, rst.s15, &(rst.s05), &(rst.carry));
+		ranluxcl_os(rst.s13, rst.s14, &(rst.s04), &(rst.carry));
+		ranluxcl_os(rst.s12, rst.s13, &(rst.s03), &(rst.carry));
+		ranluxcl_os(rst.s11, rst.s12, &(rst.s02), &(rst.carry));
+		ranluxcl_os(rst.s10, rst.s11, &(rst.s01), &(rst.carry));
+	}
+	#endif //RANLUXCL_NO_WARMUP
+
+	//Upload the state
+	*ranluxcltab = rst;
+}
+
+void ranluxcl_init_legacy(uint ins, global ranluxcl_state_t *ranluxcltab)
+{
+	//Using legacy initialization from original Fortan 77 implementation
+
+	//ins is scaled so that if the user makes another call somewhere else
+	//with ins + 1 there should be no overlap. Also adding one
+	//allows us to use ins = 0.
+	int k, maxWorkitems;
+	ranluxcl_state_t rst;
+
+	#ifdef RANLUXCL_MAXWORKITEMS
+	maxWorkitems = RANLUXCL_MAXWORKITEMS;
+	#else
+	maxWorkitems = RANLUXCL_NUMWORKITEMS;
+	#endif //RANLUXCL_MAXWORKITEMS
+
+	int scaledins = ins * maxWorkitems + 1;
+
+	int js = scaledins + RANLUXCL_MYID;
+
+	//Make sure js is not too small (should really be an error)
+	if(js < 1)
+		js = 1;
+
+	#define IC 2147483563
+	#define ITWO24 16777216
+
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s01=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s02=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s03=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s04=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s05=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s06=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s07=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s08=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s09=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s10=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s11=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s12=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s13=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s14=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s15=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s16=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s17=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s18=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s19=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s20=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s21=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s22=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s23=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s24=(js%ITWO24)*RANLUXCL_TWOM24;
+
+	#undef IC
+	#undef ITWO24
 
 	rst.in24 = 0;
 	rst.stepnr = 0;
@@ -797,6 +845,22 @@ void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
 	ranluxcl_upload_seed(&rst, ranluxcltab);
 }
 
+void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
+{
+	#ifdef RANLUXCL_USE_LEGACY_INITIALIZATION
+	ranluxcl_init_legacy(ins, ranluxcltab);
+
+	#else // Not RANLUXCL_USE_LEGACY_INITIALIZATION
+
+	// We scale ins by 2^32. As long as we never use more than (2^32)-1
+	// work-items per NDRange the initial states should never be the same.
+
+	ulong x = (ulong)RANLUXCL_MYID + (ulong)ins * ((ulong)UINT_MAX + 1);
+	ranluxcl_init(x, ranluxcltab + RANLUXCL_MYID);
+
+	#endif // RANLUXCL_USE_LEGACY_INITIALIZATION
+}
+
 float4 ranluxcl32norm(ranluxcl_state_t *rst)
 {
 	//Returns a vector where each component is a normally