diff --git a/pyopencl/array.py b/pyopencl/array.py
index c27ea813cacf4232822ee7972efd694f21cef37f..00a634cbe6609888e4820165a95b1f68bb9f4d50 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -144,6 +144,7 @@ def elwise_kernel_runner(kernel_getter):
         queue = kwargs.pop("queue", None) or repr_ary.queue
 
         knl = kernel_getter(*args)
+
         gs, ls = repr_ary.get_sizes(queue,
                 knl.get_work_group_info(
                     cl.kernel_work_group_info.WORK_GROUP_SIZE,
diff --git a/pyopencl/characterize.py b/pyopencl/characterize.py
index 6836362e2c9265b6dc8399be1048ccc04e575f1c..b5283f0c7fa3ef49b3544f6aa23f907dc713668d 100644
--- a/pyopencl/characterize.py
+++ b/pyopencl/characterize.py
@@ -188,7 +188,6 @@ def why_not_local_access_conflict_free(dev, itemsize,
     gran = local_memory_access_granularity(dev)
     if itemsize != gran:
         from warnings import warn
-        print gran
         warn("local conflict info might be inaccurate "
                 "for itemsize != %d" % gran,
                 CLCharacterizationWarning)
diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py
index c0475854f5e3b11888f6cc00be2d28c71de63972..4724d35f2e75af0a13122ae49e4cbdb27fcedb90 100644
--- a/pyopencl/clrandom.py
+++ b/pyopencl/clrandom.py
@@ -1,3 +1,4 @@
+from __future__ import division
 import pyopencl as cl
 import pyopencl.array as cl_array
 from pyopencl.tools import first_arg_dependent_memoize
@@ -11,7 +12,7 @@ import numpy as np
 class RanluxGenerator(object):
     def __init__(self, queue, num_work_items,
             luxury=None, seed=None, no_warmup=False,
-            use_legacy_init=True, max_work_items=None):
+            use_legacy_init=False, max_work_items=None):
         if luxury is None:
             luxury = 4
 
@@ -42,8 +43,31 @@ class RanluxGenerator(object):
             """ % self.generate_settings_defines()
         prg = cl.Program(queue.context, src).build()
 
+        # {{{ compute work group size
+
+        wg_size = prg.init_ranlux.get_work_group_info(
+                cl.kernel_work_group_info.WORK_GROUP_SIZE,
+                queue.device)
+
+        while num_work_items % wg_size != 0:
+            wg_size //= 2
+
+        import sys
+        import platform
+        if ("darwin" in sys.platform
+                and "Apple" in queue.device.platform.vendor
+                and platform.mac_ver()[0].startswith("10.7")
+                and queue.device.type == cl.device_type.CPU):
+            wg_size = 1
+
+        self.wg_size = wg_size
+
+        # }}}
+
         self.state = cl_array.empty(queue, (num_work_items, 112), dtype=np.uint8)
-        prg.init_ranlux(queue, (num_work_items,), None, np.uint32(seed),
+        self.state.fill(17)
+
+        prg.init_ranlux(queue, (num_work_items,), (self.wg_size,), np.uint32(seed),
                 self.state.data)
 
     def generate_settings_defines(self, include_double_pragma=True):
@@ -150,7 +174,8 @@ class RanluxGenerator(object):
         if queue is None:
             queue = ary.queue
 
-        self.get_gen_kernel(ary.dtype, "")(queue, (self.num_work_items,), None,
+        self.get_gen_kernel(ary.dtype, "")(queue,
+                (self.num_work_items,), (self.wg_size,),
                 self.state.data, ary.data, ary.size,
                 b-a, a)
 
@@ -167,7 +192,8 @@ class RanluxGenerator(object):
         if queue is None:
             queue = ary.queue
 
-        self.get_gen_kernel(ary.dtype, "norm")(queue, (self.num_work_items,), None,
+        self.get_gen_kernel(ary.dtype, "norm")(queue,
+                (self.num_work_items,), (self.wg_size,),
                 self.state.data, ary.data, ary.size, sigma, mu)
 
     def normal(self, *args, **kwargs):
diff --git a/src/cl/pyopencl-ranluxcl.cl b/src/cl/pyopencl-ranluxcl.cl
index 71606a15c1a0123c0d52d1b8ac90110c811051a4..a0794eef5176e2cc8833a4f1d0689fa6c6bf4976 100644
--- a/src/cl/pyopencl-ranluxcl.cl
+++ b/src/cl/pyopencl-ranluxcl.cl
@@ -1,798 +1,910 @@
-#ifndef RANLUXCL_CL
-#define RANLUXCL_CL
-
-/**** RANLUXCL v1.3.1 MODIFIED *****************************************************
-
-Implements the RANLUX generator of Matrin Luscher, based on the Fortran 77
-implementation by Fred James. This OpenCL code is a complete implementation which 
-should perfectly replicate the numbers generated by the original Fortran 77
-implementation (if using the legacy initialization routine).
-
-***** QUICK USAGE DESCRIPTION ******************************************************
-
-1. Create an OpenCL buffer with room for at least 28 32-bit variables (112 byte).
-I.e., in C/C++: size_t buffSize = numWorkitems * 112;
-
-2. Pass the buffer and an unsigned integer seed <ins> to a kernel that launches the
-ranluxcl_initialization function. The seed <ins> can be any unsigned 32-bit integer,
-and must be different on different OpenCL devices/NDRanges to ensure different
-sequences. As long as the number of work-items on each device/NDRange is less than
-2^32 = 4294967296 all sequences will be different.
-An examle initialization kernel would be:
-	#include "ranluxcl.cl"
-	kernel void Kernel_Ranluxcl_Init(private uint ins, global ranluxcl_state_t *ranluxcltab){
-		ranluxcl_initialization(ins, ranluxcltab);
-	}
-
-3. Now the generator is ready for use. Remember to download the seeds first, and
-upload them again when done. Example kernel that downloads seeds, generates a float4
-where each component is uniformly distributed between 0 and 1, end points not included,
-then uploads the seeds again:
-	#include "ranluxcl.cl"
-	kernel void Kernel_Example(global ranluxcl_state_t *ranluxcltab){
-		//ranluxclstate is a struct of 7 float4 variables
-		//storing the state of the generator.
-		ranluxcl_state_t ranluxclstate;
-
-		//Download state into ranluxclstate struct.
-		ranluxcl_download_seed(&ranluxclstate, ranluxcltab);
-
-		//Generate a float4 with each component on (0,1),
-		//end points not included. We can call ranluxcl as many
-		//times as we like until we upload the state again.
-		float4 randomnr = ranluxcl32(&ranluxclstate);
-
-		//Upload state again so that we don't get the same
-		//numbers over again the next time we use ranluxcl.
-		ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
-	}
-
-***** MACROS ***********************************************************************
-
-The following macros can optionally be defined:
-
-RANLUXCL_LUX:
-Sets the luxury level of the generator. Should be 0-4, or if it is 24 or larger it
-sets the p-value of the generator (generally not needed). If this macro is not set
-then lux=4 is the default (highest quality). For many applications the high quality
-of lux=4 may not be needed. Indeed if two values (each value having 24 random bits) 
-are glued together to form a 48-bit value the generator passes all tests in the TestU01
-suite already with lux=2. See "TestU01: A C Library for Empirical Testing of Random 
-Number Generators" by PIERRE L’ECUYER and RICHARD SIMARD. SWB(224, 10, 24)[24, l] is 
-RANLUX with two values glued together to create 48-bit numbers, and we see that it
-passes all tests already at luxury value 2.
-
-RANLUXCL_NO_WARMUP:
-Turns off the warmup functionality in ranluxcl_initialization. This macro should
-generally not be used, since the generators will initially be correlated if it is
-defined. The only advantage is that the numbers generated will exactly correspond
-to those of the original Fortran 77 implementation.
-
-RANLUXCL_SUPPORT_DOUBLE:
-Enables double precision functions. Please enable the OpenCL double precision
-extension yourself, usually by "#pragma OPENCL EXTENSION cl_khr_fp64 : enable".
-
-RANLUXCL_USE_LEGACY_INITIALIZATION
-Uses exactly the same initialization routine as in the original Fortran 77 code,
-leading to the same sequences. If using legacy initialization there are some
-restrictions on what the seed <ins> can be, and it may also be necessary to define
-RANLUXCL_MAXWORKITEMS if several sequences are to be run in parallel.
-
-RANLUXCL_MAXWORKITEMS:
-When RANLUXCL_USE_LEGACY_INITIALIZATION is defined we may need this macro.
-If several OpenCL NDRanges will be running in parallel and the parallel sequences should
-be different then this macro should have a value equal or larger than the
-largest number of work-items in any of the parallel runs. The default is to use the
-current global size, so if all NDRanges are of the same size this need not be
-defined.
-	Each parallel instance must also have different seeds <ins>. For example if
-we are launching 5120 work-items on GPU1 and 10240 work-items on GPU2 we would use
-different seeds for the two generators, and RANLUXCL_MAXWORKITEMS must be defined to
-be at least 10240. If GPU1 and GPU2 had the same number of work-items this would not
-be necessary. 
-	An underestimate of the highest permissible seed <ins> is given by the smallest of:
-(<maxins> = 10^9 / <numWorkitems>) or (<maxins> = 10^9 / RANLUXCL_MAXWORKITEMS).
-Please make sure that <ins> is never higher than this since it could cause undetected
-problems. For example with 10240 work-items the highest permissible <ins> is about
-100 000.
-	Again note that this is only relevant when using the legacy
-initialization function enabled by RANLUXCL_USE_LEGACY_INITIALIZATION. When not using
-the legacy initialization this macro is effectively set to a very high value of 2^32-1.
-
-***** FUNCTIONS: INITIALIZATION ****************************************************
-
-The initialization function is defined as:
-void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
-Run once at the very beginning. ranluxcltab should be a buffer with space for
-112 byte per work-item in the NDRange. <ins> is the seed to the generator.
-For a given <ins> each work-item in the NDRange will generate a different sequence.
-If more than one NDRange is used in parallel then <ins> must be different for each
-NDRange to avoid identical sequences.
-
-***** FUNCTIONS: SEED UPLOAD/DOWNLOAD **********************************************
-
-The following two functions should be launced at the beginning and end of a kernel
-that uses ranluxcl to generate numbers, respectively:
-
-void ranluxcl_download_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab)
-Run at the beginning of a kernel to download ranluxcl state data
-
-void ranluxcl_upload_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab)
-Run at the end of a kernel to upload state data
-
-***** FUNCTIONS: GENERATION AND SYNCHRONIZATION ************************************
-
-float4 ranluxcl32(ranluxcl_state_t *rst)
-Run to generate a pseudo-random float4 where each component is a number between
-0 and 1, end points not included (meaning the number will never be exactly 0 or 1).
-
-double4 ranluxcl64(ranluxcl_state_t *rst)
-Double precision version of the above function. The preprocessor macro
-RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available.
-This function "glues" together two single-precision numbers to make one double
-precision number. Most of the work is still done in single precision, so the
-performance will be roughly halved regardless of the double precision performance
-of the hardware.
-
-float4 ranluxcl32norm(ranluxcl_state_t *rst)
-Run to generate a pseudo-random float4 where each component is normally distributed
-with mean 0 and standard deviation 1.
-
-double4 ranluxcl64norm(ranluxcl_state_t *rst)
-Double precision version of the above function. The preprocessor macro
-RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available.
-
-void ranluxcl_synchronize(ranluxcl_state_t *rst)
-Run to synchronize execution in case different work-items have made a different
-number of calls to ranluxcl. On SIMD machines this could lead to inefficient execution.
-ranluxcl_synchronize allows us to make sure all generators are SIMD-friendly again. Not
-needed if all work-items always call ranluxcl the same number of times.
-
-***** PERFORMANCE ******************************************************************
-
-For luxury setting 4, performance on AMD Cypress should be ~4.5*10^9 pseudorandom 
-values per second, when not downloading values to host memory (i.e. the values are 
-just generated, but not used for anything in particular).
-
-***** DESCRIPTION OF THE IMPLEMENTATION ********************************************
-
-This code closely follows the original Fortran 77 code (see credit section). Here
-the differences (and similarities) between RANLUXCL (this implementation) and the
-original RANLUX are discussed.
-
-The Fortran 77 implementation uses a simple LCG to initialize the generator, and
-so the same approach is taken here. If RANLUXCL is initialized with <ins> = 0 as
-seed, the first work-item behaves like the original RANLUX with seed equal 1, the
-second work-item as if with seed equal 2 and so on. If <ins> = 1 then the first
-work-item behaves like the original RANLUX with seed equal to <numWorkitems> + 1,
-and so on for higher <ins> so that we never have overlapping sequences. This is
-why the RANLUXCL_MAXWORKITEMS macro must be set if we have different NDRanges with
-a different number of work-items.
-
-RANLUX is based on chaos theory, and what we are actually doing when selecting
-a luxury value is setting how many values to skip over (causing decorrelation).
-The number of values to skip is controlled by the so-called p-value of the
-generator. After generating 24 values we skip p - 24 values until again generating
-24 values.
-
-This implementation is somewhat modified from the original fortran implementation 
-by F. James. Because of the way the OpenCL code is optimized with 4-component 
-32-bit float vectors, it is most convenient to always throw away some multiple 
-of 24 values (i.e. p is always a multiple of 24).
-
-However, there might be some resonances if we always throw away a multiple of
-the seeds table size. Therefore the implementation is slightly more intricate
-where p can be a multiple of 4 instead, at a cost to performance (only about 10%
-lower than the cleaner 24 values approach on AMD Cypress). These two approaches
-are termed planar and planar shift respectively. The idea for the planar approach
-comes from the following paper:
-Vadim Demchik, Pseudo-random number generators for Monte Carlo simulations on 
-Graphics Processing Units, arXiv:1003.1898v1 [hep-lat]
-
-Below the p-values for the original reference implementation are listed along with 
-those of the planar shift implementation. Suggested values for the planar approach 
-are also presented. When this function is called with RANLUXCL_LUX set to 0-4, the
-planar shift values are used. To use the pure planar approach (for some extra
-performance with likely undetectable quality decrease), set lux equal to the specific 
-p-value.
-
-Luxury setting (RANLUXCL_LUX):                   0   1   2   3   4
-Original fortran77 implementation by F. James:  24  48  97  223 389
-Planar (suggested):                             24  48  120 240 408
-Planar shift:                                   24  48  100 224 404
-
-Note that levels 0 and 1 are the same as in the original implementation for both
-planar and planar shift. Level 4 of planar shift where p=404 is the same as chosen 
-for luxury level 1 by Martin Luescher for his v3 version of RANLUX. Therefore if 
-it is considered important to only use "official" values, luxury settings 0, 1 or 
-4 of planar shift should be used. It is however unlikely that the other values are 
-bad, they just haven't been as extensively used and tested by others.
-
-Variable names are generally the same as in the fortran77 implementation, however 
-because of the way the generator is implemented, the i24 and j24 variables are 
-no longer needed.
-
-***** CREDIT ***********************************************************************
-
-I have been told by Fred James (the coder) that the original Fortran 77 
-implementation (which is the subject of the second paper below) is free to use and 
-share. Therefore I am using the MIT license (below). But most importantly please 
-always remember to give credit to the two articles by Martin Luscher and Fred James, 
-describing the generator and the Fortran 77 implementation on which this 
-implementation is based, respectively:
-
-Martin Lüscher, A portable high-quality random number generator for lattice 
-field theory simulations, Computer Physics Communications 79 (1994) 100-110
-
-F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom 
-number generator of Lüscher, Computer Physics Communications 79 (1994) 111-114
-
-***** LICENSE **********************************************************************
-
-Copyright (c) 2011 Ivar Ursin Nikolaisen
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this 
-software and associated documentation files (the "Software"), to deal in the Software 
-without restriction, including without limitation the rights to use, copy, modify, 
-merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 
-permit persons to whom the Software is furnished to do so, subject to the following 
-conditions:
-
-The above copyright notice and this permission notice shall be included in all copies 
-or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
-INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 
-PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 
-HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 
-CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 
-OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-***************************************************************************************/
-
-typedef struct{
-	float
-		seed01, seed02, seed03, seed04,
-		seed05, seed06, seed07, seed08,
-		seed09, seed10, seed11, seed12,
-		seed13, seed14, seed15, seed16,
-		seed17, seed18, seed19, seed20,
-		seed21, seed22, seed23, seed24;
-	float carry;
-	float dummy; //Causes struct to be a multiple of 128 bits
-	int in24;
-	int stepnr;
-} ranluxcl_state_t;
-
-#define RANLUXCL_TWOM24 0.000000059604644775f
-#define RANLUXCL_TWOM12 0.000244140625f
-
-#ifdef RANLUXCL_LUX
-#if RANLUXCL_LUX < 0
-#error ranluxcl: lux must be zero or positive.
-#endif
-#else
-#define RANLUXCL_LUX 4 //Default to high quality
-#endif //RANLUXCL_LUX
-
-//Here the luxury values are defined
-#if RANLUXCL_LUX == 0
-#define RANLUXCL_NSKIP 0
-#elif RANLUXCL_LUX == 1
-#define RANLUXCL_NSKIP 24
-#elif RANLUXCL_LUX == 2
-#define RANLUXCL_NSKIP 76
-#elif RANLUXCL_LUX == 3
-#define RANLUXCL_NSKIP 200
-#elif RANLUXCL_LUX == 4
-#define RANLUXCL_NSKIP 380
-#else
-#define RANLUXCL_NSKIP (RANLUXCL_LUX - 24)
-#endif //RANLUXCL_LUX == 0
-
-//Check that nskip is a permissible value
-#if RANLUXCL_NSKIP % 4 != 0 
-#error nskip must be divisible by 4!
-#endif
-#if RANLUXCL_NSKIP < 24 && RANLUXCL_NSKIP != 0
-#error nskip must be either 0 or >= 24!
-#endif
-#if RANLUXCL_NSKIP < 0
-#error nskip is negative!
-#endif
-
-//Check if planar scheme is recovered
-#if RANLUXCL_NSKIP % 24 == 0
-#define RANLUXCL_PLANAR
-#endif
-
-//Check if we will skip at all
-#if RANLUXCL_NSKIP == 0
-#define RANLUXCL_NOSKIP
-#endif
-
-//Single-value global size and id
-#define RANLUXCL_NUMWORKITEMS (get_global_size(0) * get_global_size(1) * get_global_size(2))
-#define RANLUXCL_MYID (get_global_id(0) + get_global_id(1) * get_global_size(0) + get_global_id(2) * get_global_size(0) * get_global_size(1))
-
-void ranluxcl_download_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab)
-{
-	(*rst) = ranluxcltab[RANLUXCL_MYID];
-}
-
-void ranluxcl_upload_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab)
-{
-	ranluxcltab[RANLUXCL_MYID] = (*rst);
-}
-
-float ranluxcl_onestep(float sj24m1, float sj24, float *si24, float *carry){
-	float uni, out;
-	uni = sj24 - (*si24) - (*carry);
-	if(uni < 0.0f){
-		uni += 1.0f;
-		(*carry) = RANLUXCL_TWOM24;
-	} else (*carry) = 0.0f;
-	out = ((*si24) = uni);
-
-	if(uni < RANLUXCL_TWOM12){
-		out += RANLUXCL_TWOM24 * sj24m1;
-		if(out == 0.0f) out = RANLUXCL_TWOM24 * RANLUXCL_TWOM24;
-	}
-	return out;
-}
-
-float4 ranluxcl32(ranluxcl_state_t *rst)
-{
-	//ranluxcl32 returns a 4-component float vector where each component is uniformly distributed
-	//between 0-1, end points not included.
-
-	float4 out;
-
-	if((*rst).stepnr == 0){
-		out.x = ranluxcl_onestep((*rst).seed09, (*rst).seed10, &((*rst).seed24), &((*rst).carry));
-		out.y = ranluxcl_onestep((*rst).seed08, (*rst).seed09, &((*rst).seed23), &((*rst).carry));
-		out.z = ranluxcl_onestep((*rst).seed07, (*rst).seed08, &((*rst).seed22), &((*rst).carry));
-		out.w = ranluxcl_onestep((*rst).seed06, (*rst).seed07, &((*rst).seed21), &((*rst).carry));
-		(*rst).stepnr += 4;
-	}
-
-	else if((*rst).stepnr == 4){
-		out.x = ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry));
-		out.y = ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry));
-		out.z = ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry));
-		out.w = ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry));
-		(*rst).stepnr += 4;
-	}
-
-	else if((*rst).stepnr == 8){
-		out.x = ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry));
-		out.y = ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry));
-		out.z = ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry));
-		out.w = ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry));
-		(*rst).stepnr += 4;
-	}
-
-	else if((*rst).stepnr == 12){
-		out.x = ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry));
-		out.y = ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry));
-		out.z = ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry));
-		out.w = ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry));
-		(*rst).stepnr += 4;
-	}
-
-	else if((*rst).stepnr == 16){
-		out.x = ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry));
-		out.y = ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry));
-		out.z = ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry));
-		out.w = ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry));
-		(*rst).stepnr += 4;
-	}
-
-	else if((*rst).stepnr == 20){
-		out.x = ranluxcl_onestep((*rst).seed13, (*rst).seed14, &((*rst).seed04), &((*rst).carry));
-		out.y = ranluxcl_onestep((*rst).seed12, (*rst).seed13, &((*rst).seed03), &((*rst).carry));
-		out.z = ranluxcl_onestep((*rst).seed11, (*rst).seed12, &((*rst).seed02), &((*rst).carry));
-		out.w = ranluxcl_onestep((*rst).seed10, (*rst).seed11, &((*rst).seed01), &((*rst).carry));
-		(*rst).stepnr = 0;
-
-//The below preprocessor directives are here to recover the simpler planar scheme when nskip is a multiple of 24.
-//For the most general planar shift approach, just ignore all #if's below.
-#ifndef RANLUXCL_PLANAR
-	}
-
-	(*&((*rst).in24)) += 4;
-	if((*&((*rst).in24)) == 24){
-		(*&((*rst).in24)) = 0;
-#endif //RANLUXCL_PLANAR
-
-		int initialskips = ((*rst).stepnr) ? (24 - (*rst).stepnr) : 0;
-		int bulkskips = ((RANLUXCL_NSKIP - initialskips)/24) * 24;
-		int remainingskips = RANLUXCL_NSKIP - initialskips - bulkskips;
-
-//We know there won't be any initial skips in the planar scheme
-#ifndef RANLUXCL_PLANAR
-		//Do initial skips (lack of breaks in switch is intentional).
-		switch(initialskips){
-			case(20):
-				ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry));
-			case(16):
-				ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry));
-			case(12):
-				ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry));
-			case(8):
-				ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry));
-			case(4):
-				ranluxcl_onestep((*rst).seed13, (*rst).seed14, &((*rst).seed04), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed12, (*rst).seed13, &((*rst).seed03), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed11, (*rst).seed12, &((*rst).seed02), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed10, (*rst).seed11, &((*rst).seed01), &((*rst).carry));
-		}
-#endif //RANLUXCL_PLANAR
-
-//Also check if we will ever need to skip at all
-#ifndef RANLUXCL_NOSKIP
-		for(int i=0; i<bulkskips/24; i++){
-			ranluxcl_onestep((*rst).seed09, (*rst).seed10, &((*rst).seed24), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed08, (*rst).seed09, &((*rst).seed23), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed07, (*rst).seed08, &((*rst).seed22), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed06, (*rst).seed07, &((*rst).seed21), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed13, (*rst).seed14, &((*rst).seed04), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed12, (*rst).seed13, &((*rst).seed03), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed11, (*rst).seed12, &((*rst).seed02), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed10, (*rst).seed11, &((*rst).seed01), &((*rst).carry));
-		}
-#endif //RANLUXCL_NOSKIP
-
-//There also won't be any remaining skips in the planar scheme
-#ifndef RANLUXCL_PLANAR
-		//Do remaining skips
-		if(remainingskips){
-			ranluxcl_onestep((*rst).seed09, (*rst).seed10, &((*rst).seed24), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed08, (*rst).seed09, &((*rst).seed23), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed07, (*rst).seed08, &((*rst).seed22), &((*rst).carry));
-			ranluxcl_onestep((*rst).seed06, (*rst).seed07, &((*rst).seed21), &((*rst).carry));
-
-			if(remainingskips > 4){
-				ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry));
-			}
-
-			if(remainingskips > 8){
-				ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry));
-			}
-
-			if(remainingskips > 12){
-				ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry));
-			}
-
-			if(remainingskips > 16){
-				ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry));
-				ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry));
-			}
-		}
-#endif //RANLUXCL_PLANAR
-
-		//Initial skips brought stepnr down to 0. The bulk skips did only full cycles.
-		//Therefore stepnr is now equal to remainingskips.
-		(*rst).stepnr = remainingskips;
-	}
-
-	return out;
-}
-
-void ranluxcl_synchronize(ranluxcl_state_t *rst){
-	//This function generates numbers so that the generator is at the beginning,
-	//i.e. ready to generate 24 numbers before the next skipping sequence. This is
-	//useful if different work-items have called ranluxcl a different number of times.
-	//Since that would lead to out of sync execution it could be rather inefficient on
-	//SIMD architectures like GPUs. This function thus allows us to resynchronize
-	//execution across all work-items.
-
-	//Do necessary number of calls to ranluxcl so that stepnr == 0 at the end.
-	if((*rst).stepnr == 4)
-		ranluxcl32(rst);
-	if((*rst).stepnr == 8)
-		ranluxcl32(rst);
-	if((*rst).stepnr == 12)
-		ranluxcl32(rst);
-	if((*rst).stepnr == 16)
-		ranluxcl32(rst);
-	if((*rst).stepnr == 20)
-		ranluxcl32(rst);
-}
-
-void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
-{
-	ranluxcl_state_t rst;
-
-	#ifdef RANLUXCL_USE_LEGACY_INITIALIZATION
-	//Using legacy initialization from original Fortan 77 implementation
-
-	//ins is scaled so that if the user makes another call somewhere else
-	//with ins + 1 there should be no overlap. Also adding one
-	//allows us to use ins = 0.
-	int k, maxWorkitems;
-
-	#ifdef RANLUXCL_MAXWORKITEMS
-	maxWorkitems = RANLUXCL_MAXWORKITEMS;
-	#else
-	maxWorkitems = RANLUXCL_NUMWORKITEMS;
-	#endif //RANLUXCL_MAXWORKITEMS
-
-	int scaledins = ins * maxWorkitems + 1;
-
-	int js = scaledins + RANLUXCL_MYID;
-
-	//Make sure js is not too small (should really be an error)
-	if(js < 1)
-		js = 1;
-
-	#define IC 2147483563
-	#define ITWO24 16777216
-
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed01=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed02=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed03=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed04=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed05=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed06=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed07=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed08=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed09=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed10=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed11=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed12=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed13=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed14=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed15=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed16=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed17=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed18=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed19=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed20=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed21=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed22=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed23=(js%ITWO24)*RANLUXCL_TWOM24;
-	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed24=(js%ITWO24)*RANLUXCL_TWOM24;
-
-	#undef IC
-	#undef ITWO24
-
-	#else //RANLUXCL_USE_LEGACY_INITIALIZATION
-	//Using default initialization
-
-	#define RANLUXCL_POW2_24 16777216
-	#define RANLUXCL_56 0x00FFFFFFFFFFFFFF
-	#define RANLUXCL_48 0x0000FFFFFFFFFFFF
-	#define RANLUXCL_40 0x000000FFFFFFFFFF
-	#define RANLUXCL_32 0x00000000FFFFFFFF
-	#define RANLUXCL_24 0x0000000000FFFFFF
-	#define RANLUXCL_16 0x000000000000FFFF
-	#define RANLUXCL_8  0x00000000000000FF
-
-	//We scale ins by (2^32)-1. As long as we never use more than (2^32)-1 work-items per
-	//NDRange the initial states should never be the same. We use a simple 64-bit LCG from
-	//the SPRNG library. We choose a single prime b, while in SPRNG for parallel applications
-	//different b give rise to different sequences in parallel (a feature not used here).
-	ulong x1, x2, x3;
-	ulong x = (ulong)RANLUXCL_MYID + (ulong)ins * 4294967295;
-	ulong a = 2862933555777941757;
-	ulong b = 3037000493;
-
-	//Logical shifts used so that all 64 bits of LCG output are used (24 bits per float),
-	//to be certain that all initial states are different.
-	x1 = x = ((a * x + b) & ULONG_MAX);
-	x2 = x = ((a * x + b) & ULONG_MAX);
-	x3 = x = ((a * x + b) & ULONG_MAX);
-	rst.seed01 = (float)  (x1 >> 40)                              / (float)RANLUXCL_POW2_24;
-	rst.seed02 = (float) ((x1 & RANLUXCL_40) >> 16)               / (float)RANLUXCL_POW2_24;
-	rst.seed03 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56))  / (float)RANLUXCL_POW2_24;
-	rst.seed04 = (float) ((x2 & RANLUXCL_56) >> 32)               / (float)RANLUXCL_POW2_24;
-	rst.seed05 = (float) ((x2 & RANLUXCL_32) >> 8)                / (float)RANLUXCL_POW2_24;
-	rst.seed06 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24;
-	rst.seed07 = (float) ((x3 & RANLUXCL_48) >> 24)               / (float)RANLUXCL_POW2_24;
-	rst.seed08 = (float)  (x3 & RANLUXCL_24)                      / (float)RANLUXCL_POW2_24;
-
-	x1 = x = ((a * x + b) & ULONG_MAX);
-	x2 = x = ((a * x + b) & ULONG_MAX);
-	x3 = x = ((a * x + b) & ULONG_MAX);
-	rst.seed09 = (float)  (x1 >> 40)                              / (float)RANLUXCL_POW2_24;
-	rst.seed10 = (float) ((x1 & RANLUXCL_40) >> 16)               / (float)RANLUXCL_POW2_24;
-	rst.seed11 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56))  / (float)RANLUXCL_POW2_24;
-	rst.seed12 = (float) ((x2 & RANLUXCL_56) >> 32)               / (float)RANLUXCL_POW2_24;
-	rst.seed13 = (float) ((x2 & RANLUXCL_32) >> 8)                / (float)RANLUXCL_POW2_24;
-	rst.seed14 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24;
-	rst.seed15 = (float) ((x3 & RANLUXCL_48) >> 24)               / (float)RANLUXCL_POW2_24;
-	rst.seed16 = (float)  (x3 & RANLUXCL_24)                      / (float)RANLUXCL_POW2_24;
-
-	x1 = x = ((a * x + b) & ULONG_MAX);
-	x2 = x = ((a * x + b) & ULONG_MAX);
-	x3 = x = ((a * x + b) & ULONG_MAX);
-	rst.seed17 = (float)  (x1 >> 40)                              / (float)RANLUXCL_POW2_24;
-	rst.seed18 = (float) ((x1 & RANLUXCL_40) >> 16)               / (float)RANLUXCL_POW2_24;
-	rst.seed19 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56))  / (float)RANLUXCL_POW2_24;
-	rst.seed20 = (float) ((x2 & RANLUXCL_56) >> 32)               / (float)RANLUXCL_POW2_24;
-	rst.seed21 = (float) ((x2 & RANLUXCL_32) >> 8)                / (float)RANLUXCL_POW2_24;
-	rst.seed22 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24;
-	rst.seed23 = (float) ((x3 & RANLUXCL_48) >> 24)               / (float)RANLUXCL_POW2_24;
-	rst.seed24 = (float)  (x3 & RANLUXCL_24)                      / (float)RANLUXCL_POW2_24;
-
-	#undef RANLUXCL_POW2_24
-	#undef RANLUXCL_56
-	#undef RANLUXCL_48
-	#undef RANLUXCL_40
-	#undef RANLUXCL_32
-	#undef RANLUXCL_24
-	#undef RANLUXCL_16
-	#undef RANLUXCL_8
-
-	#endif //RANLUXCL_USE_LEGACY_INITIALIZATION
-
-	rst.in24 = 0;
-	rst.stepnr = 0;
-	rst.carry = 0.0f;
-	if(rst.seed24 == 0.0f)
-		rst.carry = RANLUXCL_TWOM24;
-
-	#ifndef RANLUXCL_NO_WARMUP
-	//Warming up the generator, ensuring there are no initial correlations.
-	//16 is a "magic number". It is the number of times we must generate
-	//a batch of 24 numbers to ensure complete decorrelation.
-	for(int i=0; i<16; i++){
-		ranluxcl_onestep(rst.seed09, rst.seed10, &(rst.seed24), &(rst.carry));
-		ranluxcl_onestep(rst.seed08, rst.seed09, &(rst.seed23), &(rst.carry));
-		ranluxcl_onestep(rst.seed07, rst.seed08, &(rst.seed22), &(rst.carry));
-		ranluxcl_onestep(rst.seed06, rst.seed07, &(rst.seed21), &(rst.carry));
-		ranluxcl_onestep(rst.seed05, rst.seed06, &(rst.seed20), &(rst.carry));
-		ranluxcl_onestep(rst.seed04, rst.seed05, &(rst.seed19), &(rst.carry));
-		ranluxcl_onestep(rst.seed03, rst.seed04, &(rst.seed18), &(rst.carry));
-		ranluxcl_onestep(rst.seed02, rst.seed03, &(rst.seed17), &(rst.carry));
-		ranluxcl_onestep(rst.seed01, rst.seed02, &(rst.seed16), &(rst.carry));
-		ranluxcl_onestep(rst.seed24, rst.seed01, &(rst.seed15), &(rst.carry));
-		ranluxcl_onestep(rst.seed23, rst.seed24, &(rst.seed14), &(rst.carry));
-		ranluxcl_onestep(rst.seed22, rst.seed23, &(rst.seed13), &(rst.carry));
-		ranluxcl_onestep(rst.seed21, rst.seed22, &(rst.seed12), &(rst.carry));
-		ranluxcl_onestep(rst.seed20, rst.seed21, &(rst.seed11), &(rst.carry));
-		ranluxcl_onestep(rst.seed19, rst.seed20, &(rst.seed10), &(rst.carry));
-		ranluxcl_onestep(rst.seed18, rst.seed19, &(rst.seed09), &(rst.carry));
-		ranluxcl_onestep(rst.seed17, rst.seed18, &(rst.seed08), &(rst.carry));
-		ranluxcl_onestep(rst.seed16, rst.seed17, &(rst.seed07), &(rst.carry));
-		ranluxcl_onestep(rst.seed15, rst.seed16, &(rst.seed06), &(rst.carry));
-		ranluxcl_onestep(rst.seed14, rst.seed15, &(rst.seed05), &(rst.carry));
-		ranluxcl_onestep(rst.seed13, rst.seed14, &(rst.seed04), &(rst.carry));
-		ranluxcl_onestep(rst.seed12, rst.seed13, &(rst.seed03), &(rst.carry));
-		ranluxcl_onestep(rst.seed11, rst.seed12, &(rst.seed02), &(rst.carry));
-		ranluxcl_onestep(rst.seed10, rst.seed11, &(rst.seed01), &(rst.carry));
-	}
-	#endif //RANLUXCL_NO_WARMUP
-
-	//Upload the state
-	ranluxcl_upload_seed(&rst, ranluxcltab);
-}
-
-float4 ranluxcl32norm(ranluxcl_state_t *rst)
-{
-	//Returns a vector where each component is a normally
-	//distributed PRN centered on 0, with standard deviation
-	//1. Note: M_PI_F is an OpenCL macro for the value of pi.
-	//M_PI would be the 64-bit double version.
-
-	float4 U = ranluxcl32(rst);
-
-	float4 Z;
-	float R, phi;
-
-	R = sqrt(-2 * log(U.x));
-	phi = 2 * M_PI_F * U.y;
-	Z.x = R * cos(phi);
-	Z.y = R * sin(phi);
-
-	R = sqrt(-2 * log(U.z));
-	phi = 2 * M_PI_F * U.w;
-	Z.z = R * cos(phi);
-	Z.w = R * sin(phi);
-
-	return Z;
-}
-
-#ifdef RANLUXCL_SUPPORT_DOUBLE
-double4 ranluxcl64(ranluxcl_state_t *rst)
-{
-	double4 out;
-	float4 randvec;
-
-	//We know this value is caused by the never-zero part
-	//of the original algorithm, but we want to allow zero for
-	//the most significant bits in the double precision result.
-	randvec = ranluxcl32(rst);
-	if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
-		randvec.x = 0.0f;
-	if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
-		randvec.z = 0.0f;
-
-	out.x = (double)(randvec.x) + (double)(randvec.y) / 16777216;
-	out.y = (double)(randvec.z) + (double)(randvec.w) / 16777216;
-
-	randvec = ranluxcl32(rst);
-	if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
-		randvec.x = 0.0f;
-	if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
-		randvec.z = 0.0f;
-
-	out.z = (double)(randvec.x) + (double)(randvec.y) / 16777216;
-	out.w = (double)(randvec.z) + (double)(randvec.w) / 16777216;
-
-	return out;
-}
-
-double4 ranluxcl64norm(ranluxcl_state_t *rst)
-{
-	//Returns a vector where each component is a normally
-	//distributed PRN centered on 0, with standard deviation
-	//1.
-
-	double4 U = ranluxcl64(rst);
-
-	double4 Z;
-	double R, phi;
-
-	R = sqrt(-2 * log(U.x));
-	phi = 2 * M_PI * U.y;
-	Z.x = R * cos(phi);
-	Z.y = R * sin(phi);
-
-	R = sqrt(-2 * log(U.z));
-	phi = 2 * M_PI * U.w;
-	Z.z = R * cos(phi);
-	Z.w = R * sin(phi);
-
-	return Z;
-}
-#endif //RANLUXCL_SUPPORT_DOUBLE
-
-#undef RANLUXCL_TWOM24
-#undef RANLUXCL_TWOM12
-#undef RANLUXCL_NUMWORKITEMS
-#undef RANLUXCL_MYID
-#undef RANLUXCL_PLANAR
-#undef RANLUXCL_NOSKIP
-
-#endif //RANLUXCL_CL
+#ifndef RANLUXCL_CL
+#define RANLUXCL_CL
+
+/**** RANLUXCL v1.3.1 MODIFIED *************************************************
+
+Implements the RANLUX generator of Matrin Luscher, based on the Fortran 77
+implementation by Fred James. This OpenCL code is a complete implementation
+which should perfectly replicate the numbers generated by the original Fortran
+77 implementation (if using the legacy initialization routine).
+
+***** QUICK USAGE DESCRIPTION **************************************************
+
+1. Create an OpenCL buffer with room for at least 28 32-bit variables
+(112 byte). I.e., in C/C++: size_t buffSize = numWorkitems * 112;
+
+2. Pass the buffer and an unsigned integer seed <ins> to a kernel that launches
+the ranluxcl_initialization function. The seed <ins> can be any unsigned 32-bit
+integer, and must be different on different OpenCL devices/NDRanges to ensure
+different sequences. As long as the number of work-items on each device/NDRange
+is less than 2^32 = 4294967296 all sequences will be different.
+An examle initialization kernel would be:
+	#include "ranluxcl.cl"
+	kernel void Kernel_Ranluxcl_Init(private uint ins,
+		global ranluxcl_state_t *ranluxcltab)
+	{
+		ranluxcl_initialization(ins, ranluxcltab);
+	}
+
+3. Now the generator is ready for use. Remember to download the seeds first,
+and upload them again when done. Example kernel that downloads seeds, generates
+a float4 where each component is uniformly distributed between 0 and 1, end
+points not included, then uploads the seeds again:
+	#include "ranluxcl.cl"
+	kernel void Kernel_Example(global ranluxcl_state_t *ranluxcltab)
+	{
+		//ranluxclstate is a struct of 7 float4 variables
+		//storing the state of the generator.
+		ranluxcl_state_t ranluxclstate;
+
+		//Download state into ranluxclstate struct.
+		ranluxcl_download_seed(&ranluxclstate, ranluxcltab);
+
+		//Generate a float4 with each component on (0,1),
+		//end points not included. We can call ranluxcl as many
+		//times as we like until we upload the state again.
+		float4 randomnr = ranluxcl32(&ranluxclstate);
+
+		//Upload state again so that we don't get the same
+		//numbers over again the next time we use ranluxcl.
+		ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
+	}
+
+***** MACROS *******************************************************************
+
+The following macros can optionally be defined:
+
+RANLUXCL_LUX:
+Sets the luxury level of the generator. Should be 0-4, or if it is 24 or larger
+it sets the p-value of the generator (generally not needed). If this macro is
+not set then lux=4 is the default (highest quality). For many applications the
+high quality of lux=4 may not be needed. Indeed if two values (each value
+having 24 random bits) are glued together to form a 48-bit value the generator
+passes all tests in the TestU01 suite already with lux=2. See
+"TestU01: A C Library for Empirical Testing of Random Number Generators" by
+PIERRE LÆECUYER and RICHARD SIMARD. SWB(224, 10, 24)[24, l] is RANLUX with
+two values glued together to create 48-bit numbers, and we see that it passes
+all tests already at luxury value 2.
+
+RANLUXCL_NO_WARMUP:
+Turns off the warmup functionality in ranluxcl_initialization. This macro
+should generally not be used, since the generators will initially be correlated
+if it is defined. The only advantage is that the numbers generated will exactly
+correspond to those of the original Fortran 77 implementation.
+
+RANLUXCL_SUPPORT_DOUBLE:
+Enables double precision functions. Please enable the OpenCL double precision
+extension yourself, usually by "#pragma OPENCL EXTENSION cl_khr_fp64 : enable".
+
+RANLUXCL_USE_LEGACY_INITIALIZATION
+Uses exactly the same initialization routine as in the original Fortran 77 code,
+leading to the same sequences. If using legacy initialization there are some
+restrictions on what the seed <ins> can be, and it may also be necessary to
+define RANLUXCL_MAXWORKITEMS if several sequences are to be run in parallel.
+
+RANLUXCL_MAXWORKITEMS:
+When RANLUXCL_USE_LEGACY_INITIALIZATION is defined we may need this macro.
+If several OpenCL NDRanges will be running in parallel and the parallel
+sequences should be different then this macro should have a value equal or
+larger than the
+largest number of work-items in any of the parallel runs. The default is to
+use the current global size, so if all NDRanges are of the same size this need
+not be defined.
+	Each parallel instance must also have different seeds <ins>. For example if
+we are launching 5120 work-items on GPU1 and 10240 work-items on GPU2 we would
+use different seeds for the two generators, and RANLUXCL_MAXWORKITEMS must be
+defined to be at least 10240. If GPU1 and GPU2 had the same number of work-items
+this would not be necessary. 
+	An underestimate of the highest permissible seed <ins> is given by the
+smallest of:
+(<maxins> = 10^9 / <numWorkitems>) or (<maxins> = 10^9 / RANLUXCL_MAXWORKITEMS).
+Please make sure that <ins> is never higher than this since it could cause
+undetected problems. For example with 10240 work-items the highest permissible
+<ins> is about 100 000.
+	Again note that this is only relevant when using the legacy initialization
+function enabled by RANLUXCL_USE_LEGACY_INITIALIZATION. When not using the
+legacy initialization this macro is effectively set to a very high value of
+2^32-1.
+
+***** FUNCTIONS: INITIALIZATION ************************************************
+
+The initialization function is defined as:
+void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
+Run once at the very beginning. ranluxcltab should be a buffer with space for
+112 byte per work-item in the NDRange. <ins> is the seed to the generator.
+For a given <ins> each work-item in the NDRange will generate a different
+sequence. If more than one NDRange is used in parallel then <ins> must be
+different for each NDRange to avoid identical sequences.
+
+***** FUNCTIONS: SEED UPLOAD/DOWNLOAD ******************************************
+
+The following two functions should be launced at the beginning and end of a
+kernel that uses ranluxcl to generate numbers, respectively:
+
+void ranluxcl_download_seed(ranluxcl_state_t *rst,
+	global ranluxcl_state_t *ranluxcltab)
+Run at the beginning of a kernel to download ranluxcl state data
+
+void ranluxcl_upload_seed(ranluxcl_state_t *rst,
+	global ranluxcl_state_t *ranluxcltab)
+Run at the end of a kernel to upload state data
+
+***** FUNCTIONS: GENERATION AND SYNCHRONIZATION ********************************
+
+float4 ranluxcl32(ranluxcl_state_t *rst)
+Run to generate a pseudo-random float4 where each component is a number between
+0 and 1, end points not included (meaning the number will never be exactly 0 or
+1).
+
+double4 ranluxcl64(ranluxcl_state_t *rst)
+Double precision version of the above function. The preprocessor macro
+RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available.
+This function "glues" together two single-precision numbers to make one double
+precision number. Most of the work is still done in single precision, so the
+performance will be roughly halved regardless of the double precision
+performance of the hardware.
+
+float4 ranluxcl32norm(ranluxcl_state_t *rst)
+Run to generate a pseudo-random float4 where each component is normally
+distributed with mean 0 and standard deviation 1.
+
+double4 ranluxcl64norm(ranluxcl_state_t *rst)
+Double precision version of the above function. The preprocessor macro
+RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available.
+
+void ranluxcl_synchronize(ranluxcl_state_t *rst)
+Run to synchronize execution in case different work-items have made a different
+number of calls to ranluxcl. On SIMD machines this could lead to inefficient
+execution. ranluxcl_synchronize allows us to make sure all generators are
+SIMD-friendly again. Not needed if all work-items always call ranluxcl the same
+number of times.
+
+***** PERFORMANCE **************************************************************
+
+For luxury setting 4, performance on AMD Cypress should be ~4.5*10^9 pseudo-
+random values per second, when not downloading values to host memory (i.e. the
+values are just generated, but not used for anything in particular).
+
+***** DESCRIPTION OF THE IMPLEMENTATION ****************************************
+
+This code closely follows the original Fortran 77 code (see credit section).
+Here the differences (and similarities) between RANLUXCL (this implementation)
+and the original RANLUX are discussed.
+
+The Fortran 77 implementation uses a simple LCG to initialize the generator, and
+so the same approach is taken here. If RANLUXCL is initialized with <ins> = 0 as
+seed, the first work-item behaves like the original RANLUX with seed equal 1,
+the second work-item as if with seed equal 2 and so on. If <ins> = 1 then the
+first work-item behaves like the original RANLUX with seed equal to
+<numWorkitems> + 1, and so on for higher <ins> so that we never have overlapping
+sequences. This is why the RANLUXCL_MAXWORKITEMS macro must be set if we have
+different NDRanges with a different number of work-items.
+
+RANLUX is based on chaos theory, and what we are actually doing when selecting
+a luxury value is setting how many values to skip over (causing decorrelation).
+The number of values to skip is controlled by the so-called p-value of the
+generator. After generating 24 values we skip p - 24 values until again
+generating 24 values.
+
+This implementation is somewhat modified from the original fortran
+implementation by F. James. Because of the way the OpenCL code is optimized with
+4-component 32-bit float vectors, it is most convenient to always throw away
+some multiple of 24 values (i.e. p is always a multiple of 24).
+
+However, there might be some resonances if we always throw away a multiple of
+the seeds table size. Therefore the implementation is slightly more intricate
+where p can be a multiple of 4 instead, at a cost to performance (only about 10%
+lower than the cleaner 24 values approach on AMD Cypress). These two approaches
+are termed planar and planar shift respectively. The idea for the planar
+approach comes from the following paper:
+Vadim Demchik, Pseudo-random number generators for Monte Carlo simulations on
+Graphics Processing Units, arXiv:1003.1898v1 [hep-lat]
+
+Below the p-values for the original reference implementation are listed along
+with those of the planar shift implementation. Suggested values for the planar
+approach are also presented. When this function is called with RANLUXCL_LUX
+set to 0-4, the planar shift values are used. To use the pure planar approach
+(for some extra performance with likely undetectable quality decrease), set lux
+equal to the specific p-value.
+
+Luxury setting (RANLUXCL_LUX):                   0   1   2   3   4
+Original fortran77 implementation by F. James:  24  48  97  223 389
+Planar (suggested):                             24  48  120 240 408
+Planar shift:                                   24  48  100 224 404
+
+Note that levels 0 and 1 are the same as in the original implementation for both
+planar and planar shift. Level 4 of planar shift where p=404 is the same as
+chosen for luxury level 1 by Martin Luescher for his v3 version of RANLUX.
+Therefore if it is considered important to only use "official" values, luxury
+settings 0, 1 or 4 of planar shift should be used. It is however unlikely that
+the other values are bad, they just haven't been as extensively used and tested
+by others.
+
+Variable names are generally the same as in the fortran77 implementation,
+however because of the way the generator is implemented, the i24 and j24
+variables are no longer needed.
+
+***** CREDIT *******************************************************************
+
+I have been told by Fred James (the coder) that the original Fortran 77
+implementation (which is the subject of the second paper below) is free to use
+and share. Therefore I am using the MIT license (below). But most importantly
+please always remember to give credit to the two articles by Martin Luscher and
+Fred James, describing the generator and the Fortran 77 implementation on which
+this implementation is based, respectively:
+
+Martin Lscher, A portable high-quality random number generator for lattice
+field theory simulations, Computer Physics Communications 79 (1994) 100-110
+
+F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom
+number generator of Lscher, Computer Physics Communications 79 (1994) 111-114
+
+***** LICENSE ******************************************************************
+
+Copyright (c) 2011 Ivar Ursin Nikolaisen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*******************************************************************************/
+
+typedef struct{
+	float
+		s01, s02, s03, s04,
+		s05, s06, s07, s08,
+		s09, s10, s11, s12,
+		s13, s14, s15, s16,
+		s17, s18, s19, s20,
+		s21, s22, s23, s24;
+	float carry;
+	float dummy; //Causes struct to be a multiple of 128 bits
+	int in24;
+	int stepnr;
+} ranluxcl_state_t;
+
+//Initial prototypes makes Apple's compiler happy
+void ranluxcl_download_seed(ranluxcl_state_t *, global ranluxcl_state_t *);
+void ranluxcl_upload_seed(ranluxcl_state_t *, global ranluxcl_state_t *);
+float ranluxcl_os(float, float, float *, float *);
+float4 ranluxcl32(ranluxcl_state_t *);
+void ranluxcl_synchronize(ranluxcl_state_t *);
+void ranluxcl_initialization(uint, global ranluxcl_state_t *);
+float4 ranluxcl32norm(ranluxcl_state_t *);
+
+#ifdef RANLUXCL_SUPPORT_DOUBLE
+double4 ranluxcl64(ranluxcl_state_t *);
+double4 ranluxcl64norm(ranluxcl_state_t *);
+#endif
+
+#define RANLUXCL_TWOM24 0.000000059604644775f
+#define RANLUXCL_TWOM12 0.000244140625f
+
+#ifdef RANLUXCL_LUX
+#if RANLUXCL_LUX < 0
+#error ranluxcl: lux must be zero or positive.
+#endif
+#else
+#define RANLUXCL_LUX 4 //Default to high quality
+#endif //RANLUXCL_LUX
+
+//Here the luxury values are defined
+#if RANLUXCL_LUX == 0
+#define RANLUXCL_NSKIP 0
+#elif RANLUXCL_LUX == 1
+#define RANLUXCL_NSKIP 24
+#elif RANLUXCL_LUX == 2
+#define RANLUXCL_NSKIP 76
+#elif RANLUXCL_LUX == 3
+#define RANLUXCL_NSKIP 200
+#elif RANLUXCL_LUX == 4
+#define RANLUXCL_NSKIP 380
+#else
+#define RANLUXCL_NSKIP (RANLUXCL_LUX - 24)
+#endif //RANLUXCL_LUX == 0
+
+//Check that nskip is a permissible value
+#if RANLUXCL_NSKIP % 4 != 0 
+#error nskip must be divisible by 4!
+#endif
+#if RANLUXCL_NSKIP < 24 && RANLUXCL_NSKIP != 0
+#error nskip must be either 0 or >= 24!
+#endif
+#if RANLUXCL_NSKIP < 0
+#error nskip is negative!
+#endif
+
+//Check if planar scheme is recovered
+#if RANLUXCL_NSKIP % 24 == 0
+#define RANLUXCL_PLANAR
+#endif
+
+//Check if we will skip at all
+#if RANLUXCL_NSKIP == 0
+#define RANLUXCL_NOSKIP
+#endif
+
+//Single-value global size and id
+#define RANLUXCL_NUMWORKITEMS \
+	(get_global_size(0) * get_global_size(1) * get_global_size(2))
+#define RANLUXCL_MYID \
+	(get_global_id(0) + get_global_id(1) * get_global_size(0) + \
+	 get_global_id(2) * get_global_size(0) * get_global_size(1))
+
+void ranluxcl_download_seed(ranluxcl_state_t *rst,
+	global ranluxcl_state_t *ranluxcltab)
+{
+	(*rst) = ranluxcltab[RANLUXCL_MYID];
+}
+
+void ranluxcl_upload_seed(ranluxcl_state_t *rst,
+	global ranluxcl_state_t *ranluxcltab)
+{
+        global ranluxcl_state_t *myentry = ranluxcltab + RANLUXCL_MYID;
+        *myentry = *rst;
+        /*
+        printf((char const *) "(w) id: (%d/%d,%d/%d,%d/%d,LLL,%d/%d,%d/%d/%d/%d) %d\n",
+            get_global_id(0),
+            get_global_size(0),
+            get_global_id(1),
+            get_global_size(1),
+            get_global_id(2),
+            get_global_size(2),
+
+            get_local_id(0),
+            get_local_size(0),
+            get_local_id(1),
+            get_local_size(1),
+            get_local_id(2),
+            get_local_size(2),
+            RANLUXCL_MYID);
+            */
+
+        ranluxcltab[RANLUXCL_MYID] = (*rst);
+}
+
+/*
+ * Performs one "step" (generates a single value or skip). Only used internally,
+ * not intended to be called from user code.
+ */
+float ranluxcl_os(float sj24m1, float sj24, float *si24, float *carry)
+{
+	float uni, out;
+	uni = sj24 - (*si24) - (*carry);
+	if(uni < 0.0f){
+		uni += 1.0f;
+		(*carry) = RANLUXCL_TWOM24;
+	} else (*carry) = 0.0f;
+	out = ((*si24) = uni);
+
+	if(uni < RANLUXCL_TWOM12){
+		out += RANLUXCL_TWOM24 * sj24m1;
+		if(out == 0.0f) out = RANLUXCL_TWOM24 * RANLUXCL_TWOM24;
+	}
+	return out;
+}
+
+/*
+ * Return a float4 where each component is a uniformly distributed pseudo-
+ * random value between 0 and 1, end points not included.
+ */
+float4 ranluxcl32(ranluxcl_state_t *rst)
+{
+	float4 out;
+
+	if(rst->stepnr == 0){
+		out.x = ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry));
+		out.y = ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry));
+		out.z = ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry));
+		out.w = ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry));
+		rst->stepnr += 4;
+	}
+
+	else if(rst->stepnr == 4){
+		out.x = ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
+		out.y = ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
+		out.z = ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
+		out.w = ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
+		rst->stepnr += 4;
+	}
+
+	else if(rst->stepnr == 8){
+		out.x = ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
+		out.y = ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
+		out.z = ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
+		out.w = ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
+		rst->stepnr += 4;
+	}
+
+	else if(rst->stepnr == 12){
+		out.x = ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
+		out.y = ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
+		out.z = ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
+		out.w = ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
+		rst->stepnr += 4;
+	}
+
+	else if(rst->stepnr == 16){
+		out.x = ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
+		out.y = ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
+		out.z = ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
+		out.w = ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
+		rst->stepnr += 4;
+	}
+
+	else if(rst->stepnr == 20){
+		out.x = ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry));
+		out.y = ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry));
+		out.z = ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry));
+		out.w = ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry));
+		rst->stepnr = 0;
+
+// The below preprocessor directives are here to recover the simpler planar
+// scheme when nskip is a multiple of 24. For the most general planar shift
+// approach, just ignore all #if's below.
+#ifndef RANLUXCL_PLANAR
+	}
+
+	(*&(rst->in24)) += 4;
+	if((*&(rst->in24)) == 24){
+		(*&(rst->in24)) = 0;
+#endif //RANLUXCL_PLANAR
+
+		int initialskips = (rst->stepnr) ? (24 - rst->stepnr) : 0;
+		int bulkskips = ((RANLUXCL_NSKIP - initialskips)/24) * 24;
+		int remainingskips = RANLUXCL_NSKIP - initialskips - bulkskips;
+
+//We know there won't be any initial skips in the planar scheme
+#ifndef RANLUXCL_PLANAR
+		//Do initial skips (lack of breaks in switch is intentional).
+		switch(initialskips){
+			case(20):
+				ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
+				ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
+				ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
+				ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
+			case(16):
+				ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
+				ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
+				ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
+				ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
+			case(12):
+				ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
+				ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
+				ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
+				ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
+			case(8):
+				ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
+				ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
+				ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
+				ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
+			case(4):
+				ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry));
+				ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry));
+				ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry));
+				ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry));
+		}
+#endif //RANLUXCL_PLANAR
+
+//Also check if we will ever need to skip at all
+#ifndef RANLUXCL_NOSKIP
+		for(int i=0; i<bulkskips/24; i++){
+			ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry));
+			ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry));
+			ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry));
+			ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry));
+			ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
+			ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
+			ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
+			ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
+			ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
+			ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
+			ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
+			ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
+			ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
+			ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
+			ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
+			ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
+			ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
+			ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
+			ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
+			ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
+			ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry));
+			ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry));
+			ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry));
+			ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry));
+		}
+#endif //RANLUXCL_NOSKIP
+
+//There also won't be any remaining skips in the planar scheme
+#ifndef RANLUXCL_PLANAR
+		//Do remaining skips
+		if(remainingskips){
+			ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry));
+			ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry));
+			ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry));
+			ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry));
+
+			if(remainingskips > 4){
+				ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
+				ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
+				ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
+				ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
+			}
+
+			if(remainingskips > 8){
+				ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
+				ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
+				ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
+				ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
+			}
+
+			if(remainingskips > 12){
+				ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
+				ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
+				ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
+				ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
+			}
+
+			if(remainingskips > 16){
+				ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
+				ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
+				ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
+				ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
+			}
+		}
+#endif //RANLUXCL_PLANAR
+
+		// Initial skips brought stepnr down to 0. The bulk skips did only
+		// full cycles. Therefore stepnr is now equal to remainingskips.
+		rst->stepnr = remainingskips;
+	}
+
+	return out;
+}
+
+/*
+ * Perform the necessary operations to set the generator to the "beginning",
+ * i.e., ready to generate 24 numbers before the next skipping sequence. This
+ * is useful if different work-items have called ranluxcl a different number
+ * of times. Since that would lead to out of sync execution on different work-
+ * items it could be rather inefficient on SIMD architectures (like current
+ * GPUs). This function thus allows us to resynchronize execution across work-
+ * items.
+ */
+void ranluxcl_synchronize(ranluxcl_state_t *rst)
+{
+	// Do necessary number of calls to ranluxcl so that stepnr == 0 at the end.
+	if(rst->stepnr == 4)
+		ranluxcl32(rst);
+	if(rst->stepnr == 8)
+		ranluxcl32(rst);
+	if(rst->stepnr == 12)
+		ranluxcl32(rst);
+	if(rst->stepnr == 16)
+		ranluxcl32(rst);
+	if(rst->stepnr == 20)
+		ranluxcl32(rst);
+}
+
+void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
+{
+	ranluxcl_state_t rst;
+
+	#ifdef RANLUXCL_USE_LEGACY_INITIALIZATION
+	//Using legacy initialization from original Fortan 77 implementation
+
+	//ins is scaled so that if the user makes another call somewhere else
+	//with ins + 1 there should be no overlap. Also adding one
+	//allows us to use ins = 0.
+	int k, maxWorkitems;
+
+	#ifdef RANLUXCL_MAXWORKITEMS
+	maxWorkitems = RANLUXCL_MAXWORKITEMS;
+	#else
+	maxWorkitems = RANLUXCL_NUMWORKITEMS;
+	#endif //RANLUXCL_MAXWORKITEMS
+
+	int scaledins = ins * maxWorkitems + 1;
+
+	int js = scaledins + RANLUXCL_MYID;
+
+	//Make sure js is not too small (should really be an error)
+	if(js < 1)
+		js = 1;
+
+	#define IC 2147483563
+	#define ITWO24 16777216
+
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s01=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s02=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s03=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s04=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s05=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s06=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s07=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s08=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s09=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s10=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s11=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s12=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s13=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s14=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s15=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s16=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s17=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s18=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s19=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s20=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s21=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s22=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s23=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
+		rst.s24=(js%ITWO24)*RANLUXCL_TWOM24;
+
+	#undef IC
+	#undef ITWO24
+
+	#else //RANLUXCL_USE_LEGACY_INITIALIZATION
+	//Using default initialization
+
+	#define RANLUXCL_POW2_24 16777216
+	#define RANLUXCL_56 0x00FFFFFFFFFFFFFF
+	#define RANLUXCL_48 0x0000FFFFFFFFFFFF
+	#define RANLUXCL_40 0x000000FFFFFFFFFF
+	#define RANLUXCL_32 0x00000000FFFFFFFF
+	#define RANLUXCL_24 0x0000000000FFFFFF
+	#define RANLUXCL_16 0x000000000000FFFF
+	#define RANLUXCL_8  0x00000000000000FF
+
+	//We scale ins by (2^32)-1. As long as we never use more than (2^32)-1
+	//work-items per NDRange the initial states should never be the same.
+	//We use a simple 64-bit xorshift RNG by George Marsaglia, which has a
+	//period of 2^64 - 1.
+
+	ulong x1, x2, x3;
+	ulong x = (ulong)RANLUXCL_MYID + (ulong)ins * ((ulong)UINT_MAX + 1);
+
+	//Logical shifts used so that all 64 bits of output are used (24 bits
+	//per float), to be certain that all initial states are different.
+	x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x;
+	x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x;
+	x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x;
+	rst.s01 = (float)  (x1 >> 40)
+		/ (float)RANLUXCL_POW2_24;
+	rst.s02 = (float) ((x1 & RANLUXCL_40) >> 16)
+		/ (float)RANLUXCL_POW2_24;
+	rst.s03 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56))
+		/ (float)RANLUXCL_POW2_24;
+	rst.s04 = (float) ((x2 & RANLUXCL_56) >> 32)
+		/ (float)RANLUXCL_POW2_24;
+	rst.s05 = (float) ((x2 & RANLUXCL_32) >> 8)
+		/ (float)RANLUXCL_POW2_24;
+	rst.s06 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48))
+		/ (float)RANLUXCL_POW2_24;
+	rst.s07 = (float) ((x3 & RANLUXCL_48) >> 24)
+		/ (float)RANLUXCL_POW2_24;
+	rst.s08 = (float)  (x3 & RANLUXCL_24)
+		/ (float)RANLUXCL_POW2_24;
+
+	x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x;
+	x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x;
+	x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x;
+	rst.s09 = (float)  (x1 >> 40)
+		/ (float)RANLUXCL_POW2_24;
+	rst.s10 = (float) ((x1 & RANLUXCL_40) >> 16)
+		/ (float)RANLUXCL_POW2_24;
+	rst.s11 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56))
+		/ (float)RANLUXCL_POW2_24;
+	rst.s12 = (float) ((x2 & RANLUXCL_56) >> 32)
+		/ (float)RANLUXCL_POW2_24;
+	rst.s13 = (float) ((x2 & RANLUXCL_32) >> 8)
+		/ (float)RANLUXCL_POW2_24;
+	rst.s14 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48))
+		/ (float)RANLUXCL_POW2_24;
+	rst.s15 = (float) ((x3 & RANLUXCL_48) >> 24)
+		/ (float)RANLUXCL_POW2_24;
+	rst.s16 = (float)  (x3 & RANLUXCL_24)
+		/ (float)RANLUXCL_POW2_24;
+
+	x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x;
+	x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x;
+	x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x;
+	rst.s17 = (float)  (x1 >> 40)
+		/ (float)RANLUXCL_POW2_24;
+	rst.s18 = (float) ((x1 & RANLUXCL_40) >> 16)
+		/ (float)RANLUXCL_POW2_24;
+	rst.s19 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56))
+		/ (float)RANLUXCL_POW2_24;
+	rst.s20 = (float) ((x2 & RANLUXCL_56) >> 32)
+		/ (float)RANLUXCL_POW2_24;
+	rst.s21 = (float) ((x2 & RANLUXCL_32) >> 8)
+		/ (float)RANLUXCL_POW2_24;
+	rst.s22 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48))
+		/ (float)RANLUXCL_POW2_24;
+	rst.s23 = (float) ((x3 & RANLUXCL_48) >> 24)
+		/ (float)RANLUXCL_POW2_24;
+	rst.s24 = (float)  (x3 & RANLUXCL_24)
+		/ (float)RANLUXCL_POW2_24;
+
+	#undef RANLUXCL_POW2_24
+	#undef RANLUXCL_56
+	#undef RANLUXCL_48
+	#undef RANLUXCL_40
+	#undef RANLUXCL_32
+	#undef RANLUXCL_24
+	#undef RANLUXCL_16
+	#undef RANLUXCL_8
+
+	#endif //RANLUXCL_USE_LEGACY_INITIALIZATION
+
+	rst.in24 = 0;
+	rst.stepnr = 0;
+	rst.carry = 0.0f;
+	if(rst.s24 == 0.0f)
+		rst.carry = RANLUXCL_TWOM24;
+
+	#ifndef RANLUXCL_NO_WARMUP
+	//Warming up the generator, ensuring there are no initial correlations.
+	//16 is a "magic number". It is the number of times we must generate
+	//a batch of 24 numbers to ensure complete decorrelation.
+	for(int i=0; i<16; i++){
+		ranluxcl_os(rst.s09, rst.s10, &(rst.s24), &(rst.carry));
+		ranluxcl_os(rst.s08, rst.s09, &(rst.s23), &(rst.carry));
+		ranluxcl_os(rst.s07, rst.s08, &(rst.s22), &(rst.carry));
+		ranluxcl_os(rst.s06, rst.s07, &(rst.s21), &(rst.carry));
+		ranluxcl_os(rst.s05, rst.s06, &(rst.s20), &(rst.carry));
+		ranluxcl_os(rst.s04, rst.s05, &(rst.s19), &(rst.carry));
+		ranluxcl_os(rst.s03, rst.s04, &(rst.s18), &(rst.carry));
+		ranluxcl_os(rst.s02, rst.s03, &(rst.s17), &(rst.carry));
+		ranluxcl_os(rst.s01, rst.s02, &(rst.s16), &(rst.carry));
+		ranluxcl_os(rst.s24, rst.s01, &(rst.s15), &(rst.carry));
+		ranluxcl_os(rst.s23, rst.s24, &(rst.s14), &(rst.carry));
+		ranluxcl_os(rst.s22, rst.s23, &(rst.s13), &(rst.carry));
+		ranluxcl_os(rst.s21, rst.s22, &(rst.s12), &(rst.carry));
+		ranluxcl_os(rst.s20, rst.s21, &(rst.s11), &(rst.carry));
+		ranluxcl_os(rst.s19, rst.s20, &(rst.s10), &(rst.carry));
+		ranluxcl_os(rst.s18, rst.s19, &(rst.s09), &(rst.carry));
+		ranluxcl_os(rst.s17, rst.s18, &(rst.s08), &(rst.carry));
+		ranluxcl_os(rst.s16, rst.s17, &(rst.s07), &(rst.carry));
+		ranluxcl_os(rst.s15, rst.s16, &(rst.s06), &(rst.carry));
+		ranluxcl_os(rst.s14, rst.s15, &(rst.s05), &(rst.carry));
+		ranluxcl_os(rst.s13, rst.s14, &(rst.s04), &(rst.carry));
+		ranluxcl_os(rst.s12, rst.s13, &(rst.s03), &(rst.carry));
+		ranluxcl_os(rst.s11, rst.s12, &(rst.s02), &(rst.carry));
+		ranluxcl_os(rst.s10, rst.s11, &(rst.s01), &(rst.carry));
+	}
+	#endif //RANLUXCL_NO_WARMUP
+
+	//Upload the state
+	ranluxcl_upload_seed(&rst, ranluxcltab);
+}
+
+float4 ranluxcl32norm(ranluxcl_state_t *rst)
+{
+	//Returns a vector where each component is a normally
+	//distributed PRN centered on 0, with standard deviation 1.
+
+	//Roll our own since M_PI_F does not exist in OpenCL 1.0.
+	#define RANLUXCL_PI_F 3.1415926535f
+
+	float4 U = ranluxcl32(rst);
+
+	float4 Z;
+	float R, phi;
+
+	R = sqrt(-2 * log(U.x));
+	phi = 2 * RANLUXCL_PI_F * U.y;
+	Z.x = R * cos(phi);
+	Z.y = R * sin(phi);
+
+	R = sqrt(-2 * log(U.z));
+	phi = 2 * RANLUXCL_PI_F * U.w;
+	Z.z = R * cos(phi);
+	Z.w = R * sin(phi);
+
+	return Z;
+
+	#undef RANLUXCL_PI_F
+}
+
+#ifdef RANLUXCL_SUPPORT_DOUBLE
+double4 ranluxcl64(ranluxcl_state_t *rst)
+{
+	double4 out;
+	float4 randvec;
+
+	//We know this value is caused by the never-zero part
+	//of the original algorithm, but we want to allow zero for
+	//the most significant bits in the double precision result.
+	randvec = ranluxcl32(rst);
+	if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
+		randvec.x = 0.0f;
+	if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
+		randvec.z = 0.0f;
+
+	out.x = (double)(randvec.x) + (double)(randvec.y) / 16777216;
+	out.y = (double)(randvec.z) + (double)(randvec.w) / 16777216;
+
+	randvec = ranluxcl32(rst);
+	if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
+		randvec.x = 0.0f;
+	if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
+		randvec.z = 0.0f;
+
+	out.z = (double)(randvec.x) + (double)(randvec.y) / 16777216;
+	out.w = (double)(randvec.z) + (double)(randvec.w) / 16777216;
+
+	return out;
+}
+
+double4 ranluxcl64norm(ranluxcl_state_t *rst)
+{
+	//Returns a vector where each component is a normally
+	//distributed PRN centered on 0, with standard deviation
+	//1.
+
+	double4 U = ranluxcl64(rst);
+
+	double4 Z;
+	double R, phi;
+
+	R = sqrt(-2 * log(U.x));
+	phi = 2 * M_PI * U.y;
+	Z.x = R * cos(phi);
+	Z.y = R * sin(phi);
+
+	R = sqrt(-2 * log(U.z));
+	phi = 2 * M_PI * U.w;
+	Z.z = R * cos(phi);
+	Z.w = R * sin(phi);
+
+	return Z;
+}
+#endif //RANLUXCL_SUPPORT_DOUBLE
+
+#undef RANLUXCL_TWOM24
+#undef RANLUXCL_TWOM12
+#undef RANLUXCL_NUMWORKITEMS
+#undef RANLUXCL_MYID
+#undef RANLUXCL_PLANAR
+#undef RANLUXCL_NOSKIP
+
+#endif //RANLUXCL_CL