diff --git a/pyopencl/array.py b/pyopencl/array.py index c27ea813cacf4232822ee7972efd694f21cef37f..00a634cbe6609888e4820165a95b1f68bb9f4d50 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -144,6 +144,7 @@ def elwise_kernel_runner(kernel_getter): queue = kwargs.pop("queue", None) or repr_ary.queue knl = kernel_getter(*args) + gs, ls = repr_ary.get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, diff --git a/pyopencl/characterize.py b/pyopencl/characterize.py index 6836362e2c9265b6dc8399be1048ccc04e575f1c..b5283f0c7fa3ef49b3544f6aa23f907dc713668d 100644 --- a/pyopencl/characterize.py +++ b/pyopencl/characterize.py @@ -188,7 +188,6 @@ def why_not_local_access_conflict_free(dev, itemsize, gran = local_memory_access_granularity(dev) if itemsize != gran: from warnings import warn - print gran warn("local conflict info might be inaccurate " "for itemsize != %d" % gran, CLCharacterizationWarning) diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py index c0475854f5e3b11888f6cc00be2d28c71de63972..4724d35f2e75af0a13122ae49e4cbdb27fcedb90 100644 --- a/pyopencl/clrandom.py +++ b/pyopencl/clrandom.py @@ -1,3 +1,4 @@ +from __future__ import division import pyopencl as cl import pyopencl.array as cl_array from pyopencl.tools import first_arg_dependent_memoize @@ -11,7 +12,7 @@ import numpy as np class RanluxGenerator(object): def __init__(self, queue, num_work_items, luxury=None, seed=None, no_warmup=False, - use_legacy_init=True, max_work_items=None): + use_legacy_init=False, max_work_items=None): if luxury is None: luxury = 4 @@ -42,8 +43,31 @@ class RanluxGenerator(object): """ % self.generate_settings_defines() prg = cl.Program(queue.context, src).build() + # {{{ compute work group size + + wg_size = prg.init_ranlux.get_work_group_info( + cl.kernel_work_group_info.WORK_GROUP_SIZE, + queue.device) + + while num_work_items % wg_size != 0: + wg_size //= 2 + + import sys + import platform + if ("darwin" in sys.platform + and "Apple" in queue.device.platform.vendor + and platform.mac_ver()[0].startswith("10.7") + and queue.device.type == cl.device_type.CPU): + wg_size = 1 + + self.wg_size = wg_size + + # }}} + self.state = cl_array.empty(queue, (num_work_items, 112), dtype=np.uint8) - prg.init_ranlux(queue, (num_work_items,), None, np.uint32(seed), + self.state.fill(17) + + prg.init_ranlux(queue, (num_work_items,), (self.wg_size,), np.uint32(seed), self.state.data) def generate_settings_defines(self, include_double_pragma=True): @@ -150,7 +174,8 @@ class RanluxGenerator(object): if queue is None: queue = ary.queue - self.get_gen_kernel(ary.dtype, "")(queue, (self.num_work_items,), None, + self.get_gen_kernel(ary.dtype, "")(queue, + (self.num_work_items,), (self.wg_size,), self.state.data, ary.data, ary.size, b-a, a) @@ -167,7 +192,8 @@ class RanluxGenerator(object): if queue is None: queue = ary.queue - self.get_gen_kernel(ary.dtype, "norm")(queue, (self.num_work_items,), None, + self.get_gen_kernel(ary.dtype, "norm")(queue, + (self.num_work_items,), (self.wg_size,), self.state.data, ary.data, ary.size, sigma, mu) def normal(self, *args, **kwargs): diff --git a/src/cl/pyopencl-ranluxcl.cl b/src/cl/pyopencl-ranluxcl.cl index 71606a15c1a0123c0d52d1b8ac90110c811051a4..a0794eef5176e2cc8833a4f1d0689fa6c6bf4976 100644 --- a/src/cl/pyopencl-ranluxcl.cl +++ b/src/cl/pyopencl-ranluxcl.cl @@ -1,798 +1,910 @@ -#ifndef RANLUXCL_CL -#define RANLUXCL_CL - -/**** RANLUXCL v1.3.1 MODIFIED ***************************************************** - -Implements the RANLUX generator of Matrin Luscher, based on the Fortran 77 -implementation by Fred James. This OpenCL code is a complete implementation which -should perfectly replicate the numbers generated by the original Fortran 77 -implementation (if using the legacy initialization routine). - -***** QUICK USAGE DESCRIPTION ****************************************************** - -1. Create an OpenCL buffer with room for at least 28 32-bit variables (112 byte). -I.e., in C/C++: size_t buffSize = numWorkitems * 112; - -2. Pass the buffer and an unsigned integer seed <ins> to a kernel that launches the -ranluxcl_initialization function. The seed <ins> can be any unsigned 32-bit integer, -and must be different on different OpenCL devices/NDRanges to ensure different -sequences. As long as the number of work-items on each device/NDRange is less than -2^32 = 4294967296 all sequences will be different. -An examle initialization kernel would be: - #include "ranluxcl.cl" - kernel void Kernel_Ranluxcl_Init(private uint ins, global ranluxcl_state_t *ranluxcltab){ - ranluxcl_initialization(ins, ranluxcltab); - } - -3. Now the generator is ready for use. Remember to download the seeds first, and -upload them again when done. Example kernel that downloads seeds, generates a float4 -where each component is uniformly distributed between 0 and 1, end points not included, -then uploads the seeds again: - #include "ranluxcl.cl" - kernel void Kernel_Example(global ranluxcl_state_t *ranluxcltab){ - //ranluxclstate is a struct of 7 float4 variables - //storing the state of the generator. - ranluxcl_state_t ranluxclstate; - - //Download state into ranluxclstate struct. - ranluxcl_download_seed(&ranluxclstate, ranluxcltab); - - //Generate a float4 with each component on (0,1), - //end points not included. We can call ranluxcl as many - //times as we like until we upload the state again. - float4 randomnr = ranluxcl32(&ranluxclstate); - - //Upload state again so that we don't get the same - //numbers over again the next time we use ranluxcl. - ranluxcl_upload_seed(&ranluxclstate, ranluxcltab); - } - -***** MACROS *********************************************************************** - -The following macros can optionally be defined: - -RANLUXCL_LUX: -Sets the luxury level of the generator. Should be 0-4, or if it is 24 or larger it -sets the p-value of the generator (generally not needed). If this macro is not set -then lux=4 is the default (highest quality). For many applications the high quality -of lux=4 may not be needed. Indeed if two values (each value having 24 random bits) -are glued together to form a 48-bit value the generator passes all tests in the TestU01 -suite already with lux=2. See "TestU01: A C Library for Empirical Testing of Random -Number Generators" by PIERRE L’ECUYER and RICHARD SIMARD. SWB(224, 10, 24)[24, l] is -RANLUX with two values glued together to create 48-bit numbers, and we see that it -passes all tests already at luxury value 2. - -RANLUXCL_NO_WARMUP: -Turns off the warmup functionality in ranluxcl_initialization. This macro should -generally not be used, since the generators will initially be correlated if it is -defined. The only advantage is that the numbers generated will exactly correspond -to those of the original Fortran 77 implementation. - -RANLUXCL_SUPPORT_DOUBLE: -Enables double precision functions. Please enable the OpenCL double precision -extension yourself, usually by "#pragma OPENCL EXTENSION cl_khr_fp64 : enable". - -RANLUXCL_USE_LEGACY_INITIALIZATION -Uses exactly the same initialization routine as in the original Fortran 77 code, -leading to the same sequences. If using legacy initialization there are some -restrictions on what the seed <ins> can be, and it may also be necessary to define -RANLUXCL_MAXWORKITEMS if several sequences are to be run in parallel. - -RANLUXCL_MAXWORKITEMS: -When RANLUXCL_USE_LEGACY_INITIALIZATION is defined we may need this macro. -If several OpenCL NDRanges will be running in parallel and the parallel sequences should -be different then this macro should have a value equal or larger than the -largest number of work-items in any of the parallel runs. The default is to use the -current global size, so if all NDRanges are of the same size this need not be -defined. - Each parallel instance must also have different seeds <ins>. For example if -we are launching 5120 work-items on GPU1 and 10240 work-items on GPU2 we would use -different seeds for the two generators, and RANLUXCL_MAXWORKITEMS must be defined to -be at least 10240. If GPU1 and GPU2 had the same number of work-items this would not -be necessary. - An underestimate of the highest permissible seed <ins> is given by the smallest of: -(<maxins> = 10^9 / <numWorkitems>) or (<maxins> = 10^9 / RANLUXCL_MAXWORKITEMS). -Please make sure that <ins> is never higher than this since it could cause undetected -problems. For example with 10240 work-items the highest permissible <ins> is about -100 000. - Again note that this is only relevant when using the legacy -initialization function enabled by RANLUXCL_USE_LEGACY_INITIALIZATION. When not using -the legacy initialization this macro is effectively set to a very high value of 2^32-1. - -***** FUNCTIONS: INITIALIZATION **************************************************** - -The initialization function is defined as: -void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab) -Run once at the very beginning. ranluxcltab should be a buffer with space for -112 byte per work-item in the NDRange. <ins> is the seed to the generator. -For a given <ins> each work-item in the NDRange will generate a different sequence. -If more than one NDRange is used in parallel then <ins> must be different for each -NDRange to avoid identical sequences. - -***** FUNCTIONS: SEED UPLOAD/DOWNLOAD ********************************************** - -The following two functions should be launced at the beginning and end of a kernel -that uses ranluxcl to generate numbers, respectively: - -void ranluxcl_download_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab) -Run at the beginning of a kernel to download ranluxcl state data - -void ranluxcl_upload_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab) -Run at the end of a kernel to upload state data - -***** FUNCTIONS: GENERATION AND SYNCHRONIZATION ************************************ - -float4 ranluxcl32(ranluxcl_state_t *rst) -Run to generate a pseudo-random float4 where each component is a number between -0 and 1, end points not included (meaning the number will never be exactly 0 or 1). - -double4 ranluxcl64(ranluxcl_state_t *rst) -Double precision version of the above function. The preprocessor macro -RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available. -This function "glues" together two single-precision numbers to make one double -precision number. Most of the work is still done in single precision, so the -performance will be roughly halved regardless of the double precision performance -of the hardware. - -float4 ranluxcl32norm(ranluxcl_state_t *rst) -Run to generate a pseudo-random float4 where each component is normally distributed -with mean 0 and standard deviation 1. - -double4 ranluxcl64norm(ranluxcl_state_t *rst) -Double precision version of the above function. The preprocessor macro -RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available. - -void ranluxcl_synchronize(ranluxcl_state_t *rst) -Run to synchronize execution in case different work-items have made a different -number of calls to ranluxcl. On SIMD machines this could lead to inefficient execution. -ranluxcl_synchronize allows us to make sure all generators are SIMD-friendly again. Not -needed if all work-items always call ranluxcl the same number of times. - -***** PERFORMANCE ****************************************************************** - -For luxury setting 4, performance on AMD Cypress should be ~4.5*10^9 pseudorandom -values per second, when not downloading values to host memory (i.e. the values are -just generated, but not used for anything in particular). - -***** DESCRIPTION OF THE IMPLEMENTATION ******************************************** - -This code closely follows the original Fortran 77 code (see credit section). Here -the differences (and similarities) between RANLUXCL (this implementation) and the -original RANLUX are discussed. - -The Fortran 77 implementation uses a simple LCG to initialize the generator, and -so the same approach is taken here. If RANLUXCL is initialized with <ins> = 0 as -seed, the first work-item behaves like the original RANLUX with seed equal 1, the -second work-item as if with seed equal 2 and so on. If <ins> = 1 then the first -work-item behaves like the original RANLUX with seed equal to <numWorkitems> + 1, -and so on for higher <ins> so that we never have overlapping sequences. This is -why the RANLUXCL_MAXWORKITEMS macro must be set if we have different NDRanges with -a different number of work-items. - -RANLUX is based on chaos theory, and what we are actually doing when selecting -a luxury value is setting how many values to skip over (causing decorrelation). -The number of values to skip is controlled by the so-called p-value of the -generator. After generating 24 values we skip p - 24 values until again generating -24 values. - -This implementation is somewhat modified from the original fortran implementation -by F. James. Because of the way the OpenCL code is optimized with 4-component -32-bit float vectors, it is most convenient to always throw away some multiple -of 24 values (i.e. p is always a multiple of 24). - -However, there might be some resonances if we always throw away a multiple of -the seeds table size. Therefore the implementation is slightly more intricate -where p can be a multiple of 4 instead, at a cost to performance (only about 10% -lower than the cleaner 24 values approach on AMD Cypress). These two approaches -are termed planar and planar shift respectively. The idea for the planar approach -comes from the following paper: -Vadim Demchik, Pseudo-random number generators for Monte Carlo simulations on -Graphics Processing Units, arXiv:1003.1898v1 [hep-lat] - -Below the p-values for the original reference implementation are listed along with -those of the planar shift implementation. Suggested values for the planar approach -are also presented. When this function is called with RANLUXCL_LUX set to 0-4, the -planar shift values are used. To use the pure planar approach (for some extra -performance with likely undetectable quality decrease), set lux equal to the specific -p-value. - -Luxury setting (RANLUXCL_LUX): 0 1 2 3 4 -Original fortran77 implementation by F. James: 24 48 97 223 389 -Planar (suggested): 24 48 120 240 408 -Planar shift: 24 48 100 224 404 - -Note that levels 0 and 1 are the same as in the original implementation for both -planar and planar shift. Level 4 of planar shift where p=404 is the same as chosen -for luxury level 1 by Martin Luescher for his v3 version of RANLUX. Therefore if -it is considered important to only use "official" values, luxury settings 0, 1 or -4 of planar shift should be used. It is however unlikely that the other values are -bad, they just haven't been as extensively used and tested by others. - -Variable names are generally the same as in the fortran77 implementation, however -because of the way the generator is implemented, the i24 and j24 variables are -no longer needed. - -***** CREDIT *********************************************************************** - -I have been told by Fred James (the coder) that the original Fortran 77 -implementation (which is the subject of the second paper below) is free to use and -share. Therefore I am using the MIT license (below). But most importantly please -always remember to give credit to the two articles by Martin Luscher and Fred James, -describing the generator and the Fortran 77 implementation on which this -implementation is based, respectively: - -Martin Lüscher, A portable high-quality random number generator for lattice -field theory simulations, Computer Physics Communications 79 (1994) 100-110 - -F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom -number generator of Lüscher, Computer Physics Communications 79 (1994) 111-114 - -***** LICENSE ********************************************************************** - -Copyright (c) 2011 Ivar Ursin Nikolaisen - -Permission is hereby granted, free of charge, to any person obtaining a copy of this -software and associated documentation files (the "Software"), to deal in the Software -without restriction, including without limitation the rights to use, copy, modify, -merge, publish, distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to the following -conditions: - -The above copyright notice and this permission notice shall be included in all copies -or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A -PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF -CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE -OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -***************************************************************************************/ - -typedef struct{ - float - seed01, seed02, seed03, seed04, - seed05, seed06, seed07, seed08, - seed09, seed10, seed11, seed12, - seed13, seed14, seed15, seed16, - seed17, seed18, seed19, seed20, - seed21, seed22, seed23, seed24; - float carry; - float dummy; //Causes struct to be a multiple of 128 bits - int in24; - int stepnr; -} ranluxcl_state_t; - -#define RANLUXCL_TWOM24 0.000000059604644775f -#define RANLUXCL_TWOM12 0.000244140625f - -#ifdef RANLUXCL_LUX -#if RANLUXCL_LUX < 0 -#error ranluxcl: lux must be zero or positive. -#endif -#else -#define RANLUXCL_LUX 4 //Default to high quality -#endif //RANLUXCL_LUX - -//Here the luxury values are defined -#if RANLUXCL_LUX == 0 -#define RANLUXCL_NSKIP 0 -#elif RANLUXCL_LUX == 1 -#define RANLUXCL_NSKIP 24 -#elif RANLUXCL_LUX == 2 -#define RANLUXCL_NSKIP 76 -#elif RANLUXCL_LUX == 3 -#define RANLUXCL_NSKIP 200 -#elif RANLUXCL_LUX == 4 -#define RANLUXCL_NSKIP 380 -#else -#define RANLUXCL_NSKIP (RANLUXCL_LUX - 24) -#endif //RANLUXCL_LUX == 0 - -//Check that nskip is a permissible value -#if RANLUXCL_NSKIP % 4 != 0 -#error nskip must be divisible by 4! -#endif -#if RANLUXCL_NSKIP < 24 && RANLUXCL_NSKIP != 0 -#error nskip must be either 0 or >= 24! -#endif -#if RANLUXCL_NSKIP < 0 -#error nskip is negative! -#endif - -//Check if planar scheme is recovered -#if RANLUXCL_NSKIP % 24 == 0 -#define RANLUXCL_PLANAR -#endif - -//Check if we will skip at all -#if RANLUXCL_NSKIP == 0 -#define RANLUXCL_NOSKIP -#endif - -//Single-value global size and id -#define RANLUXCL_NUMWORKITEMS (get_global_size(0) * get_global_size(1) * get_global_size(2)) -#define RANLUXCL_MYID (get_global_id(0) + get_global_id(1) * get_global_size(0) + get_global_id(2) * get_global_size(0) * get_global_size(1)) - -void ranluxcl_download_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab) -{ - (*rst) = ranluxcltab[RANLUXCL_MYID]; -} - -void ranluxcl_upload_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab) -{ - ranluxcltab[RANLUXCL_MYID] = (*rst); -} - -float ranluxcl_onestep(float sj24m1, float sj24, float *si24, float *carry){ - float uni, out; - uni = sj24 - (*si24) - (*carry); - if(uni < 0.0f){ - uni += 1.0f; - (*carry) = RANLUXCL_TWOM24; - } else (*carry) = 0.0f; - out = ((*si24) = uni); - - if(uni < RANLUXCL_TWOM12){ - out += RANLUXCL_TWOM24 * sj24m1; - if(out == 0.0f) out = RANLUXCL_TWOM24 * RANLUXCL_TWOM24; - } - return out; -} - -float4 ranluxcl32(ranluxcl_state_t *rst) -{ - //ranluxcl32 returns a 4-component float vector where each component is uniformly distributed - //between 0-1, end points not included. - - float4 out; - - if((*rst).stepnr == 0){ - out.x = ranluxcl_onestep((*rst).seed09, (*rst).seed10, &((*rst).seed24), &((*rst).carry)); - out.y = ranluxcl_onestep((*rst).seed08, (*rst).seed09, &((*rst).seed23), &((*rst).carry)); - out.z = ranluxcl_onestep((*rst).seed07, (*rst).seed08, &((*rst).seed22), &((*rst).carry)); - out.w = ranluxcl_onestep((*rst).seed06, (*rst).seed07, &((*rst).seed21), &((*rst).carry)); - (*rst).stepnr += 4; - } - - else if((*rst).stepnr == 4){ - out.x = ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry)); - out.y = ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry)); - out.z = ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry)); - out.w = ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry)); - (*rst).stepnr += 4; - } - - else if((*rst).stepnr == 8){ - out.x = ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry)); - out.y = ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry)); - out.z = ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry)); - out.w = ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry)); - (*rst).stepnr += 4; - } - - else if((*rst).stepnr == 12){ - out.x = ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry)); - out.y = ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry)); - out.z = ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry)); - out.w = ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry)); - (*rst).stepnr += 4; - } - - else if((*rst).stepnr == 16){ - out.x = ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry)); - out.y = ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry)); - out.z = ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry)); - out.w = ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry)); - (*rst).stepnr += 4; - } - - else if((*rst).stepnr == 20){ - out.x = ranluxcl_onestep((*rst).seed13, (*rst).seed14, &((*rst).seed04), &((*rst).carry)); - out.y = ranluxcl_onestep((*rst).seed12, (*rst).seed13, &((*rst).seed03), &((*rst).carry)); - out.z = ranluxcl_onestep((*rst).seed11, (*rst).seed12, &((*rst).seed02), &((*rst).carry)); - out.w = ranluxcl_onestep((*rst).seed10, (*rst).seed11, &((*rst).seed01), &((*rst).carry)); - (*rst).stepnr = 0; - -//The below preprocessor directives are here to recover the simpler planar scheme when nskip is a multiple of 24. -//For the most general planar shift approach, just ignore all #if's below. -#ifndef RANLUXCL_PLANAR - } - - (*&((*rst).in24)) += 4; - if((*&((*rst).in24)) == 24){ - (*&((*rst).in24)) = 0; -#endif //RANLUXCL_PLANAR - - int initialskips = ((*rst).stepnr) ? (24 - (*rst).stepnr) : 0; - int bulkskips = ((RANLUXCL_NSKIP - initialskips)/24) * 24; - int remainingskips = RANLUXCL_NSKIP - initialskips - bulkskips; - -//We know there won't be any initial skips in the planar scheme -#ifndef RANLUXCL_PLANAR - //Do initial skips (lack of breaks in switch is intentional). - switch(initialskips){ - case(20): - ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry)); - ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry)); - ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry)); - ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry)); - case(16): - ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry)); - ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry)); - ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry)); - ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry)); - case(12): - ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry)); - ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry)); - ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry)); - ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry)); - case(8): - ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry)); - ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry)); - ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry)); - ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry)); - case(4): - ranluxcl_onestep((*rst).seed13, (*rst).seed14, &((*rst).seed04), &((*rst).carry)); - ranluxcl_onestep((*rst).seed12, (*rst).seed13, &((*rst).seed03), &((*rst).carry)); - ranluxcl_onestep((*rst).seed11, (*rst).seed12, &((*rst).seed02), &((*rst).carry)); - ranluxcl_onestep((*rst).seed10, (*rst).seed11, &((*rst).seed01), &((*rst).carry)); - } -#endif //RANLUXCL_PLANAR - -//Also check if we will ever need to skip at all -#ifndef RANLUXCL_NOSKIP - for(int i=0; i<bulkskips/24; i++){ - ranluxcl_onestep((*rst).seed09, (*rst).seed10, &((*rst).seed24), &((*rst).carry)); - ranluxcl_onestep((*rst).seed08, (*rst).seed09, &((*rst).seed23), &((*rst).carry)); - ranluxcl_onestep((*rst).seed07, (*rst).seed08, &((*rst).seed22), &((*rst).carry)); - ranluxcl_onestep((*rst).seed06, (*rst).seed07, &((*rst).seed21), &((*rst).carry)); - ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry)); - ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry)); - ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry)); - ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry)); - ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry)); - ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry)); - ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry)); - ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry)); - ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry)); - ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry)); - ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry)); - ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry)); - ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry)); - ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry)); - ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry)); - ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry)); - ranluxcl_onestep((*rst).seed13, (*rst).seed14, &((*rst).seed04), &((*rst).carry)); - ranluxcl_onestep((*rst).seed12, (*rst).seed13, &((*rst).seed03), &((*rst).carry)); - ranluxcl_onestep((*rst).seed11, (*rst).seed12, &((*rst).seed02), &((*rst).carry)); - ranluxcl_onestep((*rst).seed10, (*rst).seed11, &((*rst).seed01), &((*rst).carry)); - } -#endif //RANLUXCL_NOSKIP - -//There also won't be any remaining skips in the planar scheme -#ifndef RANLUXCL_PLANAR - //Do remaining skips - if(remainingskips){ - ranluxcl_onestep((*rst).seed09, (*rst).seed10, &((*rst).seed24), &((*rst).carry)); - ranluxcl_onestep((*rst).seed08, (*rst).seed09, &((*rst).seed23), &((*rst).carry)); - ranluxcl_onestep((*rst).seed07, (*rst).seed08, &((*rst).seed22), &((*rst).carry)); - ranluxcl_onestep((*rst).seed06, (*rst).seed07, &((*rst).seed21), &((*rst).carry)); - - if(remainingskips > 4){ - ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry)); - ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry)); - ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry)); - ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry)); - } - - if(remainingskips > 8){ - ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry)); - ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry)); - ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry)); - ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry)); - } - - if(remainingskips > 12){ - ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry)); - ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry)); - ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry)); - ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry)); - } - - if(remainingskips > 16){ - ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry)); - ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry)); - ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry)); - ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry)); - } - } -#endif //RANLUXCL_PLANAR - - //Initial skips brought stepnr down to 0. The bulk skips did only full cycles. - //Therefore stepnr is now equal to remainingskips. - (*rst).stepnr = remainingskips; - } - - return out; -} - -void ranluxcl_synchronize(ranluxcl_state_t *rst){ - //This function generates numbers so that the generator is at the beginning, - //i.e. ready to generate 24 numbers before the next skipping sequence. This is - //useful if different work-items have called ranluxcl a different number of times. - //Since that would lead to out of sync execution it could be rather inefficient on - //SIMD architectures like GPUs. This function thus allows us to resynchronize - //execution across all work-items. - - //Do necessary number of calls to ranluxcl so that stepnr == 0 at the end. - if((*rst).stepnr == 4) - ranluxcl32(rst); - if((*rst).stepnr == 8) - ranluxcl32(rst); - if((*rst).stepnr == 12) - ranluxcl32(rst); - if((*rst).stepnr == 16) - ranluxcl32(rst); - if((*rst).stepnr == 20) - ranluxcl32(rst); -} - -void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab) -{ - ranluxcl_state_t rst; - - #ifdef RANLUXCL_USE_LEGACY_INITIALIZATION - //Using legacy initialization from original Fortan 77 implementation - - //ins is scaled so that if the user makes another call somewhere else - //with ins + 1 there should be no overlap. Also adding one - //allows us to use ins = 0. - int k, maxWorkitems; - - #ifdef RANLUXCL_MAXWORKITEMS - maxWorkitems = RANLUXCL_MAXWORKITEMS; - #else - maxWorkitems = RANLUXCL_NUMWORKITEMS; - #endif //RANLUXCL_MAXWORKITEMS - - int scaledins = ins * maxWorkitems + 1; - - int js = scaledins + RANLUXCL_MYID; - - //Make sure js is not too small (should really be an error) - if(js < 1) - js = 1; - - #define IC 2147483563 - #define ITWO24 16777216 - - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed01=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed02=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed03=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed04=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed05=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed06=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed07=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed08=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed09=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed10=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed11=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed12=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed13=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed14=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed15=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed16=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed17=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed18=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed19=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed20=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed21=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed22=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed23=(js%ITWO24)*RANLUXCL_TWOM24; - k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed24=(js%ITWO24)*RANLUXCL_TWOM24; - - #undef IC - #undef ITWO24 - - #else //RANLUXCL_USE_LEGACY_INITIALIZATION - //Using default initialization - - #define RANLUXCL_POW2_24 16777216 - #define RANLUXCL_56 0x00FFFFFFFFFFFFFF - #define RANLUXCL_48 0x0000FFFFFFFFFFFF - #define RANLUXCL_40 0x000000FFFFFFFFFF - #define RANLUXCL_32 0x00000000FFFFFFFF - #define RANLUXCL_24 0x0000000000FFFFFF - #define RANLUXCL_16 0x000000000000FFFF - #define RANLUXCL_8 0x00000000000000FF - - //We scale ins by (2^32)-1. As long as we never use more than (2^32)-1 work-items per - //NDRange the initial states should never be the same. We use a simple 64-bit LCG from - //the SPRNG library. We choose a single prime b, while in SPRNG for parallel applications - //different b give rise to different sequences in parallel (a feature not used here). - ulong x1, x2, x3; - ulong x = (ulong)RANLUXCL_MYID + (ulong)ins * 4294967295; - ulong a = 2862933555777941757; - ulong b = 3037000493; - - //Logical shifts used so that all 64 bits of LCG output are used (24 bits per float), - //to be certain that all initial states are different. - x1 = x = ((a * x + b) & ULONG_MAX); - x2 = x = ((a * x + b) & ULONG_MAX); - x3 = x = ((a * x + b) & ULONG_MAX); - rst.seed01 = (float) (x1 >> 40) / (float)RANLUXCL_POW2_24; - rst.seed02 = (float) ((x1 & RANLUXCL_40) >> 16) / (float)RANLUXCL_POW2_24; - rst.seed03 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56)) / (float)RANLUXCL_POW2_24; - rst.seed04 = (float) ((x2 & RANLUXCL_56) >> 32) / (float)RANLUXCL_POW2_24; - rst.seed05 = (float) ((x2 & RANLUXCL_32) >> 8) / (float)RANLUXCL_POW2_24; - rst.seed06 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24; - rst.seed07 = (float) ((x3 & RANLUXCL_48) >> 24) / (float)RANLUXCL_POW2_24; - rst.seed08 = (float) (x3 & RANLUXCL_24) / (float)RANLUXCL_POW2_24; - - x1 = x = ((a * x + b) & ULONG_MAX); - x2 = x = ((a * x + b) & ULONG_MAX); - x3 = x = ((a * x + b) & ULONG_MAX); - rst.seed09 = (float) (x1 >> 40) / (float)RANLUXCL_POW2_24; - rst.seed10 = (float) ((x1 & RANLUXCL_40) >> 16) / (float)RANLUXCL_POW2_24; - rst.seed11 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56)) / (float)RANLUXCL_POW2_24; - rst.seed12 = (float) ((x2 & RANLUXCL_56) >> 32) / (float)RANLUXCL_POW2_24; - rst.seed13 = (float) ((x2 & RANLUXCL_32) >> 8) / (float)RANLUXCL_POW2_24; - rst.seed14 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24; - rst.seed15 = (float) ((x3 & RANLUXCL_48) >> 24) / (float)RANLUXCL_POW2_24; - rst.seed16 = (float) (x3 & RANLUXCL_24) / (float)RANLUXCL_POW2_24; - - x1 = x = ((a * x + b) & ULONG_MAX); - x2 = x = ((a * x + b) & ULONG_MAX); - x3 = x = ((a * x + b) & ULONG_MAX); - rst.seed17 = (float) (x1 >> 40) / (float)RANLUXCL_POW2_24; - rst.seed18 = (float) ((x1 & RANLUXCL_40) >> 16) / (float)RANLUXCL_POW2_24; - rst.seed19 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56)) / (float)RANLUXCL_POW2_24; - rst.seed20 = (float) ((x2 & RANLUXCL_56) >> 32) / (float)RANLUXCL_POW2_24; - rst.seed21 = (float) ((x2 & RANLUXCL_32) >> 8) / (float)RANLUXCL_POW2_24; - rst.seed22 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24; - rst.seed23 = (float) ((x3 & RANLUXCL_48) >> 24) / (float)RANLUXCL_POW2_24; - rst.seed24 = (float) (x3 & RANLUXCL_24) / (float)RANLUXCL_POW2_24; - - #undef RANLUXCL_POW2_24 - #undef RANLUXCL_56 - #undef RANLUXCL_48 - #undef RANLUXCL_40 - #undef RANLUXCL_32 - #undef RANLUXCL_24 - #undef RANLUXCL_16 - #undef RANLUXCL_8 - - #endif //RANLUXCL_USE_LEGACY_INITIALIZATION - - rst.in24 = 0; - rst.stepnr = 0; - rst.carry = 0.0f; - if(rst.seed24 == 0.0f) - rst.carry = RANLUXCL_TWOM24; - - #ifndef RANLUXCL_NO_WARMUP - //Warming up the generator, ensuring there are no initial correlations. - //16 is a "magic number". It is the number of times we must generate - //a batch of 24 numbers to ensure complete decorrelation. - for(int i=0; i<16; i++){ - ranluxcl_onestep(rst.seed09, rst.seed10, &(rst.seed24), &(rst.carry)); - ranluxcl_onestep(rst.seed08, rst.seed09, &(rst.seed23), &(rst.carry)); - ranluxcl_onestep(rst.seed07, rst.seed08, &(rst.seed22), &(rst.carry)); - ranluxcl_onestep(rst.seed06, rst.seed07, &(rst.seed21), &(rst.carry)); - ranluxcl_onestep(rst.seed05, rst.seed06, &(rst.seed20), &(rst.carry)); - ranluxcl_onestep(rst.seed04, rst.seed05, &(rst.seed19), &(rst.carry)); - ranluxcl_onestep(rst.seed03, rst.seed04, &(rst.seed18), &(rst.carry)); - ranluxcl_onestep(rst.seed02, rst.seed03, &(rst.seed17), &(rst.carry)); - ranluxcl_onestep(rst.seed01, rst.seed02, &(rst.seed16), &(rst.carry)); - ranluxcl_onestep(rst.seed24, rst.seed01, &(rst.seed15), &(rst.carry)); - ranluxcl_onestep(rst.seed23, rst.seed24, &(rst.seed14), &(rst.carry)); - ranluxcl_onestep(rst.seed22, rst.seed23, &(rst.seed13), &(rst.carry)); - ranluxcl_onestep(rst.seed21, rst.seed22, &(rst.seed12), &(rst.carry)); - ranluxcl_onestep(rst.seed20, rst.seed21, &(rst.seed11), &(rst.carry)); - ranluxcl_onestep(rst.seed19, rst.seed20, &(rst.seed10), &(rst.carry)); - ranluxcl_onestep(rst.seed18, rst.seed19, &(rst.seed09), &(rst.carry)); - ranluxcl_onestep(rst.seed17, rst.seed18, &(rst.seed08), &(rst.carry)); - ranluxcl_onestep(rst.seed16, rst.seed17, &(rst.seed07), &(rst.carry)); - ranluxcl_onestep(rst.seed15, rst.seed16, &(rst.seed06), &(rst.carry)); - ranluxcl_onestep(rst.seed14, rst.seed15, &(rst.seed05), &(rst.carry)); - ranluxcl_onestep(rst.seed13, rst.seed14, &(rst.seed04), &(rst.carry)); - ranluxcl_onestep(rst.seed12, rst.seed13, &(rst.seed03), &(rst.carry)); - ranluxcl_onestep(rst.seed11, rst.seed12, &(rst.seed02), &(rst.carry)); - ranluxcl_onestep(rst.seed10, rst.seed11, &(rst.seed01), &(rst.carry)); - } - #endif //RANLUXCL_NO_WARMUP - - //Upload the state - ranluxcl_upload_seed(&rst, ranluxcltab); -} - -float4 ranluxcl32norm(ranluxcl_state_t *rst) -{ - //Returns a vector where each component is a normally - //distributed PRN centered on 0, with standard deviation - //1. Note: M_PI_F is an OpenCL macro for the value of pi. - //M_PI would be the 64-bit double version. - - float4 U = ranluxcl32(rst); - - float4 Z; - float R, phi; - - R = sqrt(-2 * log(U.x)); - phi = 2 * M_PI_F * U.y; - Z.x = R * cos(phi); - Z.y = R * sin(phi); - - R = sqrt(-2 * log(U.z)); - phi = 2 * M_PI_F * U.w; - Z.z = R * cos(phi); - Z.w = R * sin(phi); - - return Z; -} - -#ifdef RANLUXCL_SUPPORT_DOUBLE -double4 ranluxcl64(ranluxcl_state_t *rst) -{ - double4 out; - float4 randvec; - - //We know this value is caused by the never-zero part - //of the original algorithm, but we want to allow zero for - //the most significant bits in the double precision result. - randvec = ranluxcl32(rst); - if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) - randvec.x = 0.0f; - if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) - randvec.z = 0.0f; - - out.x = (double)(randvec.x) + (double)(randvec.y) / 16777216; - out.y = (double)(randvec.z) + (double)(randvec.w) / 16777216; - - randvec = ranluxcl32(rst); - if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) - randvec.x = 0.0f; - if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) - randvec.z = 0.0f; - - out.z = (double)(randvec.x) + (double)(randvec.y) / 16777216; - out.w = (double)(randvec.z) + (double)(randvec.w) / 16777216; - - return out; -} - -double4 ranluxcl64norm(ranluxcl_state_t *rst) -{ - //Returns a vector where each component is a normally - //distributed PRN centered on 0, with standard deviation - //1. - - double4 U = ranluxcl64(rst); - - double4 Z; - double R, phi; - - R = sqrt(-2 * log(U.x)); - phi = 2 * M_PI * U.y; - Z.x = R * cos(phi); - Z.y = R * sin(phi); - - R = sqrt(-2 * log(U.z)); - phi = 2 * M_PI * U.w; - Z.z = R * cos(phi); - Z.w = R * sin(phi); - - return Z; -} -#endif //RANLUXCL_SUPPORT_DOUBLE - -#undef RANLUXCL_TWOM24 -#undef RANLUXCL_TWOM12 -#undef RANLUXCL_NUMWORKITEMS -#undef RANLUXCL_MYID -#undef RANLUXCL_PLANAR -#undef RANLUXCL_NOSKIP - -#endif //RANLUXCL_CL +#ifndef RANLUXCL_CL +#define RANLUXCL_CL + +/**** RANLUXCL v1.3.1 MODIFIED ************************************************* + +Implements the RANLUX generator of Matrin Luscher, based on the Fortran 77 +implementation by Fred James. This OpenCL code is a complete implementation +which should perfectly replicate the numbers generated by the original Fortran +77 implementation (if using the legacy initialization routine). + +***** QUICK USAGE DESCRIPTION ************************************************** + +1. Create an OpenCL buffer with room for at least 28 32-bit variables +(112 byte). I.e., in C/C++: size_t buffSize = numWorkitems * 112; + +2. Pass the buffer and an unsigned integer seed <ins> to a kernel that launches +the ranluxcl_initialization function. The seed <ins> can be any unsigned 32-bit +integer, and must be different on different OpenCL devices/NDRanges to ensure +different sequences. As long as the number of work-items on each device/NDRange +is less than 2^32 = 4294967296 all sequences will be different. +An examle initialization kernel would be: + #include "ranluxcl.cl" + kernel void Kernel_Ranluxcl_Init(private uint ins, + global ranluxcl_state_t *ranluxcltab) + { + ranluxcl_initialization(ins, ranluxcltab); + } + +3. Now the generator is ready for use. Remember to download the seeds first, +and upload them again when done. Example kernel that downloads seeds, generates +a float4 where each component is uniformly distributed between 0 and 1, end +points not included, then uploads the seeds again: + #include "ranluxcl.cl" + kernel void Kernel_Example(global ranluxcl_state_t *ranluxcltab) + { + //ranluxclstate is a struct of 7 float4 variables + //storing the state of the generator. + ranluxcl_state_t ranluxclstate; + + //Download state into ranluxclstate struct. + ranluxcl_download_seed(&ranluxclstate, ranluxcltab); + + //Generate a float4 with each component on (0,1), + //end points not included. We can call ranluxcl as many + //times as we like until we upload the state again. + float4 randomnr = ranluxcl32(&ranluxclstate); + + //Upload state again so that we don't get the same + //numbers over again the next time we use ranluxcl. + ranluxcl_upload_seed(&ranluxclstate, ranluxcltab); + } + +***** MACROS ******************************************************************* + +The following macros can optionally be defined: + +RANLUXCL_LUX: +Sets the luxury level of the generator. Should be 0-4, or if it is 24 or larger +it sets the p-value of the generator (generally not needed). If this macro is +not set then lux=4 is the default (highest quality). For many applications the +high quality of lux=4 may not be needed. Indeed if two values (each value +having 24 random bits) are glued together to form a 48-bit value the generator +passes all tests in the TestU01 suite already with lux=2. See +"TestU01: A C Library for Empirical Testing of Random Number Generators" by +PIERRE LÆECUYER and RICHARD SIMARD. SWB(224, 10, 24)[24, l] is RANLUX with +two values glued together to create 48-bit numbers, and we see that it passes +all tests already at luxury value 2. + +RANLUXCL_NO_WARMUP: +Turns off the warmup functionality in ranluxcl_initialization. This macro +should generally not be used, since the generators will initially be correlated +if it is defined. The only advantage is that the numbers generated will exactly +correspond to those of the original Fortran 77 implementation. + +RANLUXCL_SUPPORT_DOUBLE: +Enables double precision functions. Please enable the OpenCL double precision +extension yourself, usually by "#pragma OPENCL EXTENSION cl_khr_fp64 : enable". + +RANLUXCL_USE_LEGACY_INITIALIZATION +Uses exactly the same initialization routine as in the original Fortran 77 code, +leading to the same sequences. If using legacy initialization there are some +restrictions on what the seed <ins> can be, and it may also be necessary to +define RANLUXCL_MAXWORKITEMS if several sequences are to be run in parallel. + +RANLUXCL_MAXWORKITEMS: +When RANLUXCL_USE_LEGACY_INITIALIZATION is defined we may need this macro. +If several OpenCL NDRanges will be running in parallel and the parallel +sequences should be different then this macro should have a value equal or +larger than the +largest number of work-items in any of the parallel runs. The default is to +use the current global size, so if all NDRanges are of the same size this need +not be defined. + Each parallel instance must also have different seeds <ins>. For example if +we are launching 5120 work-items on GPU1 and 10240 work-items on GPU2 we would +use different seeds for the two generators, and RANLUXCL_MAXWORKITEMS must be +defined to be at least 10240. If GPU1 and GPU2 had the same number of work-items +this would not be necessary. + An underestimate of the highest permissible seed <ins> is given by the +smallest of: +(<maxins> = 10^9 / <numWorkitems>) or (<maxins> = 10^9 / RANLUXCL_MAXWORKITEMS). +Please make sure that <ins> is never higher than this since it could cause +undetected problems. For example with 10240 work-items the highest permissible +<ins> is about 100 000. + Again note that this is only relevant when using the legacy initialization +function enabled by RANLUXCL_USE_LEGACY_INITIALIZATION. When not using the +legacy initialization this macro is effectively set to a very high value of +2^32-1. + +***** FUNCTIONS: INITIALIZATION ************************************************ + +The initialization function is defined as: +void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab) +Run once at the very beginning. ranluxcltab should be a buffer with space for +112 byte per work-item in the NDRange. <ins> is the seed to the generator. +For a given <ins> each work-item in the NDRange will generate a different +sequence. If more than one NDRange is used in parallel then <ins> must be +different for each NDRange to avoid identical sequences. + +***** FUNCTIONS: SEED UPLOAD/DOWNLOAD ****************************************** + +The following two functions should be launced at the beginning and end of a +kernel that uses ranluxcl to generate numbers, respectively: + +void ranluxcl_download_seed(ranluxcl_state_t *rst, + global ranluxcl_state_t *ranluxcltab) +Run at the beginning of a kernel to download ranluxcl state data + +void ranluxcl_upload_seed(ranluxcl_state_t *rst, + global ranluxcl_state_t *ranluxcltab) +Run at the end of a kernel to upload state data + +***** FUNCTIONS: GENERATION AND SYNCHRONIZATION ******************************** + +float4 ranluxcl32(ranluxcl_state_t *rst) +Run to generate a pseudo-random float4 where each component is a number between +0 and 1, end points not included (meaning the number will never be exactly 0 or +1). + +double4 ranluxcl64(ranluxcl_state_t *rst) +Double precision version of the above function. The preprocessor macro +RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available. +This function "glues" together two single-precision numbers to make one double +precision number. Most of the work is still done in single precision, so the +performance will be roughly halved regardless of the double precision +performance of the hardware. + +float4 ranluxcl32norm(ranluxcl_state_t *rst) +Run to generate a pseudo-random float4 where each component is normally +distributed with mean 0 and standard deviation 1. + +double4 ranluxcl64norm(ranluxcl_state_t *rst) +Double precision version of the above function. The preprocessor macro +RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available. + +void ranluxcl_synchronize(ranluxcl_state_t *rst) +Run to synchronize execution in case different work-items have made a different +number of calls to ranluxcl. On SIMD machines this could lead to inefficient +execution. ranluxcl_synchronize allows us to make sure all generators are +SIMD-friendly again. Not needed if all work-items always call ranluxcl the same +number of times. + +***** PERFORMANCE ************************************************************** + +For luxury setting 4, performance on AMD Cypress should be ~4.5*10^9 pseudo- +random values per second, when not downloading values to host memory (i.e. the +values are just generated, but not used for anything in particular). + +***** DESCRIPTION OF THE IMPLEMENTATION **************************************** + +This code closely follows the original Fortran 77 code (see credit section). +Here the differences (and similarities) between RANLUXCL (this implementation) +and the original RANLUX are discussed. + +The Fortran 77 implementation uses a simple LCG to initialize the generator, and +so the same approach is taken here. If RANLUXCL is initialized with <ins> = 0 as +seed, the first work-item behaves like the original RANLUX with seed equal 1, +the second work-item as if with seed equal 2 and so on. If <ins> = 1 then the +first work-item behaves like the original RANLUX with seed equal to +<numWorkitems> + 1, and so on for higher <ins> so that we never have overlapping +sequences. This is why the RANLUXCL_MAXWORKITEMS macro must be set if we have +different NDRanges with a different number of work-items. + +RANLUX is based on chaos theory, and what we are actually doing when selecting +a luxury value is setting how many values to skip over (causing decorrelation). +The number of values to skip is controlled by the so-called p-value of the +generator. After generating 24 values we skip p - 24 values until again +generating 24 values. + +This implementation is somewhat modified from the original fortran +implementation by F. James. Because of the way the OpenCL code is optimized with +4-component 32-bit float vectors, it is most convenient to always throw away +some multiple of 24 values (i.e. p is always a multiple of 24). + +However, there might be some resonances if we always throw away a multiple of +the seeds table size. Therefore the implementation is slightly more intricate +where p can be a multiple of 4 instead, at a cost to performance (only about 10% +lower than the cleaner 24 values approach on AMD Cypress). These two approaches +are termed planar and planar shift respectively. The idea for the planar +approach comes from the following paper: +Vadim Demchik, Pseudo-random number generators for Monte Carlo simulations on +Graphics Processing Units, arXiv:1003.1898v1 [hep-lat] + +Below the p-values for the original reference implementation are listed along +with those of the planar shift implementation. Suggested values for the planar +approach are also presented. When this function is called with RANLUXCL_LUX +set to 0-4, the planar shift values are used. To use the pure planar approach +(for some extra performance with likely undetectable quality decrease), set lux +equal to the specific p-value. + +Luxury setting (RANLUXCL_LUX): 0 1 2 3 4 +Original fortran77 implementation by F. James: 24 48 97 223 389 +Planar (suggested): 24 48 120 240 408 +Planar shift: 24 48 100 224 404 + +Note that levels 0 and 1 are the same as in the original implementation for both +planar and planar shift. Level 4 of planar shift where p=404 is the same as +chosen for luxury level 1 by Martin Luescher for his v3 version of RANLUX. +Therefore if it is considered important to only use "official" values, luxury +settings 0, 1 or 4 of planar shift should be used. It is however unlikely that +the other values are bad, they just haven't been as extensively used and tested +by others. + +Variable names are generally the same as in the fortran77 implementation, +however because of the way the generator is implemented, the i24 and j24 +variables are no longer needed. + +***** CREDIT ******************************************************************* + +I have been told by Fred James (the coder) that the original Fortran 77 +implementation (which is the subject of the second paper below) is free to use +and share. Therefore I am using the MIT license (below). But most importantly +please always remember to give credit to the two articles by Martin Luscher and +Fred James, describing the generator and the Fortran 77 implementation on which +this implementation is based, respectively: + +Martin Lscher, A portable high-quality random number generator for lattice +field theory simulations, Computer Physics Communications 79 (1994) 100-110 + +F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom +number generator of Lscher, Computer Physics Communications 79 (1994) 111-114 + +***** LICENSE ****************************************************************** + +Copyright (c) 2011 Ivar Ursin Nikolaisen + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*******************************************************************************/ + +typedef struct{ + float + s01, s02, s03, s04, + s05, s06, s07, s08, + s09, s10, s11, s12, + s13, s14, s15, s16, + s17, s18, s19, s20, + s21, s22, s23, s24; + float carry; + float dummy; //Causes struct to be a multiple of 128 bits + int in24; + int stepnr; +} ranluxcl_state_t; + +//Initial prototypes makes Apple's compiler happy +void ranluxcl_download_seed(ranluxcl_state_t *, global ranluxcl_state_t *); +void ranluxcl_upload_seed(ranluxcl_state_t *, global ranluxcl_state_t *); +float ranluxcl_os(float, float, float *, float *); +float4 ranluxcl32(ranluxcl_state_t *); +void ranluxcl_synchronize(ranluxcl_state_t *); +void ranluxcl_initialization(uint, global ranluxcl_state_t *); +float4 ranluxcl32norm(ranluxcl_state_t *); + +#ifdef RANLUXCL_SUPPORT_DOUBLE +double4 ranluxcl64(ranluxcl_state_t *); +double4 ranluxcl64norm(ranluxcl_state_t *); +#endif + +#define RANLUXCL_TWOM24 0.000000059604644775f +#define RANLUXCL_TWOM12 0.000244140625f + +#ifdef RANLUXCL_LUX +#if RANLUXCL_LUX < 0 +#error ranluxcl: lux must be zero or positive. +#endif +#else +#define RANLUXCL_LUX 4 //Default to high quality +#endif //RANLUXCL_LUX + +//Here the luxury values are defined +#if RANLUXCL_LUX == 0 +#define RANLUXCL_NSKIP 0 +#elif RANLUXCL_LUX == 1 +#define RANLUXCL_NSKIP 24 +#elif RANLUXCL_LUX == 2 +#define RANLUXCL_NSKIP 76 +#elif RANLUXCL_LUX == 3 +#define RANLUXCL_NSKIP 200 +#elif RANLUXCL_LUX == 4 +#define RANLUXCL_NSKIP 380 +#else +#define RANLUXCL_NSKIP (RANLUXCL_LUX - 24) +#endif //RANLUXCL_LUX == 0 + +//Check that nskip is a permissible value +#if RANLUXCL_NSKIP % 4 != 0 +#error nskip must be divisible by 4! +#endif +#if RANLUXCL_NSKIP < 24 && RANLUXCL_NSKIP != 0 +#error nskip must be either 0 or >= 24! +#endif +#if RANLUXCL_NSKIP < 0 +#error nskip is negative! +#endif + +//Check if planar scheme is recovered +#if RANLUXCL_NSKIP % 24 == 0 +#define RANLUXCL_PLANAR +#endif + +//Check if we will skip at all +#if RANLUXCL_NSKIP == 0 +#define RANLUXCL_NOSKIP +#endif + +//Single-value global size and id +#define RANLUXCL_NUMWORKITEMS \ + (get_global_size(0) * get_global_size(1) * get_global_size(2)) +#define RANLUXCL_MYID \ + (get_global_id(0) + get_global_id(1) * get_global_size(0) + \ + get_global_id(2) * get_global_size(0) * get_global_size(1)) + +void ranluxcl_download_seed(ranluxcl_state_t *rst, + global ranluxcl_state_t *ranluxcltab) +{ + (*rst) = ranluxcltab[RANLUXCL_MYID]; +} + +void ranluxcl_upload_seed(ranluxcl_state_t *rst, + global ranluxcl_state_t *ranluxcltab) +{ + global ranluxcl_state_t *myentry = ranluxcltab + RANLUXCL_MYID; + *myentry = *rst; + /* + printf((char const *) "(w) id: (%d/%d,%d/%d,%d/%d,LLL,%d/%d,%d/%d/%d/%d) %d\n", + get_global_id(0), + get_global_size(0), + get_global_id(1), + get_global_size(1), + get_global_id(2), + get_global_size(2), + + get_local_id(0), + get_local_size(0), + get_local_id(1), + get_local_size(1), + get_local_id(2), + get_local_size(2), + RANLUXCL_MYID); + */ + + ranluxcltab[RANLUXCL_MYID] = (*rst); +} + +/* + * Performs one "step" (generates a single value or skip). Only used internally, + * not intended to be called from user code. + */ +float ranluxcl_os(float sj24m1, float sj24, float *si24, float *carry) +{ + float uni, out; + uni = sj24 - (*si24) - (*carry); + if(uni < 0.0f){ + uni += 1.0f; + (*carry) = RANLUXCL_TWOM24; + } else (*carry) = 0.0f; + out = ((*si24) = uni); + + if(uni < RANLUXCL_TWOM12){ + out += RANLUXCL_TWOM24 * sj24m1; + if(out == 0.0f) out = RANLUXCL_TWOM24 * RANLUXCL_TWOM24; + } + return out; +} + +/* + * Return a float4 where each component is a uniformly distributed pseudo- + * random value between 0 and 1, end points not included. + */ +float4 ranluxcl32(ranluxcl_state_t *rst) +{ + float4 out; + + if(rst->stepnr == 0){ + out.x = ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry)); + out.y = ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry)); + out.z = ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry)); + out.w = ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry)); + rst->stepnr += 4; + } + + else if(rst->stepnr == 4){ + out.x = ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry)); + out.y = ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry)); + out.z = ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry)); + out.w = ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry)); + rst->stepnr += 4; + } + + else if(rst->stepnr == 8){ + out.x = ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry)); + out.y = ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry)); + out.z = ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry)); + out.w = ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry)); + rst->stepnr += 4; + } + + else if(rst->stepnr == 12){ + out.x = ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry)); + out.y = ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry)); + out.z = ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry)); + out.w = ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry)); + rst->stepnr += 4; + } + + else if(rst->stepnr == 16){ + out.x = ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry)); + out.y = ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry)); + out.z = ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry)); + out.w = ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry)); + rst->stepnr += 4; + } + + else if(rst->stepnr == 20){ + out.x = ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry)); + out.y = ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry)); + out.z = ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry)); + out.w = ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry)); + rst->stepnr = 0; + +// The below preprocessor directives are here to recover the simpler planar +// scheme when nskip is a multiple of 24. For the most general planar shift +// approach, just ignore all #if's below. +#ifndef RANLUXCL_PLANAR + } + + (*&(rst->in24)) += 4; + if((*&(rst->in24)) == 24){ + (*&(rst->in24)) = 0; +#endif //RANLUXCL_PLANAR + + int initialskips = (rst->stepnr) ? (24 - rst->stepnr) : 0; + int bulkskips = ((RANLUXCL_NSKIP - initialskips)/24) * 24; + int remainingskips = RANLUXCL_NSKIP - initialskips - bulkskips; + +//We know there won't be any initial skips in the planar scheme +#ifndef RANLUXCL_PLANAR + //Do initial skips (lack of breaks in switch is intentional). + switch(initialskips){ + case(20): + ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry)); + ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry)); + ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry)); + ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry)); + case(16): + ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry)); + ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry)); + ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry)); + ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry)); + case(12): + ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry)); + ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry)); + ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry)); + ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry)); + case(8): + ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry)); + ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry)); + ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry)); + ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry)); + case(4): + ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry)); + ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry)); + ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry)); + ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry)); + } +#endif //RANLUXCL_PLANAR + +//Also check if we will ever need to skip at all +#ifndef RANLUXCL_NOSKIP + for(int i=0; i<bulkskips/24; i++){ + ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry)); + ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry)); + ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry)); + ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry)); + ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry)); + ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry)); + ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry)); + ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry)); + ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry)); + ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry)); + ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry)); + ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry)); + ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry)); + ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry)); + ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry)); + ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry)); + ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry)); + ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry)); + ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry)); + ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry)); + ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry)); + ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry)); + ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry)); + ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry)); + } +#endif //RANLUXCL_NOSKIP + +//There also won't be any remaining skips in the planar scheme +#ifndef RANLUXCL_PLANAR + //Do remaining skips + if(remainingskips){ + ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry)); + ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry)); + ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry)); + ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry)); + + if(remainingskips > 4){ + ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry)); + ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry)); + ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry)); + ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry)); + } + + if(remainingskips > 8){ + ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry)); + ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry)); + ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry)); + ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry)); + } + + if(remainingskips > 12){ + ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry)); + ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry)); + ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry)); + ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry)); + } + + if(remainingskips > 16){ + ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry)); + ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry)); + ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry)); + ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry)); + } + } +#endif //RANLUXCL_PLANAR + + // Initial skips brought stepnr down to 0. The bulk skips did only + // full cycles. Therefore stepnr is now equal to remainingskips. + rst->stepnr = remainingskips; + } + + return out; +} + +/* + * Perform the necessary operations to set the generator to the "beginning", + * i.e., ready to generate 24 numbers before the next skipping sequence. This + * is useful if different work-items have called ranluxcl a different number + * of times. Since that would lead to out of sync execution on different work- + * items it could be rather inefficient on SIMD architectures (like current + * GPUs). This function thus allows us to resynchronize execution across work- + * items. + */ +void ranluxcl_synchronize(ranluxcl_state_t *rst) +{ + // Do necessary number of calls to ranluxcl so that stepnr == 0 at the end. + if(rst->stepnr == 4) + ranluxcl32(rst); + if(rst->stepnr == 8) + ranluxcl32(rst); + if(rst->stepnr == 12) + ranluxcl32(rst); + if(rst->stepnr == 16) + ranluxcl32(rst); + if(rst->stepnr == 20) + ranluxcl32(rst); +} + +void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab) +{ + ranluxcl_state_t rst; + + #ifdef RANLUXCL_USE_LEGACY_INITIALIZATION + //Using legacy initialization from original Fortan 77 implementation + + //ins is scaled so that if the user makes another call somewhere else + //with ins + 1 there should be no overlap. Also adding one + //allows us to use ins = 0. + int k, maxWorkitems; + + #ifdef RANLUXCL_MAXWORKITEMS + maxWorkitems = RANLUXCL_MAXWORKITEMS; + #else + maxWorkitems = RANLUXCL_NUMWORKITEMS; + #endif //RANLUXCL_MAXWORKITEMS + + int scaledins = ins * maxWorkitems + 1; + + int js = scaledins + RANLUXCL_MYID; + + //Make sure js is not too small (should really be an error) + if(js < 1) + js = 1; + + #define IC 2147483563 + #define ITWO24 16777216 + + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s01=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s02=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s03=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s04=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s05=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s06=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s07=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s08=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s09=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s10=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s11=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s12=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s13=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s14=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s15=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s16=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s17=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s18=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s19=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s20=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s21=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s22=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s23=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; + rst.s24=(js%ITWO24)*RANLUXCL_TWOM24; + + #undef IC + #undef ITWO24 + + #else //RANLUXCL_USE_LEGACY_INITIALIZATION + //Using default initialization + + #define RANLUXCL_POW2_24 16777216 + #define RANLUXCL_56 0x00FFFFFFFFFFFFFF + #define RANLUXCL_48 0x0000FFFFFFFFFFFF + #define RANLUXCL_40 0x000000FFFFFFFFFF + #define RANLUXCL_32 0x00000000FFFFFFFF + #define RANLUXCL_24 0x0000000000FFFFFF + #define RANLUXCL_16 0x000000000000FFFF + #define RANLUXCL_8 0x00000000000000FF + + //We scale ins by (2^32)-1. As long as we never use more than (2^32)-1 + //work-items per NDRange the initial states should never be the same. + //We use a simple 64-bit xorshift RNG by George Marsaglia, which has a + //period of 2^64 - 1. + + ulong x1, x2, x3; + ulong x = (ulong)RANLUXCL_MYID + (ulong)ins * ((ulong)UINT_MAX + 1); + + //Logical shifts used so that all 64 bits of output are used (24 bits + //per float), to be certain that all initial states are different. + x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x; + x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x; + x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x; + rst.s01 = (float) (x1 >> 40) + / (float)RANLUXCL_POW2_24; + rst.s02 = (float) ((x1 & RANLUXCL_40) >> 16) + / (float)RANLUXCL_POW2_24; + rst.s03 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56)) + / (float)RANLUXCL_POW2_24; + rst.s04 = (float) ((x2 & RANLUXCL_56) >> 32) + / (float)RANLUXCL_POW2_24; + rst.s05 = (float) ((x2 & RANLUXCL_32) >> 8) + / (float)RANLUXCL_POW2_24; + rst.s06 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) + / (float)RANLUXCL_POW2_24; + rst.s07 = (float) ((x3 & RANLUXCL_48) >> 24) + / (float)RANLUXCL_POW2_24; + rst.s08 = (float) (x3 & RANLUXCL_24) + / (float)RANLUXCL_POW2_24; + + x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x; + x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x; + x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x; + rst.s09 = (float) (x1 >> 40) + / (float)RANLUXCL_POW2_24; + rst.s10 = (float) ((x1 & RANLUXCL_40) >> 16) + / (float)RANLUXCL_POW2_24; + rst.s11 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56)) + / (float)RANLUXCL_POW2_24; + rst.s12 = (float) ((x2 & RANLUXCL_56) >> 32) + / (float)RANLUXCL_POW2_24; + rst.s13 = (float) ((x2 & RANLUXCL_32) >> 8) + / (float)RANLUXCL_POW2_24; + rst.s14 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) + / (float)RANLUXCL_POW2_24; + rst.s15 = (float) ((x3 & RANLUXCL_48) >> 24) + / (float)RANLUXCL_POW2_24; + rst.s16 = (float) (x3 & RANLUXCL_24) + / (float)RANLUXCL_POW2_24; + + x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x; + x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x; + x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x; + rst.s17 = (float) (x1 >> 40) + / (float)RANLUXCL_POW2_24; + rst.s18 = (float) ((x1 & RANLUXCL_40) >> 16) + / (float)RANLUXCL_POW2_24; + rst.s19 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56)) + / (float)RANLUXCL_POW2_24; + rst.s20 = (float) ((x2 & RANLUXCL_56) >> 32) + / (float)RANLUXCL_POW2_24; + rst.s21 = (float) ((x2 & RANLUXCL_32) >> 8) + / (float)RANLUXCL_POW2_24; + rst.s22 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) + / (float)RANLUXCL_POW2_24; + rst.s23 = (float) ((x3 & RANLUXCL_48) >> 24) + / (float)RANLUXCL_POW2_24; + rst.s24 = (float) (x3 & RANLUXCL_24) + / (float)RANLUXCL_POW2_24; + + #undef RANLUXCL_POW2_24 + #undef RANLUXCL_56 + #undef RANLUXCL_48 + #undef RANLUXCL_40 + #undef RANLUXCL_32 + #undef RANLUXCL_24 + #undef RANLUXCL_16 + #undef RANLUXCL_8 + + #endif //RANLUXCL_USE_LEGACY_INITIALIZATION + + rst.in24 = 0; + rst.stepnr = 0; + rst.carry = 0.0f; + if(rst.s24 == 0.0f) + rst.carry = RANLUXCL_TWOM24; + + #ifndef RANLUXCL_NO_WARMUP + //Warming up the generator, ensuring there are no initial correlations. + //16 is a "magic number". It is the number of times we must generate + //a batch of 24 numbers to ensure complete decorrelation. + for(int i=0; i<16; i++){ + ranluxcl_os(rst.s09, rst.s10, &(rst.s24), &(rst.carry)); + ranluxcl_os(rst.s08, rst.s09, &(rst.s23), &(rst.carry)); + ranluxcl_os(rst.s07, rst.s08, &(rst.s22), &(rst.carry)); + ranluxcl_os(rst.s06, rst.s07, &(rst.s21), &(rst.carry)); + ranluxcl_os(rst.s05, rst.s06, &(rst.s20), &(rst.carry)); + ranluxcl_os(rst.s04, rst.s05, &(rst.s19), &(rst.carry)); + ranluxcl_os(rst.s03, rst.s04, &(rst.s18), &(rst.carry)); + ranluxcl_os(rst.s02, rst.s03, &(rst.s17), &(rst.carry)); + ranluxcl_os(rst.s01, rst.s02, &(rst.s16), &(rst.carry)); + ranluxcl_os(rst.s24, rst.s01, &(rst.s15), &(rst.carry)); + ranluxcl_os(rst.s23, rst.s24, &(rst.s14), &(rst.carry)); + ranluxcl_os(rst.s22, rst.s23, &(rst.s13), &(rst.carry)); + ranluxcl_os(rst.s21, rst.s22, &(rst.s12), &(rst.carry)); + ranluxcl_os(rst.s20, rst.s21, &(rst.s11), &(rst.carry)); + ranluxcl_os(rst.s19, rst.s20, &(rst.s10), &(rst.carry)); + ranluxcl_os(rst.s18, rst.s19, &(rst.s09), &(rst.carry)); + ranluxcl_os(rst.s17, rst.s18, &(rst.s08), &(rst.carry)); + ranluxcl_os(rst.s16, rst.s17, &(rst.s07), &(rst.carry)); + ranluxcl_os(rst.s15, rst.s16, &(rst.s06), &(rst.carry)); + ranluxcl_os(rst.s14, rst.s15, &(rst.s05), &(rst.carry)); + ranluxcl_os(rst.s13, rst.s14, &(rst.s04), &(rst.carry)); + ranluxcl_os(rst.s12, rst.s13, &(rst.s03), &(rst.carry)); + ranluxcl_os(rst.s11, rst.s12, &(rst.s02), &(rst.carry)); + ranluxcl_os(rst.s10, rst.s11, &(rst.s01), &(rst.carry)); + } + #endif //RANLUXCL_NO_WARMUP + + //Upload the state + ranluxcl_upload_seed(&rst, ranluxcltab); +} + +float4 ranluxcl32norm(ranluxcl_state_t *rst) +{ + //Returns a vector where each component is a normally + //distributed PRN centered on 0, with standard deviation 1. + + //Roll our own since M_PI_F does not exist in OpenCL 1.0. + #define RANLUXCL_PI_F 3.1415926535f + + float4 U = ranluxcl32(rst); + + float4 Z; + float R, phi; + + R = sqrt(-2 * log(U.x)); + phi = 2 * RANLUXCL_PI_F * U.y; + Z.x = R * cos(phi); + Z.y = R * sin(phi); + + R = sqrt(-2 * log(U.z)); + phi = 2 * RANLUXCL_PI_F * U.w; + Z.z = R * cos(phi); + Z.w = R * sin(phi); + + return Z; + + #undef RANLUXCL_PI_F +} + +#ifdef RANLUXCL_SUPPORT_DOUBLE +double4 ranluxcl64(ranluxcl_state_t *rst) +{ + double4 out; + float4 randvec; + + //We know this value is caused by the never-zero part + //of the original algorithm, but we want to allow zero for + //the most significant bits in the double precision result. + randvec = ranluxcl32(rst); + if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) + randvec.x = 0.0f; + if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) + randvec.z = 0.0f; + + out.x = (double)(randvec.x) + (double)(randvec.y) / 16777216; + out.y = (double)(randvec.z) + (double)(randvec.w) / 16777216; + + randvec = ranluxcl32(rst); + if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) + randvec.x = 0.0f; + if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) + randvec.z = 0.0f; + + out.z = (double)(randvec.x) + (double)(randvec.y) / 16777216; + out.w = (double)(randvec.z) + (double)(randvec.w) / 16777216; + + return out; +} + +double4 ranluxcl64norm(ranluxcl_state_t *rst) +{ + //Returns a vector where each component is a normally + //distributed PRN centered on 0, with standard deviation + //1. + + double4 U = ranluxcl64(rst); + + double4 Z; + double R, phi; + + R = sqrt(-2 * log(U.x)); + phi = 2 * M_PI * U.y; + Z.x = R * cos(phi); + Z.y = R * sin(phi); + + R = sqrt(-2 * log(U.z)); + phi = 2 * M_PI * U.w; + Z.z = R * cos(phi); + Z.w = R * sin(phi); + + return Z; +} +#endif //RANLUXCL_SUPPORT_DOUBLE + +#undef RANLUXCL_TWOM24 +#undef RANLUXCL_TWOM12 +#undef RANLUXCL_NUMWORKITEMS +#undef RANLUXCL_MYID +#undef RANLUXCL_PLANAR +#undef RANLUXCL_NOSKIP + +#endif //RANLUXCL_CL