diff --git a/doc/source/array.rst b/doc/source/array.rst index 48525e96e930c7d1a18bbc83f75028844ea7003d..7264d84ea351bcadc93b7c702c328c882b4f6fbe 100644 --- a/doc/source/array.rst +++ b/doc/source/array.rst @@ -361,11 +361,80 @@ Generating Arrays of Random Numbers .. module:: pyopencl.clrandom +.. class:: RanluxGenerator(self, queue, num_work_items, max_work_items, luxury=2, seed=None) + + :param queue: :class:`pyopencl.CommandQueue`, only used for initialization + :param luxury: the "luxury value" of the generator, and should be 0-4, where 0 is fastest + and 4 produces the best numbers. It can also be >=24, in which case it directly + sets the p-value of RANLUXCL. + :param num_work_items: is the number of generators to initialize, usually corresponding + to the number of work-items in the NDRange RANLUXCL will be used with. + :param max_work_items: should reflect the maximum number of work-items that will be used + on any parallel instance of RANLUXCL. So for instance if we are launching 5120 + work-items on GPU1 and 10240 work-items on GPU2, GPU1's RANLUXCLTab would be + generated by calling ranluxcl_intialization with numWorkitems = 5120 while + GPU2's RANLUXCLTab would use numWorkitems = 10240. However maxWorkitems must + be at least 10240 for both GPU1 and GPU2, and it must be set to the same value + for both. + + .. attribute:: state + + A :class:`pyopencl.array.Array` containing the state of the generator. + + .. attribute:: nskip + + nskip is an integer which can (optionally) be defined in the kernel code + as RANLUXCL_NSKIP. If this is done the generator will be faster for luxury setting + 0 and 1, or when the p-value is manually set to a multiple of 24. + + .. method:: fill_uniform(ary, a=0, b=1, queue=None) + + Fill *ary* with uniformly distributed random numbers in the interval + *(a, b)*, endpoints excluded. + + .. method:: uniform(queue, shape, dtype, order="C", allocator=None, base=None, data=None, a=0, b=1) + + Make a new empty array, apply :meth:`fill_uniform` to it. + + .. method:: fill_normal(ary, mu=0, sigma=1, queue=None): + + Fill *ary* with normally distributed numbers with mean *mu* and + standard deviation *sigma*. + + .. method:: normal(queue, shape, dtype, order="C", allocator=None, base=None, data=None, mu=0, sigma=1) + + Make a new empty array, apply :meth:`fill_normal` to it. + + .. method:: synchronize() + + The generator gets inefficient when different work items invoke + the generator a differing number of times. This function + ensures efficiency. + .. function:: rand(queue, shape, dtype) Return an array of `shape` filled with random values of `dtype` in the range [0,1). +PyOpenCL now includes and uses the `RANLUXCL random number generator +`_ by Ivar Ursin Nikolaisen. In +addition to being usable through the convenience functions above, it is +available in any piece of code compiled through PyOpenCL by:: + + #include + +The RANLUX generator is described in the following two articles. If you use the +generator for scientific purposes, please consider citing them: + +* Martin Lüscher, A portable high-quality random number generator for lattice + field theory simulations, `Computer Physics Communications 79 (1994) 100-110 + `_ + +* F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom + number generator of Lüscher, `Computer Physics Communications 79 (1994) 111-114 + `_ + + Single-pass Custom Expression Evaluation ---------------------------------------- diff --git a/doc/source/misc.rst b/doc/source/misc.rst index c69f95ccdede76632397ff54bb1e4fb8a16b216c..dfabd69b7f437f09eb038e2d54982c74d5d08262 100644 --- a/doc/source/misc.rst +++ b/doc/source/misc.rst @@ -79,10 +79,12 @@ Version 2011.2 severe consequences on the execution time of :class:`pyopencl.array.Array` operations. Henrik Andresen at a `PyOpenCL workshop at DTU `_ - first noticed the timings + first noticed the strange timings. * All comparable PyOpenCL objects are now also hashable. * Add :func:`pyopencl.tools.context_dependent_memoize` to the documented functionality. +* Base :mod:`pyopencl.clrandom` on `RANLUXCL `_, + add functionality. Version 2011.1.2 ---------------- @@ -299,6 +301,27 @@ implementation). These parts are licensed as follows: with software licensed exclusively under GPL2. (Most software is licensed as GPL2 or later, in which case this is not an issue.) +PyOpenCL includes the RANLUXCL random number generator: + + Copyright (c) 2011 Ivar Ursin Nikolaisen + + Permission is hereby granted, free of charge, to any person obtaining a copy of this + software and associated documentation files (the "Software"), to deal in the Software + without restriction, including without limitation the rights to use, copy, modify, + merge, publish, distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to the following + conditions: + + The above copyright notice and this permission notice shall be included in all copies + or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A + PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE + OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + Frequently Asked Questions ========================== diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index b1fcb788f240fc8c5b21a409942d464e14f54e00..e689051cf8f800a0b0f47fe392cfc80f8155b6a3 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -368,6 +368,39 @@ _add_functionality() +# {{{ find pyopencl shipped source code + +def _find_pyopencl_include_path(): + from imp import find_module + import sys + file, pathname, descr = find_module("pyopencl") + + # Who knew Python installation is so uniform and predictable? + from os.path import join, exists + possible_include_paths = [ + join(pathname, "..", "include", "pyopencl"), + join(pathname, "..", "src", "cl"), + join(pathname, "..", "..", "..", "src", "cl"), + join(pathname, "..", "..", "..", "..", "include", "pyopencl"), + join(pathname, "..", "..", "..", "include", "pyopencl"), + ] + + if sys.platform in ("linux2", "darwin"): + possible_include_paths.extend([ + join(sys.prefix, "include" , "pyopencl"), + "/usr/include/pyopencl", + "/usr/local/include/pyopencl" + ]) + + for inc_path in possible_include_paths: + if exists(inc_path): + return inc_path + + raise RuntimeError("could not find path to PyOpenCL's CL" + " header files, searched in : %s" + % '\n'.join(possible_include_paths)) + +# }}} # {{{ Program (including caching support) @@ -425,6 +458,11 @@ class Program(object): "info attribute or as a kernel name" % attr) def build(self, options=[], devices=None, cache_dir=None): + if isinstance(options, str): + options = [options] + + options = options + ["-I", _find_pyopencl_include_path()] + if self._prg is not None: self._prg._build(options, devices) else: diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py index 2b13047eef792e23bd023d238f74aa03b604c41f..ab4805b20e85b1ed30dc88c8302573da61aa310d 100644 --- a/pyopencl/clrandom.py +++ b/pyopencl/clrandom.py @@ -1,235 +1,219 @@ import pyopencl as cl import pyopencl.array as cl_array -from pyopencl.tools import context_dependent_memoize - - - - -md5_code = """ -/* - ********************************************************************** - ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. ** - ** ** - ** License to copy and use this software is granted provided that ** - ** it is identified as the "RSA Data Security, Inc. MD5 Message ** - ** Digest Algorithm" in all material mentioning or referencing this ** - ** software or this function. ** - ** ** - ** License is also granted to make and use derivative works ** - ** provided that such works are identified as "derived from the RSA ** - ** Data Security, Inc. MD5 Message Digest Algorithm" in all ** - ** material mentioning or referencing the derived work. ** - ** ** - ** RSA Data Security, Inc. makes no representations concerning ** - ** either the merchantability of this software or the suitability ** - ** of this software for any particular purpose. It is provided "as ** - ** is" without express or implied warranty of any kind. ** - ** ** - ** These notices must be retained in any copies of any part of this ** - ** documentation and/or software. ** - ********************************************************************** - */ - -/* F, G and H are basic MD5 functions: selection, majority, parity */ -#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) -#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) -#define H(x, y, z) ((x) ^ (y) ^ (z)) -#define I(x, y, z) ((y) ^ ((x) | (~z))) - -/* ROTATE_LEFT rotates x left n bits */ -#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) - -/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */ -/* Rotation is separate from addition to prevent recomputation */ -#define FF(a, b, c, d, x, s, ac) \ - {(a) += F ((b), (c), (d)) + (x) + (ac); \ - (a) = ROTATE_LEFT ((a), (s)); \ - (a) += (b); \ - } -#define GG(a, b, c, d, x, s, ac) \ - {(a) += G ((b), (c), (d)) + (x) + (ac); \ - (a) = ROTATE_LEFT ((a), (s)); \ - (a) += (b); \ - } -#define HH(a, b, c, d, x, s, ac) \ - {(a) += H ((b), (c), (d)) + (x) + (ac); \ - (a) = ROTATE_LEFT ((a), (s)); \ - (a) += (b); \ - } -#define II(a, b, c, d, x, s, ac) \ - {(a) += I ((b), (c), (d)) + (x) + (ac); \ - (a) = ROTATE_LEFT ((a), (s)); \ - (a) += (b); \ - } - -#define X0 get_local_id(0) -#define X1 get_local_id(1) -#define X2 get_local_id(2) -#define X3 get_group_id(0) -#define X4 get_group_id(1) -#define X5 get_group_id(2) -#define X6 seed -#define X7 i -#define X8 n -#define X9 get_local_size(0) -#define X10 get_local_size(1) -#define X11 get_local_size(2) -#define X12 get_global_size(0) -#define X13 get_global_size(1) -#define X14 get_global_size(2) -#define X15 0 - - unsigned int a = 0x67452301; - unsigned int b = 0xefcdab89; - unsigned int c = 0x98badcfe; - unsigned int d = 0x10325476; - - /* Round 1 */ -#define S11 7 -#define S12 12 -#define S13 17 -#define S14 22 - FF ( a, b, c, d, X0 , S11, 3614090360); /* 1 */ - FF ( d, a, b, c, X1 , S12, 3905402710); /* 2 */ - FF ( c, d, a, b, X2 , S13, 606105819); /* 3 */ - FF ( b, c, d, a, X3 , S14, 3250441966); /* 4 */ - FF ( a, b, c, d, X4 , S11, 4118548399); /* 5 */ - FF ( d, a, b, c, X5 , S12, 1200080426); /* 6 */ - FF ( c, d, a, b, X6 , S13, 2821735955); /* 7 */ - FF ( b, c, d, a, X7 , S14, 4249261313); /* 8 */ - FF ( a, b, c, d, X8 , S11, 1770035416); /* 9 */ - FF ( d, a, b, c, X9 , S12, 2336552879); /* 10 */ - FF ( c, d, a, b, X10, S13, 4294925233); /* 11 */ - FF ( b, c, d, a, X11, S14, 2304563134); /* 12 */ - FF ( a, b, c, d, X12, S11, 1804603682); /* 13 */ - FF ( d, a, b, c, X13, S12, 4254626195); /* 14 */ - FF ( c, d, a, b, X14, S13, 2792965006); /* 15 */ - FF ( b, c, d, a, X15, S14, 1236535329); /* 16 */ - - /* Round 2 */ -#define S21 5 -#define S22 9 -#define S23 14 -#define S24 20 - GG ( a, b, c, d, X1 , S21, 4129170786); /* 17 */ - GG ( d, a, b, c, X6 , S22, 3225465664); /* 18 */ - GG ( c, d, a, b, X11, S23, 643717713); /* 19 */ - GG ( b, c, d, a, X0 , S24, 3921069994); /* 20 */ - GG ( a, b, c, d, X5 , S21, 3593408605); /* 21 */ - GG ( d, a, b, c, X10, S22, 38016083); /* 22 */ - GG ( c, d, a, b, X15, S23, 3634488961); /* 23 */ - GG ( b, c, d, a, X4 , S24, 3889429448); /* 24 */ - GG ( a, b, c, d, X9 , S21, 568446438); /* 25 */ - GG ( d, a, b, c, X14, S22, 3275163606); /* 26 */ - GG ( c, d, a, b, X3 , S23, 4107603335); /* 27 */ - GG ( b, c, d, a, X8 , S24, 1163531501); /* 28 */ - GG ( a, b, c, d, X13, S21, 2850285829); /* 29 */ - GG ( d, a, b, c, X2 , S22, 4243563512); /* 30 */ - GG ( c, d, a, b, X7 , S23, 1735328473); /* 31 */ - GG ( b, c, d, a, X12, S24, 2368359562); /* 32 */ - - /* Round 3 */ -#define S31 4 -#define S32 11 -#define S33 16 -#define S34 23 - HH ( a, b, c, d, X5 , S31, 4294588738); /* 33 */ - HH ( d, a, b, c, X8 , S32, 2272392833); /* 34 */ - HH ( c, d, a, b, X11, S33, 1839030562); /* 35 */ - HH ( b, c, d, a, X14, S34, 4259657740); /* 36 */ - HH ( a, b, c, d, X1 , S31, 2763975236); /* 37 */ - HH ( d, a, b, c, X4 , S32, 1272893353); /* 38 */ - HH ( c, d, a, b, X7 , S33, 4139469664); /* 39 */ - HH ( b, c, d, a, X10, S34, 3200236656); /* 40 */ - HH ( a, b, c, d, X13, S31, 681279174); /* 41 */ - HH ( d, a, b, c, X0 , S32, 3936430074); /* 42 */ - HH ( c, d, a, b, X3 , S33, 3572445317); /* 43 */ - HH ( b, c, d, a, X6 , S34, 76029189); /* 44 */ - HH ( a, b, c, d, X9 , S31, 3654602809); /* 45 */ - HH ( d, a, b, c, X12, S32, 3873151461); /* 46 */ - HH ( c, d, a, b, X15, S33, 530742520); /* 47 */ - HH ( b, c, d, a, X2 , S34, 3299628645); /* 48 */ - - /* Round 4 */ -#define S41 6 -#define S42 10 -#define S43 15 -#define S44 21 - II ( a, b, c, d, X0 , S41, 4096336452); /* 49 */ - II ( d, a, b, c, X7 , S42, 1126891415); /* 50 */ - II ( c, d, a, b, X14, S43, 2878612391); /* 51 */ - II ( b, c, d, a, X5 , S44, 4237533241); /* 52 */ - II ( a, b, c, d, X12, S41, 1700485571); /* 53 */ - II ( d, a, b, c, X3 , S42, 2399980690); /* 54 */ - II ( c, d, a, b, X10, S43, 4293915773); /* 55 */ - II ( b, c, d, a, X1 , S44, 2240044497); /* 56 */ - II ( a, b, c, d, X8 , S41, 1873313359); /* 57 */ - II ( d, a, b, c, X15, S42, 4264355552); /* 58 */ - II ( c, d, a, b, X6 , S43, 2734768916); /* 59 */ - II ( b, c, d, a, X13, S44, 1309151649); /* 60 */ - II ( a, b, c, d, X4 , S41, 4149444226); /* 61 */ - II ( d, a, b, c, X11, S42, 3174756917); /* 62 */ - II ( c, d, a, b, X2 , S43, 718787259); /* 63 */ - II ( b, c, d, a, X9 , S44, 3951481745); /* 64 */ - - a += 0x67452301; - b += 0xefcdab89; - c += 0x98badcfe; - d += 0x10325476; -""" +from pyopencl.tools import first_arg_dependent_memoize +from pytools import memoize_method import numpy as np -@context_dependent_memoize -def get_rand_kernel(context, dtype): - from pyopencl.elementwise import get_elwise_kernel - if dtype == np.float32: - return get_elwise_kernel(context, - "float *dest, unsigned int seed", - md5_code + """ - #define POW_2_M32 (1/4294967296.0f) - dest[i] = a*POW_2_M32; - if ((i += gsize) < n) - dest[i] = b*POW_2_M32; - if ((i += gsize) < n) - dest[i] = c*POW_2_M32; - if ((i += gsize) < n) - dest[i] = d*POW_2_M32; - """, - "md5_rng_float") - elif dtype == np.float64: - return get_elwise_kernel(context, - "double *dest, unsigned int seed", - md5_code + """ - #define POW_2_M32 (1/4294967296.0) - #define POW_2_M64 (1/18446744073709551616.) - - dest[i] = a*POW_2_M32 + b*POW_2_M64; - - if ((i += gsize) < n) +class RanluxGenerator(object): + def __init__(self, queue, num_work_items, + luxury=4, seed=None, no_warmup=False, + use_legacy_init=False, max_work_items=None): + if seed is None: + from time import time + seed = int(time()*1e6) % 2<<30 + + self.context = queue.context + self.luxury = luxury + self.num_work_items = num_work_items + + from pyopencl.characterize import has_double_support + self.support_double = has_double_support(queue.device) + + self.no_warmup = no_warmup + self.use_legacy_init = use_legacy_init + self.max_work_items = max_work_items + + prg = cl.Program(queue.context, """ + %s + + #include + + kernel void init_ranlux(unsigned seeds, global ranluxcl_state_t *ranluxcltab) + { + ranluxcl_initialization(seeds, ranluxcltab); + } + """ % self.generate_settings_defines()).build() + + self.state = cl_array.empty(queue, (num_work_items, 112), dtype=np.uint8) + prg.init_ranlux(queue, (num_work_items,), None, np.uint32(seed), + self.state.data) + + def generate_settings_defines(self, include_double_pragma=True): + lines = [] + if include_double_pragma and self.support_double: + lines.append("#pragma OPENCL EXTENSION cl_khr_fp64 : enable") + + lines.append("#define RANLUXCL_LUX %d" % self.luxury) + + if self.no_warmup: + lines.append("#define RANLUXCL_NO_WARMUP") + + if self.support_double: + lines.append("#define RANLUXCL_SUPPORT_DOUBLE") + + if self.use_legacy_init: + lines.append("#define RANLUXCL_USE_LEGACY_INITIALIZATION") + + if self.max_work_items: + lines.append("#define RANLUXCL_MAXWORKITEMS %d" % self.max_work_items) + + return "\n".join(lines) + + @memoize_method + def get_gen_kernel(self, dtype, flavor=""): + if dtype == np.float64: + bits = 64 + c_type = "double" + rng_expr = "(shift + scale * gen)" + elif dtype == np.float32: + bits = 32 + c_type = "float" + rng_expr = "(shift + scale * gen)" + elif dtype == np.int32: + assert flavor == "" + bits = 32 + c_type = "int" + rng_expr = ("(shift " + "+ convert_int4(scale * gen) " + "+ convert_int4((scale / (1<<24)) * gen))" + ) + else: + raise TypeError("unsupported RNG data type '%s'" % dtype) + + rl_flavor = "%d%s" % (bits, flavor) + + src = """//CL// + %(defines)s + + #include + + typedef %(output_t)s output_t; + typedef %(output_t)s4 output_vec_t; + #define NUM_WORKITEMS %(num_work_items)d + #define RANLUX_FUNC ranluxcl##%(rlflavor)s + #define GET_RANDOM_NUM(gen) %(rng_expr)s + + kernel void generate( + global ranluxcl_state_t *ranluxcltab, + global output_t *output, + unsigned long out_size, + output_t scale, + output_t shift) { - dest[i] = c*POW_2_M32 + d*POW_2_M64; + ranluxcl_state_t ranluxclstate; + ranluxcl_download_seed(&ranluxclstate, ranluxcltab); + + // output bulk + unsigned long idx = get_global_id(0)*4; + while (idx + 4 < out_size) + { + vstore4(GET_RANDOM_NUM(RANLUX_FUNC(&ranluxclstate)), idx >> 2, output); + idx += 4*NUM_WORKITEMS; + } + + // output tail + output_vec_t tail_ran = GET_RANDOM_NUM(RANLUX_FUNC(&ranluxclstate)); + if (idx < out_size) + output[idx] = tail_ran.x; + if (idx+1 < out_size) + output[idx+1] = tail_ran.y; + if (idx+2 < out_size) + output[idx+2] = tail_ran.z; + if (idx+3 < out_size) + output[idx+2] = tail_ran.w; + + ranluxcl_upload_seed(&ranluxclstate, ranluxcltab); } - """, - "md5_rng_float") - elif dtype in [np.int32, np.uint32]: - return get_elwise_kernel(context, - "unsigned int *dest, unsigned int seed", - md5_code + """ - dest[i] = a; - if ((i += gsize) < n) - dest[i] = b; - if ((i += gsize) < n) - dest[i] = c; - if ((i += gsize) < n) - dest[i] = d; - """, - "md5_rng_int") + """ % { + "defines": self.generate_settings_defines(), + "rlflavor": rl_flavor, + "output_t": c_type, + "num_work_items": self.num_work_items, + "rng_expr": rng_expr + } + + prg = cl.Program(self.context, src).build() + knl = prg.generate + knl.set_scalar_arg_dtypes([None, None, np.uint64, dtype, dtype]) + + return knl + + def fill_uniform(self, ary, a=0, b=1, queue=None): + if queue is None: + queue = ary.queue + + self.get_gen_kernel(ary.dtype, "")(queue, (self.num_work_items,), None, + self.state.data, ary.data, ary.size, + b-a, a) + + def uniform(self, *args, **kwargs): + a = kwargs.pop("a", 0) + b = kwargs.pop("b", 1) + + result = cl_array.empty(*args, **kwargs) + + self.fill_uniform(result, queue=result.queue, a=a, b=b) + return result + + def fill_normal(self, ary, mu=0, sigma=1, queue=None): + if queue is None: + queue = ary.queue + + self.get_gen_kernel(ary.dtype, "norm")(queue, (self.num_work_items,), None, + self.state.data, ary.data, ary.size, sigma, mu) + + def normal(self, *args, **kwargs): + mu = kwargs.pop("mu", 0) + sigma = kwargs.pop("sigma", 1) + + result = cl_array.empty(*args, **kwargs) + + self.fill_normal(result, queue=result.queue, mu=mu, sigma=sigma) + return result + + @memoize_method + def get_sync_kernel(self): + src = """//CL// + %(defines)s + + #include + + kernel void sync( + global ranluxcl_state_t *ranluxcltab) + { + ranluxcl_state_t ranluxclstate; + ranluxcl_download_seed(&ranluxclstate, ranluxcltab); + ranluxcl_synchronize(&ranluxclstate); + ranluxcl_upload_seed(&ranluxclstate, ranluxcltab); + } + """ % { + "defines": self.generate_settings_defines(), + } + prg = cl.Program(self.context, src).build() + return prg.sync + + def synchronize(self, queue): + self.get_sync_kernel()(queue, (self.num_work_items,), None, self.state.data) + + + + + + +@first_arg_dependent_memoize +def _get_generator(queue): + if queue.device.type == cl.device_type.CPU: + num_work_items = 8 * queue.device.max_compute_units else: - raise NotImplementedError + num_work_items = 64 * queue.device.max_compute_units + + gen = RanluxGenerator(queue, num_work_items) + queue.finish() + return gen @@ -245,8 +229,9 @@ def rand(*args, **kwargs): def inner_rand(queue, shape, dtype): from pyopencl.array import Array + gen = _get_generator(queue) result = Array(queue, shape, dtype) - _rand(result, np.random.randint(2**31-1)) + gen.fill_uniform(result) return result if isinstance(args[0], cl.Context): @@ -259,27 +244,7 @@ def rand(*args, **kwargs): return inner_rand(*args, **kwargs) -if __name__ == "__main__": - import sys - ctx = cl.create_some_context() - queue = cl.CommandQueue(ctx) - if "generate" in sys.argv[1:]: - N = 256 - print N, "MB" - r = rand(ctx, queue, (N*2**18,), np.uint32) - print "generated" - r.get().tofile("random.dat") - print "written" - else: - from pylab import plot, show, subplot - N = 250 - r1 = rand(ctx, queue, (N,), np.uint32) - r2 = rand(ctx, queue, (N,), np.int32) - r3 = rand(ctx, queue, (N,), np.float32) - - subplot(131); plot( r1.get(),"x-") - subplot(132); plot( r2.get(),"x-") - subplot(133); plot( r3.get(),"x-") - show() + +# vim: filetype=pyopencl diff --git a/pyopencl/tools.py b/pyopencl/tools.py index fce7f7db12d0ebc8e253b7d42d7f3fbad4fb23df..986eb885d23d50dd436c7bd45623693698c77208 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -43,40 +43,42 @@ MemoryPool = cl.MemoryPool -context_dependent_memoized_functions = [] +first_arg_dependent_memoized_functions = [] @decorator -def context_dependent_memoize(func, context, *args): +def first_arg_dependent_memoize(func, context, *args): """Provides memoization for things that get created inside a context, i.e. mainly programs and kernels. Assumes that the first argument of the decorated function is the context. """ try: - ctx_dict = func._pyopencl_ctx_dep_memoize_dic + ctx_dict = func._pyopencl_first_arg_dep_memoize_dic except AttributeError: # FIXME: This may keep contexts alive longer than desired. # But I guess since the memory in them is freed, who cares. - ctx_dict = func._pyopencl_ctx_dep_memoize_dic = {} + ctx_dict = func._pyopencl_first_arg_dep_memoize_dic = {} try: return ctx_dict[context][args] except KeyError: - context_dependent_memoized_functions.append(func) + first_arg_dependent_memoized_functions.append(func) arg_dict = ctx_dict.setdefault(context, {}) result = func(context, *args) arg_dict[args] = result return result +context_dependent_memoize = first_arg_dependent_memoize + def clear_context_caches(): - for func in context_dependent_memoized_functions: + for func in first_arg_dependent_memoized_functions: try: - ctx_dict = func._pycuda_ctx_dep_memoize_dic + ctx_dict = func._pyopencl_first_arg_dep_memoize_dic except AttributeError: pass else: diff --git a/setup.py b/setup.py index 78a8700184a72b3343fa7d967784d2c25e2cfdf0..27e4c864f5f9e537e7c8e067350ac9cc85cdbc2c 100644 --- a/setup.py +++ b/setup.py @@ -224,6 +224,11 @@ def main(): ), ], + data_files=[ + ("include/pyopencl", + glob.glob("src/cl/*.cl") + glob.glob("src/cl/*.h")) + ], + # 2to3 invocation cmdclass={'build_py': build_py}) diff --git a/src/cl/pyopencl-ranluxcl.cl b/src/cl/pyopencl-ranluxcl.cl new file mode 100644 index 0000000000000000000000000000000000000000..1a1a0e53b0f5d4d1a2036c7f86bd86c691d50ac9 --- /dev/null +++ b/src/cl/pyopencl-ranluxcl.cl @@ -0,0 +1,798 @@ +#ifndef RANLUXCL_CL +#define RANLUXCL_CL + +/**** RANLUXCL v1.3.1 MODIFIED ***************************************************** + +Implements the RANLUX generator of Matrin Luscher, based on the Fortran 77 +implementation by Fred James. This OpenCL code is a complete implementation which +should perfectly replicate the numbers generated by the original Fortran 77 +implementation (if using the legacy initialization routine). + +***** QUICK USAGE DESCRIPTION ****************************************************** + +1. Create an OpenCL buffer with room for at least 28 32-bit variables (112 byte). +I.e., in C/C++: size_t buffSize = numWorkitems * 112; + +2. Pass the buffer and an unsigned integer seed to a kernel that launches the +ranluxcl_initialization function. The seed can be any unsigned 32-bit integer, +and must be different on different OpenCL devices/NDRanges to ensure different +sequences. As long as the number of work-items on each device/NDRange is less than +2^32 = 4294967296 all sequences will be different. +An examle initialization kernel would be: + #include "ranluxcl.cl" + kernel void Kernel_Ranluxcl_Init(private uint ins, global ranluxcl_state_t *ranluxcltab){ + ranluxcl_initialization(ins, ranluxcltab); + } + +3. Now the generator is ready for use. Remember to download the seeds first, and +upload them again when done. Example kernel that downloads seeds, generates a float4 +where each component is uniformly distributed between 0 and 1, end points not included, +then uploads the seeds again: + #include "ranluxcl.cl" + kernel void Kernel_Example(global ranluxcl_state_t *ranluxcltab){ + //ranluxclstate is a struct of 7 float4 variables + //storing the state of the generator. + ranluxcl_state_t ranluxclstate; + + //Download state into ranluxclstate struct. + ranluxcl_download_seed(&ranluxclstate, ranluxcltab); + + //Generate a float4 with each component on (0,1), + //end points not included. We can call ranluxcl as many + //times as we like until we upload the state again. + float4 randomnr = ranluxcl32(&ranluxclstate); + + //Upload state again so that we don't get the same + //numbers over again the next time we use ranluxcl. + ranluxcl_upload_seed(&ranluxclstate, ranluxcltab); + } + +***** MACROS *********************************************************************** + +The following macros can optionally be defined: + +RANLUXCL_LUX: +Sets the luxury level of the generator. Should be 0-4, or if it is 24 or larger it +sets the p-value of the generator (generally not needed). If this macro is not set +then lux=4 is the default (highest quality). For many applications the high quality +of lux=4 may not be needed. Indeed if two values (each value having 24 random bits) +are glued together to form a 48-bit value the generator passes all tests in the TestU01 +suite already with lux=2. See "TestU01: A C Library for Empirical Testing of Random +Number Generators" by PIERRE L’ECUYER and RICHARD SIMARD. SWB(224, 10, 24)[24, l] is +RANLUX with two values glued together to create 48-bit numbers, and we see that it +passes all tests already at luxury value 2. + +RANLUXCL_NO_WARMUP: +Turns off the warmup functionality in ranluxcl_initialization. This macro should +generally not be used, since the generators will initially be correlated if it is +defined. The only advantage is that the numbers generated will exactly correspond +to those of the original Fortran 77 implementation. + +RANLUXCL_SUPPORT_DOUBLE: +Enables double precision functions. Please enable the OpenCL double precision +extension yourself, usually by "#pragma OPENCL EXTENSION cl_khr_fp64 : enable". + +RANLUXCL_USE_LEGACY_INITIALIZATION +Uses exactly the same initialization routine as in the original Fortran 77 code, +leading to the same sequences. If using legacy initialization there are some +restrictions on what the seed can be, and it may also be necessary to define +RANLUXCL_MAXWORKITEMS if several sequences are to be run in parallel. + +RANLUXCL_MAXWORKITEMS: +When RANLUXCL_USE_LEGACY_INITIALIZATION is defined we may need this macro. +If several OpenCL NDRanges will be running in parallel and the parallel sequences should +be different then this macro should have a value equal or larger than the +largest number of work-items in any of the parallel runs. The default is to use the +current global size, so if all NDRanges are of the same size this need not be +defined. + Each parallel instance must also have different seeds . For example if +we are launching 5120 work-items on GPU1 and 10240 work-items on GPU2 we would use +different seeds for the two generators, and RANLUXCL_MAXWORKITEMS must be defined to +be at least 10240. If GPU1 and GPU2 had the same number of work-items this would not +be necessary. + An underestimate of the highest permissible seed is given by the smallest of: +( = 10^9 / ) or ( = 10^9 / RANLUXCL_MAXWORKITEMS). +Please make sure that is never higher than this since it could cause undetected +problems. For example with 10240 work-items the highest permissible is about +100 000. + Again note that this is only relevant when using the legacy +initialization function enabled by RANLUXCL_USE_LEGACY_INITIALIZATION. When not using +the legacy initialization this macro is effectively set to a very high value of 2^32-1. + +***** FUNCTIONS: INITIALIZATION **************************************************** + +The initialization function is defined as: +void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab) +Run once at the very beginning. ranluxcltab should be a buffer with space for +112 byte per work-item in the NDRange. is the seed to the generator. +For a given each work-item in the NDRange will generate a different sequence. +If more than one NDRange is used in parallel then must be different for each +NDRange to avoid identical sequences. + +***** FUNCTIONS: SEED UPLOAD/DOWNLOAD ********************************************** + +The following two functions should be launced at the beginning and end of a kernel +that uses ranluxcl to generate numbers, respectively: + +void ranluxcl_download_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab) +Run at the beginning of a kernel to download ranluxcl state data + +void ranluxcl_upload_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab) +Run at the end of a kernel to upload state data + +***** FUNCTIONS: GENERATION AND SYNCHRONIZATION ************************************ + +float4 ranluxcl32(ranluxcl_state_t *rst) +Run to generate a pseudo-random float4 where each component is a number between +0 and 1, end points not included (meaning the number will never be exactly 0 or 1). + +double4 ranluxcl64(ranluxcl_state_t *rst) +Double precision version of the above function. The preprocessor macro +RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available. +This function "glues" together two single-precision numbers to make one double +precision number. Most of the work is still done in single precision, so the +performance will be roughly halved regardless of the double precision performance +of the hardware. + +float4 ranluxcl32norm(ranluxcl_state_t *rst) +Run to generate a pseudo-random float4 where each component is normally distributed +with mean 0 and standard deviation 1. + +double4 ranluxcl64norm(ranluxcl_state_t *rst) +Double precision version of the above function. The preprocessor macro +RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available. + +void ranluxcl_synchronize(ranluxcl_state_t *rst) +Run to synchronize execution in case different work-items have made a different +number of calls to ranluxcl. On SIMD machines this could lead to inefficient execution. +ranluxcl_synchronize allows us to make sure all generators are SIMD-friendly again. Not +needed if all work-items always call ranluxcl the same number of times. + +***** PERFORMANCE ****************************************************************** + +For luxury setting 4, performance on AMD Cypress should be ~4.5*10^9 pseudorandom +values per second, when not downloading values to host memory (i.e. the values are +just generated, but not used for anything in particular). + +***** DESCRIPTION OF THE IMPLEMENTATION ******************************************** + +This code closely follows the original Fortran 77 code (see credit section). Here +the differences (and similarities) between RANLUXCL (this implementation) and the +original RANLUX are discussed. + +The Fortran 77 implementation uses a simple LCG to initialize the generator, and +so the same approach is taken here. If RANLUXCL is initialized with = 0 as +seed, the first work-item behaves like the original RANLUX with seed equal 1, the +second work-item as if with seed equal 2 and so on. If = 1 then the first +work-item behaves like the original RANLUX with seed equal to + 1, +and so on for higher so that we never have overlapping sequences. This is +why the RANLUXCL_MAXWORKITEMS macro must be set if we have different NDRanges with +a different number of work-items. + +RANLUX is based on chaos theory, and what we are actually doing when selecting +a luxury value is setting how many values to skip over (causing decorrelation). +The number of values to skip is controlled by the so-called p-value of the +generator. After generating 24 values we skip p - 24 values until again generating +24 values. + +This implementation is somewhat modified from the original fortran implementation +by F. James. Because of the way the OpenCL code is optimized with 4-component +32-bit float vectors, it is most convenient to always throw away some multiple +of 24 values (i.e. p is always a multiple of 24). + +However, there might be some resonances if we always throw away a multiple of +the seeds table size. Therefore the implementation is slightly more intricate +where p can be a multiple of 4 instead, at a cost to performance (only about 10% +lower than the cleaner 24 values approach on AMD Cypress). These two approaches +are termed planar and planar shift respectively. The idea for the planar approach +comes from the following paper: +Vadim Demchik, Pseudo-random number generators for Monte Carlo simulations on +Graphics Processing Units, arXiv:1003.1898v1 [hep-lat] + +Below the p-values for the original reference implementation are listed along with +those of the planar shift implementation. Suggested values for the planar approach +are also presented. When this function is called with RANLUXCL_LUX set to 0-4, the +planar shift values are used. To use the pure planar approach (for some extra +performance with likely undetectable quality decrease), set lux equal to the specific +p-value. + +Luxury setting (RANLUXCL_LUX): 0 1 2 3 4 +Original fortran77 implementation by F. James: 24 48 97 223 389 +Planar (suggested): 24 48 120 240 408 +Planar shift: 24 48 100 224 404 + +Note that levels 0 and 1 are the same as in the original implementation for both +planar and planar shift. Level 4 of planar shift where p=404 is the same as chosen +for luxury level 1 by Martin Luescher for his v3 version of RANLUX. Therefore if +it is considered important to only use "official" values, luxury settings 0, 1 or +4 of planar shift should be used. It is however unlikely that the other values are +bad, they just haven't been as extensively used and tested by others. + +Variable names are generally the same as in the fortran77 implementation, however +because of the way the generator is implemented, the i24 and j24 variables are +no longer needed. + +***** CREDIT *********************************************************************** + +I have been told by Fred James (the coder) that the original Fortran 77 +implementation (which is the subject of the second paper below) is free to use and +share. Therefore I am using the MIT license (below). But most importantly please +always remember to give credit to the two articles by Martin Luscher and Fred James, +describing the generator and the Fortran 77 implementation on which this +implementation is based, respectively: + +Martin Lüscher, A portable high-quality random number generator for lattice +field theory simulations, Computer Physics Communications 79 (1994) 100-110 + +F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom +number generator of Lüscher, Computer Physics Communications 79 (1994) 111-114 + +***** LICENSE ********************************************************************** + +Copyright (c) 2011 Ivar Ursin Nikolaisen + +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, copy, modify, +merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be included in all copies +or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +***************************************************************************************/ + +typedef struct{ + float + seed01, seed02, seed03, seed04, + seed05, seed06, seed07, seed08, + seed09, seed10, seed11, seed12, + seed13, seed14, seed15, seed16, + seed17, seed18, seed19, seed20, + seed21, seed22, seed23, seed24; + float carry; + float dummy; //Causes struct to be a multiple of 128 bits + int in24; + int stepnr; +} ranluxcl_state_t; + +#define RANLUXCL_TWOM24 0.000000059604644775f +#define RANLUXCL_TWOM12 0.000244140625f + +#ifdef RANLUXCL_LUX +#if RANLUXCL_LUX < 0 +#error ranluxcl: lux must be zero or positive. +#endif +#else +#define RANLUXCL_LUX 4 //Default to high quality +#endif //RANLUXCL_LUX + +//Here the luxury values are defined +#if RANLUXCL_LUX == 0 +#define RANLUXCL_NSKIP 0 +#elif RANLUXCL_LUX == 1 +#define RANLUXCL_NSKIP 24 +#elif RANLUXCL_LUX == 2 +#define RANLUXCL_NSKIP 76 +#elif RANLUXCL_LUX == 3 +#define RANLUXCL_NSKIP 200 +#elif RANLUXCL_LUX == 4 +#define RANLUXCL_NSKIP 380 +#else +#define RANLUXCL_NSKIP (RANLUXCL_LUX - 24) +#endif //RANLUXCL_LUX == 0 + +//Check that nskip is a permissible value +#if RANLUXCL_NSKIP % 4 != 0 +#error nskip must be divisible by 4! +#endif +#if RANLUXCL_NSKIP < 24 && RANLUXCL_NSKIP != 0 +#error nskip must be either 0 or >= 24! +#endif +#if RANLUXCL_NSKIP < 0 +#error nskip is negative! +#endif + +//Check if planar scheme is recovered +#if RANLUXCL_NSKIP % 24 == 0 +#define RANLUXCL_PLANAR +#endif + +//Check if we will skip at all +#if RANLUXCL_NSKIP == 0 +#define RANLUXCL_NOSKIP +#endif + +//Single-value global size and id +#define RANLUXCL_NUMWORKITEMS (get_global_size(0) * get_global_size(1) * get_global_size(2)) +#define RANLUXCL_MYID (get_global_id(0) + get_global_id(1) * get_global_size(0) + get_global_id(2) * get_global_size(0) * get_global_size(1)) + +void ranluxcl_download_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab) +{ + (*rst) = ranluxcltab[RANLUXCL_MYID]; +} + +void ranluxcl_upload_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab) +{ + ranluxcltab[RANLUXCL_MYID] = (*rst); +} + +float ranluxcl_onestep(float sj24m1, float sj24, float *si24, float *carry){ + float uni, out; + uni = sj24 - (*si24) - (*carry); + if(uni < 0.0f){ + uni += 1.0f; + (*carry) = RANLUXCL_TWOM24; + } else (*carry) = 0.0f; + out = ((*si24) = uni); + + if(uni < RANLUXCL_TWOM12){ + out += RANLUXCL_TWOM24 * sj24m1; + if(out == 0.0f) out = RANLUXCL_TWOM24 * RANLUXCL_TWOM24; + } + return out; +} + +float4 ranluxcl32(ranluxcl_state_t *rst) +{ + //ranluxcl32 returns a 4-component float vector where each component is uniformly distributed + //between 0-1, end points not included. + + float4 out; + + if((*rst).stepnr == 0){ + out.x = ranluxcl_onestep((*rst).seed09, (*rst).seed10, &((*rst).seed24), &((*rst).carry)); + out.y = ranluxcl_onestep((*rst).seed08, (*rst).seed09, &((*rst).seed23), &((*rst).carry)); + out.z = ranluxcl_onestep((*rst).seed07, (*rst).seed08, &((*rst).seed22), &((*rst).carry)); + out.w = ranluxcl_onestep((*rst).seed06, (*rst).seed07, &((*rst).seed21), &((*rst).carry)); + (*rst).stepnr += 4; + } + + else if((*rst).stepnr == 4){ + out.x = ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry)); + out.y = ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry)); + out.z = ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry)); + out.w = ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry)); + (*rst).stepnr += 4; + } + + else if((*rst).stepnr == 8){ + out.x = ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry)); + out.y = ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry)); + out.z = ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry)); + out.w = ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry)); + (*rst).stepnr += 4; + } + + else if((*rst).stepnr == 12){ + out.x = ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry)); + out.y = ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry)); + out.z = ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry)); + out.w = ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry)); + (*rst).stepnr += 4; + } + + else if((*rst).stepnr == 16){ + out.x = ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry)); + out.y = ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry)); + out.z = ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry)); + out.w = ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry)); + (*rst).stepnr += 4; + } + + else if((*rst).stepnr == 20){ + out.x = ranluxcl_onestep((*rst).seed13, (*rst).seed14, &((*rst).seed04), &((*rst).carry)); + out.y = ranluxcl_onestep((*rst).seed12, (*rst).seed13, &((*rst).seed03), &((*rst).carry)); + out.z = ranluxcl_onestep((*rst).seed11, (*rst).seed12, &((*rst).seed02), &((*rst).carry)); + out.w = ranluxcl_onestep((*rst).seed10, (*rst).seed11, &((*rst).seed01), &((*rst).carry)); + (*rst).stepnr = 0; + +//The below preprocessor directives are here to recover the simpler planar scheme when nskip is a multiple of 24. +//For the most general planar shift approach, just ignore all #if's below. +#ifndef RANLUXCL_PLANAR + } + + (*&((*rst).in24)) += 4; + if((*&((*rst).in24)) == 24){ + (*&((*rst).in24)) = 0; +#endif //RANLUXCL_PLANAR + + int initialskips = ((*rst).stepnr) ? (24 - (*rst).stepnr) : 0; + int bulkskips = ((RANLUXCL_NSKIP - initialskips)/24) * 24; + int remainingskips = RANLUXCL_NSKIP - initialskips - bulkskips; + +//We know there won't be any initial skips in the planar scheme +#ifndef RANLUXCL_PLANAR + //Do initial skips (lack of breaks in switch is intentional). + switch(initialskips){ + case(20): + ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry)); + ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry)); + ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry)); + ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry)); + case(16): + ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry)); + ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry)); + ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry)); + ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry)); + case(12): + ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry)); + ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry)); + ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry)); + ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry)); + case(8): + ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry)); + ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry)); + ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry)); + ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry)); + case(4): + ranluxcl_onestep((*rst).seed13, (*rst).seed14, &((*rst).seed04), &((*rst).carry)); + ranluxcl_onestep((*rst).seed12, (*rst).seed13, &((*rst).seed03), &((*rst).carry)); + ranluxcl_onestep((*rst).seed11, (*rst).seed12, &((*rst).seed02), &((*rst).carry)); + ranluxcl_onestep((*rst).seed10, (*rst).seed11, &((*rst).seed01), &((*rst).carry)); + } +#endif //RANLUXCL_PLANAR + +//Also check if we will ever need to skip at all +#ifndef RANLUXCL_NOSKIP + for(int i=0; i 4){ + ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry)); + ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry)); + ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry)); + ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry)); + } + + if(remainingskips > 8){ + ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry)); + ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry)); + ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry)); + ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry)); + } + + if(remainingskips > 12){ + ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry)); + ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry)); + ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry)); + ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry)); + } + + if(remainingskips > 16){ + ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry)); + ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry)); + ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry)); + ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry)); + } + } +#endif //RANLUXCL_PLANAR + + //Initial skips brought stepnr down to 0. The bulk skips did only full cycles. + //Therefore stepnr is now equal to remainingskips. + (*rst).stepnr = remainingskips; + } + + return out; +} + +void ranluxcl_synchronize(ranluxcl_state_t *rst){ + //This function generates numbers so that the generator is at the beginning, + //i.e. ready to generate 24 numbers before the next skipping sequence. This is + //useful if different work-items have called ranluxcl a different number of times. + //Since that would lead to out of sync execution it could be rather inefficient on + //SIMD architectures like GPUs. This function thus allows us to resynchronize + //execution across all work-items. + + //Do necessary number of calls to ranluxcl so that stepnr == 0 at the end. + if((*rst).stepnr == 4) + ranluxcl32(rst); + if((*rst).stepnr == 8) + ranluxcl32(rst); + if((*rst).stepnr == 12) + ranluxcl32(rst); + if((*rst).stepnr == 16) + ranluxcl32(rst); + if((*rst).stepnr == 20) + ranluxcl32(rst); +} + +void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab) +{ + ranluxcl_state_t rst; + + #ifdef RANLUXCL_USE_LEGACY_INITIALIZATION + //Using legacy initialization from original Fortan 77 implementation + + //ins is scaled so that if the user makes another call somewhere else + //with ins + 1 there should be no overlap. Also adding one + //allows us to use ins = 0. + int k, maxWorkitems; + + #ifdef RANLUXCL_USE_LEGACY_INITIALIZATION + maxWorkitems = RANLUXCL_USE_LEGACY_INITIALIZATION; + #else + maxWorkitems = RANLUXCL_NUMWORKITEMS; + #endif //RANLUXCL_USE_LEGACY_INITIALIZATION + + int scaledins = ins * maxWorkitems + 1; + + int js = scaledins + RANLUXCL_MYID; + + //Make sure js is not too small (should really be an error) + if(js < 1) + js = 1; + + #define IC 2147483563 + #define ITWO24 16777216 + + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed01=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed02=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed03=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed04=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed05=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed06=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed07=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed08=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed09=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed10=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed11=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed12=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed13=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed14=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed15=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed16=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed17=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed18=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed19=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed20=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed21=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed22=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed23=(js%ITWO24)*RANLUXCL_TWOM24; + k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed24=(js%ITWO24)*RANLUXCL_TWOM24; + + #undef IC + #undef ITWO24 + + #else //RANLUXCL_USE_LEGACY_INITIALIZATION + //Using default initialization + + #define RANLUXCL_POW2_24 16777216 + #define RANLUXCL_56 0x00FFFFFFFFFFFFFF + #define RANLUXCL_48 0x0000FFFFFFFFFFFF + #define RANLUXCL_40 0x000000FFFFFFFFFF + #define RANLUXCL_32 0x00000000FFFFFFFF + #define RANLUXCL_24 0x0000000000FFFFFF + #define RANLUXCL_16 0x000000000000FFFF + #define RANLUXCL_8 0x00000000000000FF + + //We scale ins by (2^32)-1. As long as we never use more than (2^32)-1 work-items per + //NDRange the initial states should never be the same. We use a simple 64-bit LCG from + //the SPRNG library. We choose a single prime b, while in SPRNG for parallel applications + //different b give rise to different sequences in parallel (a feature not used here). + ulong x1, x2, x3; + ulong x = (ulong)RANLUXCL_MYID + (ulong)ins * 4294967295; + ulong a = 2862933555777941757; + ulong b = 3037000493; + + //Logical shifts used so that all 64 bits of LCG output are used (24 bits per float), + //to be certain that all initial states are different. + x1 = x = ((a * x + b) & ULONG_MAX); + x2 = x = ((a * x + b) & ULONG_MAX); + x3 = x = ((a * x + b) & ULONG_MAX); + rst.seed01 = (float) (x1 >> 40) / (float)RANLUXCL_POW2_24; + rst.seed02 = (float) ((x1 & RANLUXCL_40) >> 16) / (float)RANLUXCL_POW2_24; + rst.seed03 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56)) / (float)RANLUXCL_POW2_24; + rst.seed04 = (float) ((x2 & RANLUXCL_56) >> 32) / (float)RANLUXCL_POW2_24; + rst.seed05 = (float) ((x2 & RANLUXCL_32) >> 8) / (float)RANLUXCL_POW2_24; + rst.seed06 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24; + rst.seed07 = (float) ((x3 & RANLUXCL_48) >> 24) / (float)RANLUXCL_POW2_24; + rst.seed08 = (float) (x3 & RANLUXCL_24) / (float)RANLUXCL_POW2_24; + + x1 = x = ((a * x + b) & ULONG_MAX); + x2 = x = ((a * x + b) & ULONG_MAX); + x3 = x = ((a * x + b) & ULONG_MAX); + rst.seed09 = (float) (x1 >> 40) / (float)RANLUXCL_POW2_24; + rst.seed10 = (float) ((x1 & RANLUXCL_40) >> 16) / (float)RANLUXCL_POW2_24; + rst.seed11 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56)) / (float)RANLUXCL_POW2_24; + rst.seed12 = (float) ((x2 & RANLUXCL_56) >> 32) / (float)RANLUXCL_POW2_24; + rst.seed13 = (float) ((x2 & RANLUXCL_32) >> 8) / (float)RANLUXCL_POW2_24; + rst.seed14 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24; + rst.seed15 = (float) ((x3 & RANLUXCL_48) >> 24) / (float)RANLUXCL_POW2_24; + rst.seed16 = (float) (x3 & RANLUXCL_24) / (float)RANLUXCL_POW2_24; + + x1 = x = ((a * x + b) & ULONG_MAX); + x2 = x = ((a * x + b) & ULONG_MAX); + x3 = x = ((a * x + b) & ULONG_MAX); + rst.seed17 = (float) (x1 >> 40) / (float)RANLUXCL_POW2_24; + rst.seed18 = (float) ((x1 & RANLUXCL_40) >> 16) / (float)RANLUXCL_POW2_24; + rst.seed19 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56)) / (float)RANLUXCL_POW2_24; + rst.seed20 = (float) ((x2 & RANLUXCL_56) >> 32) / (float)RANLUXCL_POW2_24; + rst.seed21 = (float) ((x2 & RANLUXCL_32) >> 8) / (float)RANLUXCL_POW2_24; + rst.seed22 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24; + rst.seed23 = (float) ((x3 & RANLUXCL_48) >> 24) / (float)RANLUXCL_POW2_24; + rst.seed24 = (float) (x3 & RANLUXCL_24) / (float)RANLUXCL_POW2_24; + + #undef RANLUXCL_POW2_24 + #undef RANLUXCL_56 + #undef RANLUXCL_48 + #undef RANLUXCL_40 + #undef RANLUXCL_32 + #undef RANLUXCL_24 + #undef RANLUXCL_16 + #undef RANLUXCL_8 + + #endif //RANLUXCL_USE_LEGACY_INITIALIZATION + + rst.in24 = 0; + rst.stepnr = 0; + rst.carry = 0.0f; + if(rst.seed24 == 0.0f) + rst.carry = RANLUXCL_TWOM24; + + #ifndef RANLUXCL_NO_WARMUP + //Warming up the generator, ensuring there are no initial correlations. + //16 is a "magic number". It is the number of times we must generate + //a batch of 24 numbers to ensure complete decorrelation. + for(int i=0; i<16; i++){ + ranluxcl_onestep(rst.seed09, rst.seed10, &(rst.seed24), &(rst.carry)); + ranluxcl_onestep(rst.seed08, rst.seed09, &(rst.seed23), &(rst.carry)); + ranluxcl_onestep(rst.seed07, rst.seed08, &(rst.seed22), &(rst.carry)); + ranluxcl_onestep(rst.seed06, rst.seed07, &(rst.seed21), &(rst.carry)); + ranluxcl_onestep(rst.seed05, rst.seed06, &(rst.seed20), &(rst.carry)); + ranluxcl_onestep(rst.seed04, rst.seed05, &(rst.seed19), &(rst.carry)); + ranluxcl_onestep(rst.seed03, rst.seed04, &(rst.seed18), &(rst.carry)); + ranluxcl_onestep(rst.seed02, rst.seed03, &(rst.seed17), &(rst.carry)); + ranluxcl_onestep(rst.seed01, rst.seed02, &(rst.seed16), &(rst.carry)); + ranluxcl_onestep(rst.seed24, rst.seed01, &(rst.seed15), &(rst.carry)); + ranluxcl_onestep(rst.seed23, rst.seed24, &(rst.seed14), &(rst.carry)); + ranluxcl_onestep(rst.seed22, rst.seed23, &(rst.seed13), &(rst.carry)); + ranluxcl_onestep(rst.seed21, rst.seed22, &(rst.seed12), &(rst.carry)); + ranluxcl_onestep(rst.seed20, rst.seed21, &(rst.seed11), &(rst.carry)); + ranluxcl_onestep(rst.seed19, rst.seed20, &(rst.seed10), &(rst.carry)); + ranluxcl_onestep(rst.seed18, rst.seed19, &(rst.seed09), &(rst.carry)); + ranluxcl_onestep(rst.seed17, rst.seed18, &(rst.seed08), &(rst.carry)); + ranluxcl_onestep(rst.seed16, rst.seed17, &(rst.seed07), &(rst.carry)); + ranluxcl_onestep(rst.seed15, rst.seed16, &(rst.seed06), &(rst.carry)); + ranluxcl_onestep(rst.seed14, rst.seed15, &(rst.seed05), &(rst.carry)); + ranluxcl_onestep(rst.seed13, rst.seed14, &(rst.seed04), &(rst.carry)); + ranluxcl_onestep(rst.seed12, rst.seed13, &(rst.seed03), &(rst.carry)); + ranluxcl_onestep(rst.seed11, rst.seed12, &(rst.seed02), &(rst.carry)); + ranluxcl_onestep(rst.seed10, rst.seed11, &(rst.seed01), &(rst.carry)); + } + #endif //RANLUXCL_NO_WARMUP + + //Upload the state + ranluxcl_upload_seed(&rst, ranluxcltab); +} + +float4 ranluxcl32norm(ranluxcl_state_t *rst) +{ + //Returns a vector where each component is a normally + //distributed PRN centered on 0, with standard deviation + //1. Note: M_PI_F is an OpenCL macro for the value of pi. + //M_PI would be the 64-bit double version. + + float4 U = ranluxcl32(rst); + + float4 Z; + float R, phi; + + R = sqrt(-2 * log(U.x)); + phi = 2 * M_PI_F * U.y; + Z.x = R * cos(phi); + Z.y = R * sin(phi); + + R = sqrt(-2 * log(U.z)); + phi = 2 * M_PI_F * U.w; + Z.z = R * cos(phi); + Z.w = R * sin(phi); + + return Z; +} + +#ifdef RANLUXCL_SUPPORT_DOUBLE +double4 ranluxcl64(ranluxcl_state_t *rst) +{ + double4 out; + float4 randvec; + + //We know this value is caused by the never-zero part + //of the original algorithm, but we want to allow zero for + //the most significant bits in the double precision result. + randvec = ranluxcl32(rst); + if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) + randvec.x = 0.0f; + if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) + randvec.z = 0.0f; + + out.x = (double)(randvec.x) + (double)(randvec.y) / 16777216; + out.y = (double)(randvec.z) + (double)(randvec.w) / 16777216; + + randvec = ranluxcl32(rst); + if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) + randvec.x = 0.0f; + if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) + randvec.z = 0.0f; + + out.z = (double)(randvec.x) + (double)(randvec.y) / 16777216; + out.w = (double)(randvec.z) + (double)(randvec.w) / 16777216; + + return out; +} + +double4 ranluxcl64norm(ranluxcl_state_t *rst) +{ + //Returns a vector where each component is a normally + //distributed PRN centered on 0, with standard deviation + //1. + + double4 U = ranluxcl64(rst); + + double4 Z; + double R, phi; + + R = sqrt(-2 * log(U.x)); + phi = 2 * M_PI * U.y; + Z.x = R * cos(phi); + Z.y = R * sin(phi); + + R = sqrt(-2 * log(U.z)); + phi = 2 * M_PI * U.w; + Z.z = R * cos(phi); + Z.w = R * sin(phi); + + return Z; +} +#endif //RANLUXCL_SUPPORT_DOUBLE + +#undef RANLUXCL_TWOM24 +#undef RANLUXCL_TWOM12 +#undef RANLUXCL_NUMWORKITEMS +#undef RANLUXCL_MYID +#undef RANLUXCL_PLANAR +#undef RANLUXCL_NOSKIP + +#endif //RANLUXCL_CL diff --git a/test/test_array.py b/test/test_array.py index 9854afdc3760e00f17b99bd70de7ed7330a054ab..2a939cedeaaa0a687d5da7329126c61c6f604a36 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -226,18 +226,39 @@ def test_random(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) - from pyopencl.clrandom import rand as clrand + from pyopencl.clrandom import RanluxGenerator if has_double_support(context.devices[0]): dtypes = [np.float32, np.float64] else: dtypes = [np.float32] + gen = RanluxGenerator(queue, 5120) + for dtype in dtypes: - a = clrand(context, queue, (10, 100), dtype=dtype).get() + ran = gen.uniform(queue, (10007,), dtype) + assert (0 < ran.get()).all() + assert (ran.get() < 1).all() + + gen.synchronize(queue) + + ran = gen.uniform(queue, (10007,), dtype, a=4, b=7) + assert (4 < ran.get()).all() + assert (ran.get() < 7).all() + + ran = gen.normal(queue, (10007,), dtype, mu=4, sigma=3) + + dtypes = [np.int32] + for dtype in dtypes: + ran = gen.uniform(queue, (10000007,), dtype, a=200, b=300) + assert (200 <= ran.get()).all() + assert (ran.get() < 300).all() + #from matplotlib import pyplot as pt + #pt.hist(ran.get()) + #pt.show() + + - assert (0 <= a).all() - assert (a < 1).all() @pytools.test.mark_test.opencl