diff --git a/doc/source/array.rst b/doc/source/array.rst
index 48525e96e930c7d1a18bbc83f75028844ea7003d..7264d84ea351bcadc93b7c702c328c882b4f6fbe 100644
--- a/doc/source/array.rst
+++ b/doc/source/array.rst
@@ -361,11 +361,80 @@ Generating Arrays of Random Numbers
.. module:: pyopencl.clrandom
+.. class:: RanluxGenerator(self, queue, num_work_items, max_work_items, luxury=2, seed=None)
+
+ :param queue: :class:`pyopencl.CommandQueue`, only used for initialization
+ :param luxury: the "luxury value" of the generator, and should be 0-4, where 0 is fastest
+ and 4 produces the best numbers. It can also be >=24, in which case it directly
+ sets the p-value of RANLUXCL.
+ :param num_work_items: is the number of generators to initialize, usually corresponding
+ to the number of work-items in the NDRange RANLUXCL will be used with.
+ :param max_work_items: should reflect the maximum number of work-items that will be used
+ on any parallel instance of RANLUXCL. So for instance if we are launching 5120
+ work-items on GPU1 and 10240 work-items on GPU2, GPU1's RANLUXCLTab would be
+ generated by calling ranluxcl_intialization with numWorkitems = 5120 while
+ GPU2's RANLUXCLTab would use numWorkitems = 10240. However maxWorkitems must
+ be at least 10240 for both GPU1 and GPU2, and it must be set to the same value
+ for both.
+
+ .. attribute:: state
+
+ A :class:`pyopencl.array.Array` containing the state of the generator.
+
+ .. attribute:: nskip
+
+ nskip is an integer which can (optionally) be defined in the kernel code
+ as RANLUXCL_NSKIP. If this is done the generator will be faster for luxury setting
+ 0 and 1, or when the p-value is manually set to a multiple of 24.
+
+ .. method:: fill_uniform(ary, a=0, b=1, queue=None)
+
+ Fill *ary* with uniformly distributed random numbers in the interval
+ *(a, b)*, endpoints excluded.
+
+ .. method:: uniform(queue, shape, dtype, order="C", allocator=None, base=None, data=None, a=0, b=1)
+
+ Make a new empty array, apply :meth:`fill_uniform` to it.
+
+ .. method:: fill_normal(ary, mu=0, sigma=1, queue=None):
+
+ Fill *ary* with normally distributed numbers with mean *mu* and
+ standard deviation *sigma*.
+
+ .. method:: normal(queue, shape, dtype, order="C", allocator=None, base=None, data=None, mu=0, sigma=1)
+
+ Make a new empty array, apply :meth:`fill_normal` to it.
+
+ .. method:: synchronize()
+
+ The generator gets inefficient when different work items invoke
+ the generator a differing number of times. This function
+ ensures efficiency.
+
.. function:: rand(queue, shape, dtype)
Return an array of `shape` filled with random values of `dtype`
in the range [0,1).
+PyOpenCL now includes and uses the `RANLUXCL random number generator
+`_ by Ivar Ursin Nikolaisen. In
+addition to being usable through the convenience functions above, it is
+available in any piece of code compiled through PyOpenCL by::
+
+ #include
+
+The RANLUX generator is described in the following two articles. If you use the
+generator for scientific purposes, please consider citing them:
+
+* Martin Lüscher, A portable high-quality random number generator for lattice
+ field theory simulations, `Computer Physics Communications 79 (1994) 100-110
+ `_
+
+* F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom
+ number generator of Lüscher, `Computer Physics Communications 79 (1994) 111-114
+ `_
+
+
Single-pass Custom Expression Evaluation
----------------------------------------
diff --git a/doc/source/misc.rst b/doc/source/misc.rst
index c69f95ccdede76632397ff54bb1e4fb8a16b216c..dfabd69b7f437f09eb038e2d54982c74d5d08262 100644
--- a/doc/source/misc.rst
+++ b/doc/source/misc.rst
@@ -79,10 +79,12 @@ Version 2011.2
severe consequences on the execution time of :class:`pyopencl.array.Array`
operations.
Henrik Andresen at a `PyOpenCL workshop at DTU `_
- first noticed the timings
+ first noticed the strange timings.
* All comparable PyOpenCL objects are now also hashable.
* Add :func:`pyopencl.tools.context_dependent_memoize` to the documented
functionality.
+* Base :mod:`pyopencl.clrandom` on `RANLUXCL `_,
+ add functionality.
Version 2011.1.2
----------------
@@ -299,6 +301,27 @@ implementation). These parts are licensed as follows:
with software licensed exclusively under GPL2. (Most software is licensed
as GPL2 or later, in which case this is not an issue.)
+PyOpenCL includes the RANLUXCL random number generator:
+
+ Copyright (c) 2011 Ivar Ursin Nikolaisen
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this
+ software and associated documentation files (the "Software"), to deal in the Software
+ without restriction, including without limitation the rights to use, copy, modify,
+ merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to the following
+ conditions:
+
+ The above copyright notice and this permission notice shall be included in all copies
+ or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+ CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+ OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
Frequently Asked Questions
==========================
diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index b1fcb788f240fc8c5b21a409942d464e14f54e00..e689051cf8f800a0b0f47fe392cfc80f8155b6a3 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -368,6 +368,39 @@ _add_functionality()
+# {{{ find pyopencl shipped source code
+
+def _find_pyopencl_include_path():
+ from imp import find_module
+ import sys
+ file, pathname, descr = find_module("pyopencl")
+
+ # Who knew Python installation is so uniform and predictable?
+ from os.path import join, exists
+ possible_include_paths = [
+ join(pathname, "..", "include", "pyopencl"),
+ join(pathname, "..", "src", "cl"),
+ join(pathname, "..", "..", "..", "src", "cl"),
+ join(pathname, "..", "..", "..", "..", "include", "pyopencl"),
+ join(pathname, "..", "..", "..", "include", "pyopencl"),
+ ]
+
+ if sys.platform in ("linux2", "darwin"):
+ possible_include_paths.extend([
+ join(sys.prefix, "include" , "pyopencl"),
+ "/usr/include/pyopencl",
+ "/usr/local/include/pyopencl"
+ ])
+
+ for inc_path in possible_include_paths:
+ if exists(inc_path):
+ return inc_path
+
+ raise RuntimeError("could not find path to PyOpenCL's CL"
+ " header files, searched in : %s"
+ % '\n'.join(possible_include_paths))
+
+# }}}
# {{{ Program (including caching support)
@@ -425,6 +458,11 @@ class Program(object):
"info attribute or as a kernel name" % attr)
def build(self, options=[], devices=None, cache_dir=None):
+ if isinstance(options, str):
+ options = [options]
+
+ options = options + ["-I", _find_pyopencl_include_path()]
+
if self._prg is not None:
self._prg._build(options, devices)
else:
diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py
index 2b13047eef792e23bd023d238f74aa03b604c41f..ab4805b20e85b1ed30dc88c8302573da61aa310d 100644
--- a/pyopencl/clrandom.py
+++ b/pyopencl/clrandom.py
@@ -1,235 +1,219 @@
import pyopencl as cl
import pyopencl.array as cl_array
-from pyopencl.tools import context_dependent_memoize
-
-
-
-
-md5_code = """
-/*
- **********************************************************************
- ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. **
- ** **
- ** License to copy and use this software is granted provided that **
- ** it is identified as the "RSA Data Security, Inc. MD5 Message **
- ** Digest Algorithm" in all material mentioning or referencing this **
- ** software or this function. **
- ** **
- ** License is also granted to make and use derivative works **
- ** provided that such works are identified as "derived from the RSA **
- ** Data Security, Inc. MD5 Message Digest Algorithm" in all **
- ** material mentioning or referencing the derived work. **
- ** **
- ** RSA Data Security, Inc. makes no representations concerning **
- ** either the merchantability of this software or the suitability **
- ** of this software for any particular purpose. It is provided "as **
- ** is" without express or implied warranty of any kind. **
- ** **
- ** These notices must be retained in any copies of any part of this **
- ** documentation and/or software. **
- **********************************************************************
- */
-
-/* F, G and H are basic MD5 functions: selection, majority, parity */
-#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
-#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
-#define H(x, y, z) ((x) ^ (y) ^ (z))
-#define I(x, y, z) ((y) ^ ((x) | (~z)))
-
-/* ROTATE_LEFT rotates x left n bits */
-#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
-
-/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */
-/* Rotation is separate from addition to prevent recomputation */
-#define FF(a, b, c, d, x, s, ac) \
- {(a) += F ((b), (c), (d)) + (x) + (ac); \
- (a) = ROTATE_LEFT ((a), (s)); \
- (a) += (b); \
- }
-#define GG(a, b, c, d, x, s, ac) \
- {(a) += G ((b), (c), (d)) + (x) + (ac); \
- (a) = ROTATE_LEFT ((a), (s)); \
- (a) += (b); \
- }
-#define HH(a, b, c, d, x, s, ac) \
- {(a) += H ((b), (c), (d)) + (x) + (ac); \
- (a) = ROTATE_LEFT ((a), (s)); \
- (a) += (b); \
- }
-#define II(a, b, c, d, x, s, ac) \
- {(a) += I ((b), (c), (d)) + (x) + (ac); \
- (a) = ROTATE_LEFT ((a), (s)); \
- (a) += (b); \
- }
-
-#define X0 get_local_id(0)
-#define X1 get_local_id(1)
-#define X2 get_local_id(2)
-#define X3 get_group_id(0)
-#define X4 get_group_id(1)
-#define X5 get_group_id(2)
-#define X6 seed
-#define X7 i
-#define X8 n
-#define X9 get_local_size(0)
-#define X10 get_local_size(1)
-#define X11 get_local_size(2)
-#define X12 get_global_size(0)
-#define X13 get_global_size(1)
-#define X14 get_global_size(2)
-#define X15 0
-
- unsigned int a = 0x67452301;
- unsigned int b = 0xefcdab89;
- unsigned int c = 0x98badcfe;
- unsigned int d = 0x10325476;
-
- /* Round 1 */
-#define S11 7
-#define S12 12
-#define S13 17
-#define S14 22
- FF ( a, b, c, d, X0 , S11, 3614090360); /* 1 */
- FF ( d, a, b, c, X1 , S12, 3905402710); /* 2 */
- FF ( c, d, a, b, X2 , S13, 606105819); /* 3 */
- FF ( b, c, d, a, X3 , S14, 3250441966); /* 4 */
- FF ( a, b, c, d, X4 , S11, 4118548399); /* 5 */
- FF ( d, a, b, c, X5 , S12, 1200080426); /* 6 */
- FF ( c, d, a, b, X6 , S13, 2821735955); /* 7 */
- FF ( b, c, d, a, X7 , S14, 4249261313); /* 8 */
- FF ( a, b, c, d, X8 , S11, 1770035416); /* 9 */
- FF ( d, a, b, c, X9 , S12, 2336552879); /* 10 */
- FF ( c, d, a, b, X10, S13, 4294925233); /* 11 */
- FF ( b, c, d, a, X11, S14, 2304563134); /* 12 */
- FF ( a, b, c, d, X12, S11, 1804603682); /* 13 */
- FF ( d, a, b, c, X13, S12, 4254626195); /* 14 */
- FF ( c, d, a, b, X14, S13, 2792965006); /* 15 */
- FF ( b, c, d, a, X15, S14, 1236535329); /* 16 */
-
- /* Round 2 */
-#define S21 5
-#define S22 9
-#define S23 14
-#define S24 20
- GG ( a, b, c, d, X1 , S21, 4129170786); /* 17 */
- GG ( d, a, b, c, X6 , S22, 3225465664); /* 18 */
- GG ( c, d, a, b, X11, S23, 643717713); /* 19 */
- GG ( b, c, d, a, X0 , S24, 3921069994); /* 20 */
- GG ( a, b, c, d, X5 , S21, 3593408605); /* 21 */
- GG ( d, a, b, c, X10, S22, 38016083); /* 22 */
- GG ( c, d, a, b, X15, S23, 3634488961); /* 23 */
- GG ( b, c, d, a, X4 , S24, 3889429448); /* 24 */
- GG ( a, b, c, d, X9 , S21, 568446438); /* 25 */
- GG ( d, a, b, c, X14, S22, 3275163606); /* 26 */
- GG ( c, d, a, b, X3 , S23, 4107603335); /* 27 */
- GG ( b, c, d, a, X8 , S24, 1163531501); /* 28 */
- GG ( a, b, c, d, X13, S21, 2850285829); /* 29 */
- GG ( d, a, b, c, X2 , S22, 4243563512); /* 30 */
- GG ( c, d, a, b, X7 , S23, 1735328473); /* 31 */
- GG ( b, c, d, a, X12, S24, 2368359562); /* 32 */
-
- /* Round 3 */
-#define S31 4
-#define S32 11
-#define S33 16
-#define S34 23
- HH ( a, b, c, d, X5 , S31, 4294588738); /* 33 */
- HH ( d, a, b, c, X8 , S32, 2272392833); /* 34 */
- HH ( c, d, a, b, X11, S33, 1839030562); /* 35 */
- HH ( b, c, d, a, X14, S34, 4259657740); /* 36 */
- HH ( a, b, c, d, X1 , S31, 2763975236); /* 37 */
- HH ( d, a, b, c, X4 , S32, 1272893353); /* 38 */
- HH ( c, d, a, b, X7 , S33, 4139469664); /* 39 */
- HH ( b, c, d, a, X10, S34, 3200236656); /* 40 */
- HH ( a, b, c, d, X13, S31, 681279174); /* 41 */
- HH ( d, a, b, c, X0 , S32, 3936430074); /* 42 */
- HH ( c, d, a, b, X3 , S33, 3572445317); /* 43 */
- HH ( b, c, d, a, X6 , S34, 76029189); /* 44 */
- HH ( a, b, c, d, X9 , S31, 3654602809); /* 45 */
- HH ( d, a, b, c, X12, S32, 3873151461); /* 46 */
- HH ( c, d, a, b, X15, S33, 530742520); /* 47 */
- HH ( b, c, d, a, X2 , S34, 3299628645); /* 48 */
-
- /* Round 4 */
-#define S41 6
-#define S42 10
-#define S43 15
-#define S44 21
- II ( a, b, c, d, X0 , S41, 4096336452); /* 49 */
- II ( d, a, b, c, X7 , S42, 1126891415); /* 50 */
- II ( c, d, a, b, X14, S43, 2878612391); /* 51 */
- II ( b, c, d, a, X5 , S44, 4237533241); /* 52 */
- II ( a, b, c, d, X12, S41, 1700485571); /* 53 */
- II ( d, a, b, c, X3 , S42, 2399980690); /* 54 */
- II ( c, d, a, b, X10, S43, 4293915773); /* 55 */
- II ( b, c, d, a, X1 , S44, 2240044497); /* 56 */
- II ( a, b, c, d, X8 , S41, 1873313359); /* 57 */
- II ( d, a, b, c, X15, S42, 4264355552); /* 58 */
- II ( c, d, a, b, X6 , S43, 2734768916); /* 59 */
- II ( b, c, d, a, X13, S44, 1309151649); /* 60 */
- II ( a, b, c, d, X4 , S41, 4149444226); /* 61 */
- II ( d, a, b, c, X11, S42, 3174756917); /* 62 */
- II ( c, d, a, b, X2 , S43, 718787259); /* 63 */
- II ( b, c, d, a, X9 , S44, 3951481745); /* 64 */
-
- a += 0x67452301;
- b += 0xefcdab89;
- c += 0x98badcfe;
- d += 0x10325476;
-"""
+from pyopencl.tools import first_arg_dependent_memoize
+from pytools import memoize_method
import numpy as np
-@context_dependent_memoize
-def get_rand_kernel(context, dtype):
- from pyopencl.elementwise import get_elwise_kernel
- if dtype == np.float32:
- return get_elwise_kernel(context,
- "float *dest, unsigned int seed",
- md5_code + """
- #define POW_2_M32 (1/4294967296.0f)
- dest[i] = a*POW_2_M32;
- if ((i += gsize) < n)
- dest[i] = b*POW_2_M32;
- if ((i += gsize) < n)
- dest[i] = c*POW_2_M32;
- if ((i += gsize) < n)
- dest[i] = d*POW_2_M32;
- """,
- "md5_rng_float")
- elif dtype == np.float64:
- return get_elwise_kernel(context,
- "double *dest, unsigned int seed",
- md5_code + """
- #define POW_2_M32 (1/4294967296.0)
- #define POW_2_M64 (1/18446744073709551616.)
-
- dest[i] = a*POW_2_M32 + b*POW_2_M64;
-
- if ((i += gsize) < n)
+class RanluxGenerator(object):
+ def __init__(self, queue, num_work_items,
+ luxury=4, seed=None, no_warmup=False,
+ use_legacy_init=False, max_work_items=None):
+ if seed is None:
+ from time import time
+ seed = int(time()*1e6) % 2<<30
+
+ self.context = queue.context
+ self.luxury = luxury
+ self.num_work_items = num_work_items
+
+ from pyopencl.characterize import has_double_support
+ self.support_double = has_double_support(queue.device)
+
+ self.no_warmup = no_warmup
+ self.use_legacy_init = use_legacy_init
+ self.max_work_items = max_work_items
+
+ prg = cl.Program(queue.context, """
+ %s
+
+ #include
+
+ kernel void init_ranlux(unsigned seeds, global ranluxcl_state_t *ranluxcltab)
+ {
+ ranluxcl_initialization(seeds, ranluxcltab);
+ }
+ """ % self.generate_settings_defines()).build()
+
+ self.state = cl_array.empty(queue, (num_work_items, 112), dtype=np.uint8)
+ prg.init_ranlux(queue, (num_work_items,), None, np.uint32(seed),
+ self.state.data)
+
+ def generate_settings_defines(self, include_double_pragma=True):
+ lines = []
+ if include_double_pragma and self.support_double:
+ lines.append("#pragma OPENCL EXTENSION cl_khr_fp64 : enable")
+
+ lines.append("#define RANLUXCL_LUX %d" % self.luxury)
+
+ if self.no_warmup:
+ lines.append("#define RANLUXCL_NO_WARMUP")
+
+ if self.support_double:
+ lines.append("#define RANLUXCL_SUPPORT_DOUBLE")
+
+ if self.use_legacy_init:
+ lines.append("#define RANLUXCL_USE_LEGACY_INITIALIZATION")
+
+ if self.max_work_items:
+ lines.append("#define RANLUXCL_MAXWORKITEMS %d" % self.max_work_items)
+
+ return "\n".join(lines)
+
+ @memoize_method
+ def get_gen_kernel(self, dtype, flavor=""):
+ if dtype == np.float64:
+ bits = 64
+ c_type = "double"
+ rng_expr = "(shift + scale * gen)"
+ elif dtype == np.float32:
+ bits = 32
+ c_type = "float"
+ rng_expr = "(shift + scale * gen)"
+ elif dtype == np.int32:
+ assert flavor == ""
+ bits = 32
+ c_type = "int"
+ rng_expr = ("(shift "
+ "+ convert_int4(scale * gen) "
+ "+ convert_int4((scale / (1<<24)) * gen))"
+ )
+ else:
+ raise TypeError("unsupported RNG data type '%s'" % dtype)
+
+ rl_flavor = "%d%s" % (bits, flavor)
+
+ src = """//CL//
+ %(defines)s
+
+ #include
+
+ typedef %(output_t)s output_t;
+ typedef %(output_t)s4 output_vec_t;
+ #define NUM_WORKITEMS %(num_work_items)d
+ #define RANLUX_FUNC ranluxcl##%(rlflavor)s
+ #define GET_RANDOM_NUM(gen) %(rng_expr)s
+
+ kernel void generate(
+ global ranluxcl_state_t *ranluxcltab,
+ global output_t *output,
+ unsigned long out_size,
+ output_t scale,
+ output_t shift)
{
- dest[i] = c*POW_2_M32 + d*POW_2_M64;
+ ranluxcl_state_t ranluxclstate;
+ ranluxcl_download_seed(&ranluxclstate, ranluxcltab);
+
+ // output bulk
+ unsigned long idx = get_global_id(0)*4;
+ while (idx + 4 < out_size)
+ {
+ vstore4(GET_RANDOM_NUM(RANLUX_FUNC(&ranluxclstate)), idx >> 2, output);
+ idx += 4*NUM_WORKITEMS;
+ }
+
+ // output tail
+ output_vec_t tail_ran = GET_RANDOM_NUM(RANLUX_FUNC(&ranluxclstate));
+ if (idx < out_size)
+ output[idx] = tail_ran.x;
+ if (idx+1 < out_size)
+ output[idx+1] = tail_ran.y;
+ if (idx+2 < out_size)
+ output[idx+2] = tail_ran.z;
+ if (idx+3 < out_size)
+ output[idx+2] = tail_ran.w;
+
+ ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
}
- """,
- "md5_rng_float")
- elif dtype in [np.int32, np.uint32]:
- return get_elwise_kernel(context,
- "unsigned int *dest, unsigned int seed",
- md5_code + """
- dest[i] = a;
- if ((i += gsize) < n)
- dest[i] = b;
- if ((i += gsize) < n)
- dest[i] = c;
- if ((i += gsize) < n)
- dest[i] = d;
- """,
- "md5_rng_int")
+ """ % {
+ "defines": self.generate_settings_defines(),
+ "rlflavor": rl_flavor,
+ "output_t": c_type,
+ "num_work_items": self.num_work_items,
+ "rng_expr": rng_expr
+ }
+
+ prg = cl.Program(self.context, src).build()
+ knl = prg.generate
+ knl.set_scalar_arg_dtypes([None, None, np.uint64, dtype, dtype])
+
+ return knl
+
+ def fill_uniform(self, ary, a=0, b=1, queue=None):
+ if queue is None:
+ queue = ary.queue
+
+ self.get_gen_kernel(ary.dtype, "")(queue, (self.num_work_items,), None,
+ self.state.data, ary.data, ary.size,
+ b-a, a)
+
+ def uniform(self, *args, **kwargs):
+ a = kwargs.pop("a", 0)
+ b = kwargs.pop("b", 1)
+
+ result = cl_array.empty(*args, **kwargs)
+
+ self.fill_uniform(result, queue=result.queue, a=a, b=b)
+ return result
+
+ def fill_normal(self, ary, mu=0, sigma=1, queue=None):
+ if queue is None:
+ queue = ary.queue
+
+ self.get_gen_kernel(ary.dtype, "norm")(queue, (self.num_work_items,), None,
+ self.state.data, ary.data, ary.size, sigma, mu)
+
+ def normal(self, *args, **kwargs):
+ mu = kwargs.pop("mu", 0)
+ sigma = kwargs.pop("sigma", 1)
+
+ result = cl_array.empty(*args, **kwargs)
+
+ self.fill_normal(result, queue=result.queue, mu=mu, sigma=sigma)
+ return result
+
+ @memoize_method
+ def get_sync_kernel(self):
+ src = """//CL//
+ %(defines)s
+
+ #include
+
+ kernel void sync(
+ global ranluxcl_state_t *ranluxcltab)
+ {
+ ranluxcl_state_t ranluxclstate;
+ ranluxcl_download_seed(&ranluxclstate, ranluxcltab);
+ ranluxcl_synchronize(&ranluxclstate);
+ ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
+ }
+ """ % {
+ "defines": self.generate_settings_defines(),
+ }
+ prg = cl.Program(self.context, src).build()
+ return prg.sync
+
+ def synchronize(self, queue):
+ self.get_sync_kernel()(queue, (self.num_work_items,), None, self.state.data)
+
+
+
+
+
+
+@first_arg_dependent_memoize
+def _get_generator(queue):
+ if queue.device.type == cl.device_type.CPU:
+ num_work_items = 8 * queue.device.max_compute_units
else:
- raise NotImplementedError
+ num_work_items = 64 * queue.device.max_compute_units
+
+ gen = RanluxGenerator(queue, num_work_items)
+ queue.finish()
+ return gen
@@ -245,8 +229,9 @@ def rand(*args, **kwargs):
def inner_rand(queue, shape, dtype):
from pyopencl.array import Array
+ gen = _get_generator(queue)
result = Array(queue, shape, dtype)
- _rand(result, np.random.randint(2**31-1))
+ gen.fill_uniform(result)
return result
if isinstance(args[0], cl.Context):
@@ -259,27 +244,7 @@ def rand(*args, **kwargs):
return inner_rand(*args, **kwargs)
-if __name__ == "__main__":
- import sys
- ctx = cl.create_some_context()
- queue = cl.CommandQueue(ctx)
- if "generate" in sys.argv[1:]:
- N = 256
- print N, "MB"
- r = rand(ctx, queue, (N*2**18,), np.uint32)
- print "generated"
- r.get().tofile("random.dat")
- print "written"
- else:
- from pylab import plot, show, subplot
- N = 250
- r1 = rand(ctx, queue, (N,), np.uint32)
- r2 = rand(ctx, queue, (N,), np.int32)
- r3 = rand(ctx, queue, (N,), np.float32)
-
- subplot(131); plot( r1.get(),"x-")
- subplot(132); plot( r2.get(),"x-")
- subplot(133); plot( r3.get(),"x-")
- show()
+
+# vim: filetype=pyopencl
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index fce7f7db12d0ebc8e253b7d42d7f3fbad4fb23df..986eb885d23d50dd436c7bd45623693698c77208 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -43,40 +43,42 @@ MemoryPool = cl.MemoryPool
-context_dependent_memoized_functions = []
+first_arg_dependent_memoized_functions = []
@decorator
-def context_dependent_memoize(func, context, *args):
+def first_arg_dependent_memoize(func, context, *args):
"""Provides memoization for things that get created inside
a context, i.e. mainly programs and kernels. Assumes that
the first argument of the decorated function is the context.
"""
try:
- ctx_dict = func._pyopencl_ctx_dep_memoize_dic
+ ctx_dict = func._pyopencl_first_arg_dep_memoize_dic
except AttributeError:
# FIXME: This may keep contexts alive longer than desired.
# But I guess since the memory in them is freed, who cares.
- ctx_dict = func._pyopencl_ctx_dep_memoize_dic = {}
+ ctx_dict = func._pyopencl_first_arg_dep_memoize_dic = {}
try:
return ctx_dict[context][args]
except KeyError:
- context_dependent_memoized_functions.append(func)
+ first_arg_dependent_memoized_functions.append(func)
arg_dict = ctx_dict.setdefault(context, {})
result = func(context, *args)
arg_dict[args] = result
return result
+context_dependent_memoize = first_arg_dependent_memoize
+
def clear_context_caches():
- for func in context_dependent_memoized_functions:
+ for func in first_arg_dependent_memoized_functions:
try:
- ctx_dict = func._pycuda_ctx_dep_memoize_dic
+ ctx_dict = func._pyopencl_first_arg_dep_memoize_dic
except AttributeError:
pass
else:
diff --git a/setup.py b/setup.py
index 78a8700184a72b3343fa7d967784d2c25e2cfdf0..27e4c864f5f9e537e7c8e067350ac9cc85cdbc2c 100644
--- a/setup.py
+++ b/setup.py
@@ -224,6 +224,11 @@ def main():
),
],
+ data_files=[
+ ("include/pyopencl",
+ glob.glob("src/cl/*.cl") + glob.glob("src/cl/*.h"))
+ ],
+
# 2to3 invocation
cmdclass={'build_py': build_py})
diff --git a/src/cl/pyopencl-ranluxcl.cl b/src/cl/pyopencl-ranluxcl.cl
new file mode 100644
index 0000000000000000000000000000000000000000..1a1a0e53b0f5d4d1a2036c7f86bd86c691d50ac9
--- /dev/null
+++ b/src/cl/pyopencl-ranluxcl.cl
@@ -0,0 +1,798 @@
+#ifndef RANLUXCL_CL
+#define RANLUXCL_CL
+
+/**** RANLUXCL v1.3.1 MODIFIED *****************************************************
+
+Implements the RANLUX generator of Matrin Luscher, based on the Fortran 77
+implementation by Fred James. This OpenCL code is a complete implementation which
+should perfectly replicate the numbers generated by the original Fortran 77
+implementation (if using the legacy initialization routine).
+
+***** QUICK USAGE DESCRIPTION ******************************************************
+
+1. Create an OpenCL buffer with room for at least 28 32-bit variables (112 byte).
+I.e., in C/C++: size_t buffSize = numWorkitems * 112;
+
+2. Pass the buffer and an unsigned integer seed to a kernel that launches the
+ranluxcl_initialization function. The seed can be any unsigned 32-bit integer,
+and must be different on different OpenCL devices/NDRanges to ensure different
+sequences. As long as the number of work-items on each device/NDRange is less than
+2^32 = 4294967296 all sequences will be different.
+An examle initialization kernel would be:
+ #include "ranluxcl.cl"
+ kernel void Kernel_Ranluxcl_Init(private uint ins, global ranluxcl_state_t *ranluxcltab){
+ ranluxcl_initialization(ins, ranluxcltab);
+ }
+
+3. Now the generator is ready for use. Remember to download the seeds first, and
+upload them again when done. Example kernel that downloads seeds, generates a float4
+where each component is uniformly distributed between 0 and 1, end points not included,
+then uploads the seeds again:
+ #include "ranluxcl.cl"
+ kernel void Kernel_Example(global ranluxcl_state_t *ranluxcltab){
+ //ranluxclstate is a struct of 7 float4 variables
+ //storing the state of the generator.
+ ranluxcl_state_t ranluxclstate;
+
+ //Download state into ranluxclstate struct.
+ ranluxcl_download_seed(&ranluxclstate, ranluxcltab);
+
+ //Generate a float4 with each component on (0,1),
+ //end points not included. We can call ranluxcl as many
+ //times as we like until we upload the state again.
+ float4 randomnr = ranluxcl32(&ranluxclstate);
+
+ //Upload state again so that we don't get the same
+ //numbers over again the next time we use ranluxcl.
+ ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
+ }
+
+***** MACROS ***********************************************************************
+
+The following macros can optionally be defined:
+
+RANLUXCL_LUX:
+Sets the luxury level of the generator. Should be 0-4, or if it is 24 or larger it
+sets the p-value of the generator (generally not needed). If this macro is not set
+then lux=4 is the default (highest quality). For many applications the high quality
+of lux=4 may not be needed. Indeed if two values (each value having 24 random bits)
+are glued together to form a 48-bit value the generator passes all tests in the TestU01
+suite already with lux=2. See "TestU01: A C Library for Empirical Testing of Random
+Number Generators" by PIERRE L’ECUYER and RICHARD SIMARD. SWB(224, 10, 24)[24, l] is
+RANLUX with two values glued together to create 48-bit numbers, and we see that it
+passes all tests already at luxury value 2.
+
+RANLUXCL_NO_WARMUP:
+Turns off the warmup functionality in ranluxcl_initialization. This macro should
+generally not be used, since the generators will initially be correlated if it is
+defined. The only advantage is that the numbers generated will exactly correspond
+to those of the original Fortran 77 implementation.
+
+RANLUXCL_SUPPORT_DOUBLE:
+Enables double precision functions. Please enable the OpenCL double precision
+extension yourself, usually by "#pragma OPENCL EXTENSION cl_khr_fp64 : enable".
+
+RANLUXCL_USE_LEGACY_INITIALIZATION
+Uses exactly the same initialization routine as in the original Fortran 77 code,
+leading to the same sequences. If using legacy initialization there are some
+restrictions on what the seed can be, and it may also be necessary to define
+RANLUXCL_MAXWORKITEMS if several sequences are to be run in parallel.
+
+RANLUXCL_MAXWORKITEMS:
+When RANLUXCL_USE_LEGACY_INITIALIZATION is defined we may need this macro.
+If several OpenCL NDRanges will be running in parallel and the parallel sequences should
+be different then this macro should have a value equal or larger than the
+largest number of work-items in any of the parallel runs. The default is to use the
+current global size, so if all NDRanges are of the same size this need not be
+defined.
+ Each parallel instance must also have different seeds . For example if
+we are launching 5120 work-items on GPU1 and 10240 work-items on GPU2 we would use
+different seeds for the two generators, and RANLUXCL_MAXWORKITEMS must be defined to
+be at least 10240. If GPU1 and GPU2 had the same number of work-items this would not
+be necessary.
+ An underestimate of the highest permissible seed is given by the smallest of:
+( = 10^9 / ) or ( = 10^9 / RANLUXCL_MAXWORKITEMS).
+Please make sure that is never higher than this since it could cause undetected
+problems. For example with 10240 work-items the highest permissible is about
+100 000.
+ Again note that this is only relevant when using the legacy
+initialization function enabled by RANLUXCL_USE_LEGACY_INITIALIZATION. When not using
+the legacy initialization this macro is effectively set to a very high value of 2^32-1.
+
+***** FUNCTIONS: INITIALIZATION ****************************************************
+
+The initialization function is defined as:
+void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
+Run once at the very beginning. ranluxcltab should be a buffer with space for
+112 byte per work-item in the NDRange. is the seed to the generator.
+For a given each work-item in the NDRange will generate a different sequence.
+If more than one NDRange is used in parallel then must be different for each
+NDRange to avoid identical sequences.
+
+***** FUNCTIONS: SEED UPLOAD/DOWNLOAD **********************************************
+
+The following two functions should be launced at the beginning and end of a kernel
+that uses ranluxcl to generate numbers, respectively:
+
+void ranluxcl_download_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab)
+Run at the beginning of a kernel to download ranluxcl state data
+
+void ranluxcl_upload_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab)
+Run at the end of a kernel to upload state data
+
+***** FUNCTIONS: GENERATION AND SYNCHRONIZATION ************************************
+
+float4 ranluxcl32(ranluxcl_state_t *rst)
+Run to generate a pseudo-random float4 where each component is a number between
+0 and 1, end points not included (meaning the number will never be exactly 0 or 1).
+
+double4 ranluxcl64(ranluxcl_state_t *rst)
+Double precision version of the above function. The preprocessor macro
+RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available.
+This function "glues" together two single-precision numbers to make one double
+precision number. Most of the work is still done in single precision, so the
+performance will be roughly halved regardless of the double precision performance
+of the hardware.
+
+float4 ranluxcl32norm(ranluxcl_state_t *rst)
+Run to generate a pseudo-random float4 where each component is normally distributed
+with mean 0 and standard deviation 1.
+
+double4 ranluxcl64norm(ranluxcl_state_t *rst)
+Double precision version of the above function. The preprocessor macro
+RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available.
+
+void ranluxcl_synchronize(ranluxcl_state_t *rst)
+Run to synchronize execution in case different work-items have made a different
+number of calls to ranluxcl. On SIMD machines this could lead to inefficient execution.
+ranluxcl_synchronize allows us to make sure all generators are SIMD-friendly again. Not
+needed if all work-items always call ranluxcl the same number of times.
+
+***** PERFORMANCE ******************************************************************
+
+For luxury setting 4, performance on AMD Cypress should be ~4.5*10^9 pseudorandom
+values per second, when not downloading values to host memory (i.e. the values are
+just generated, but not used for anything in particular).
+
+***** DESCRIPTION OF THE IMPLEMENTATION ********************************************
+
+This code closely follows the original Fortran 77 code (see credit section). Here
+the differences (and similarities) between RANLUXCL (this implementation) and the
+original RANLUX are discussed.
+
+The Fortran 77 implementation uses a simple LCG to initialize the generator, and
+so the same approach is taken here. If RANLUXCL is initialized with = 0 as
+seed, the first work-item behaves like the original RANLUX with seed equal 1, the
+second work-item as if with seed equal 2 and so on. If = 1 then the first
+work-item behaves like the original RANLUX with seed equal to + 1,
+and so on for higher so that we never have overlapping sequences. This is
+why the RANLUXCL_MAXWORKITEMS macro must be set if we have different NDRanges with
+a different number of work-items.
+
+RANLUX is based on chaos theory, and what we are actually doing when selecting
+a luxury value is setting how many values to skip over (causing decorrelation).
+The number of values to skip is controlled by the so-called p-value of the
+generator. After generating 24 values we skip p - 24 values until again generating
+24 values.
+
+This implementation is somewhat modified from the original fortran implementation
+by F. James. Because of the way the OpenCL code is optimized with 4-component
+32-bit float vectors, it is most convenient to always throw away some multiple
+of 24 values (i.e. p is always a multiple of 24).
+
+However, there might be some resonances if we always throw away a multiple of
+the seeds table size. Therefore the implementation is slightly more intricate
+where p can be a multiple of 4 instead, at a cost to performance (only about 10%
+lower than the cleaner 24 values approach on AMD Cypress). These two approaches
+are termed planar and planar shift respectively. The idea for the planar approach
+comes from the following paper:
+Vadim Demchik, Pseudo-random number generators for Monte Carlo simulations on
+Graphics Processing Units, arXiv:1003.1898v1 [hep-lat]
+
+Below the p-values for the original reference implementation are listed along with
+those of the planar shift implementation. Suggested values for the planar approach
+are also presented. When this function is called with RANLUXCL_LUX set to 0-4, the
+planar shift values are used. To use the pure planar approach (for some extra
+performance with likely undetectable quality decrease), set lux equal to the specific
+p-value.
+
+Luxury setting (RANLUXCL_LUX): 0 1 2 3 4
+Original fortran77 implementation by F. James: 24 48 97 223 389
+Planar (suggested): 24 48 120 240 408
+Planar shift: 24 48 100 224 404
+
+Note that levels 0 and 1 are the same as in the original implementation for both
+planar and planar shift. Level 4 of planar shift where p=404 is the same as chosen
+for luxury level 1 by Martin Luescher for his v3 version of RANLUX. Therefore if
+it is considered important to only use "official" values, luxury settings 0, 1 or
+4 of planar shift should be used. It is however unlikely that the other values are
+bad, they just haven't been as extensively used and tested by others.
+
+Variable names are generally the same as in the fortran77 implementation, however
+because of the way the generator is implemented, the i24 and j24 variables are
+no longer needed.
+
+***** CREDIT ***********************************************************************
+
+I have been told by Fred James (the coder) that the original Fortran 77
+implementation (which is the subject of the second paper below) is free to use and
+share. Therefore I am using the MIT license (below). But most importantly please
+always remember to give credit to the two articles by Martin Luscher and Fred James,
+describing the generator and the Fortran 77 implementation on which this
+implementation is based, respectively:
+
+Martin Lüscher, A portable high-quality random number generator for lattice
+field theory simulations, Computer Physics Communications 79 (1994) 100-110
+
+F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom
+number generator of Lüscher, Computer Physics Communications 79 (1994) 111-114
+
+***** LICENSE **********************************************************************
+
+Copyright (c) 2011 Ivar Ursin Nikolaisen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this
+software and associated documentation files (the "Software"), to deal in the Software
+without restriction, including without limitation the rights to use, copy, modify,
+merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be included in all copies
+or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+***************************************************************************************/
+
+typedef struct{
+ float
+ seed01, seed02, seed03, seed04,
+ seed05, seed06, seed07, seed08,
+ seed09, seed10, seed11, seed12,
+ seed13, seed14, seed15, seed16,
+ seed17, seed18, seed19, seed20,
+ seed21, seed22, seed23, seed24;
+ float carry;
+ float dummy; //Causes struct to be a multiple of 128 bits
+ int in24;
+ int stepnr;
+} ranluxcl_state_t;
+
+#define RANLUXCL_TWOM24 0.000000059604644775f
+#define RANLUXCL_TWOM12 0.000244140625f
+
+#ifdef RANLUXCL_LUX
+#if RANLUXCL_LUX < 0
+#error ranluxcl: lux must be zero or positive.
+#endif
+#else
+#define RANLUXCL_LUX 4 //Default to high quality
+#endif //RANLUXCL_LUX
+
+//Here the luxury values are defined
+#if RANLUXCL_LUX == 0
+#define RANLUXCL_NSKIP 0
+#elif RANLUXCL_LUX == 1
+#define RANLUXCL_NSKIP 24
+#elif RANLUXCL_LUX == 2
+#define RANLUXCL_NSKIP 76
+#elif RANLUXCL_LUX == 3
+#define RANLUXCL_NSKIP 200
+#elif RANLUXCL_LUX == 4
+#define RANLUXCL_NSKIP 380
+#else
+#define RANLUXCL_NSKIP (RANLUXCL_LUX - 24)
+#endif //RANLUXCL_LUX == 0
+
+//Check that nskip is a permissible value
+#if RANLUXCL_NSKIP % 4 != 0
+#error nskip must be divisible by 4!
+#endif
+#if RANLUXCL_NSKIP < 24 && RANLUXCL_NSKIP != 0
+#error nskip must be either 0 or >= 24!
+#endif
+#if RANLUXCL_NSKIP < 0
+#error nskip is negative!
+#endif
+
+//Check if planar scheme is recovered
+#if RANLUXCL_NSKIP % 24 == 0
+#define RANLUXCL_PLANAR
+#endif
+
+//Check if we will skip at all
+#if RANLUXCL_NSKIP == 0
+#define RANLUXCL_NOSKIP
+#endif
+
+//Single-value global size and id
+#define RANLUXCL_NUMWORKITEMS (get_global_size(0) * get_global_size(1) * get_global_size(2))
+#define RANLUXCL_MYID (get_global_id(0) + get_global_id(1) * get_global_size(0) + get_global_id(2) * get_global_size(0) * get_global_size(1))
+
+void ranluxcl_download_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab)
+{
+ (*rst) = ranluxcltab[RANLUXCL_MYID];
+}
+
+void ranluxcl_upload_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab)
+{
+ ranluxcltab[RANLUXCL_MYID] = (*rst);
+}
+
+float ranluxcl_onestep(float sj24m1, float sj24, float *si24, float *carry){
+ float uni, out;
+ uni = sj24 - (*si24) - (*carry);
+ if(uni < 0.0f){
+ uni += 1.0f;
+ (*carry) = RANLUXCL_TWOM24;
+ } else (*carry) = 0.0f;
+ out = ((*si24) = uni);
+
+ if(uni < RANLUXCL_TWOM12){
+ out += RANLUXCL_TWOM24 * sj24m1;
+ if(out == 0.0f) out = RANLUXCL_TWOM24 * RANLUXCL_TWOM24;
+ }
+ return out;
+}
+
+float4 ranluxcl32(ranluxcl_state_t *rst)
+{
+ //ranluxcl32 returns a 4-component float vector where each component is uniformly distributed
+ //between 0-1, end points not included.
+
+ float4 out;
+
+ if((*rst).stepnr == 0){
+ out.x = ranluxcl_onestep((*rst).seed09, (*rst).seed10, &((*rst).seed24), &((*rst).carry));
+ out.y = ranluxcl_onestep((*rst).seed08, (*rst).seed09, &((*rst).seed23), &((*rst).carry));
+ out.z = ranluxcl_onestep((*rst).seed07, (*rst).seed08, &((*rst).seed22), &((*rst).carry));
+ out.w = ranluxcl_onestep((*rst).seed06, (*rst).seed07, &((*rst).seed21), &((*rst).carry));
+ (*rst).stepnr += 4;
+ }
+
+ else if((*rst).stepnr == 4){
+ out.x = ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry));
+ out.y = ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry));
+ out.z = ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry));
+ out.w = ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry));
+ (*rst).stepnr += 4;
+ }
+
+ else if((*rst).stepnr == 8){
+ out.x = ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry));
+ out.y = ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry));
+ out.z = ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry));
+ out.w = ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry));
+ (*rst).stepnr += 4;
+ }
+
+ else if((*rst).stepnr == 12){
+ out.x = ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry));
+ out.y = ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry));
+ out.z = ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry));
+ out.w = ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry));
+ (*rst).stepnr += 4;
+ }
+
+ else if((*rst).stepnr == 16){
+ out.x = ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry));
+ out.y = ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry));
+ out.z = ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry));
+ out.w = ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry));
+ (*rst).stepnr += 4;
+ }
+
+ else if((*rst).stepnr == 20){
+ out.x = ranluxcl_onestep((*rst).seed13, (*rst).seed14, &((*rst).seed04), &((*rst).carry));
+ out.y = ranluxcl_onestep((*rst).seed12, (*rst).seed13, &((*rst).seed03), &((*rst).carry));
+ out.z = ranluxcl_onestep((*rst).seed11, (*rst).seed12, &((*rst).seed02), &((*rst).carry));
+ out.w = ranluxcl_onestep((*rst).seed10, (*rst).seed11, &((*rst).seed01), &((*rst).carry));
+ (*rst).stepnr = 0;
+
+//The below preprocessor directives are here to recover the simpler planar scheme when nskip is a multiple of 24.
+//For the most general planar shift approach, just ignore all #if's below.
+#ifndef RANLUXCL_PLANAR
+ }
+
+ (*&((*rst).in24)) += 4;
+ if((*&((*rst).in24)) == 24){
+ (*&((*rst).in24)) = 0;
+#endif //RANLUXCL_PLANAR
+
+ int initialskips = ((*rst).stepnr) ? (24 - (*rst).stepnr) : 0;
+ int bulkskips = ((RANLUXCL_NSKIP - initialskips)/24) * 24;
+ int remainingskips = RANLUXCL_NSKIP - initialskips - bulkskips;
+
+//We know there won't be any initial skips in the planar scheme
+#ifndef RANLUXCL_PLANAR
+ //Do initial skips (lack of breaks in switch is intentional).
+ switch(initialskips){
+ case(20):
+ ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry));
+ case(16):
+ ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry));
+ case(12):
+ ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry));
+ case(8):
+ ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry));
+ case(4):
+ ranluxcl_onestep((*rst).seed13, (*rst).seed14, &((*rst).seed04), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed12, (*rst).seed13, &((*rst).seed03), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed11, (*rst).seed12, &((*rst).seed02), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed10, (*rst).seed11, &((*rst).seed01), &((*rst).carry));
+ }
+#endif //RANLUXCL_PLANAR
+
+//Also check if we will ever need to skip at all
+#ifndef RANLUXCL_NOSKIP
+ for(int i=0; i 4){
+ ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry));
+ }
+
+ if(remainingskips > 8){
+ ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry));
+ }
+
+ if(remainingskips > 12){
+ ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry));
+ }
+
+ if(remainingskips > 16){
+ ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry));
+ ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry));
+ }
+ }
+#endif //RANLUXCL_PLANAR
+
+ //Initial skips brought stepnr down to 0. The bulk skips did only full cycles.
+ //Therefore stepnr is now equal to remainingskips.
+ (*rst).stepnr = remainingskips;
+ }
+
+ return out;
+}
+
+void ranluxcl_synchronize(ranluxcl_state_t *rst){
+ //This function generates numbers so that the generator is at the beginning,
+ //i.e. ready to generate 24 numbers before the next skipping sequence. This is
+ //useful if different work-items have called ranluxcl a different number of times.
+ //Since that would lead to out of sync execution it could be rather inefficient on
+ //SIMD architectures like GPUs. This function thus allows us to resynchronize
+ //execution across all work-items.
+
+ //Do necessary number of calls to ranluxcl so that stepnr == 0 at the end.
+ if((*rst).stepnr == 4)
+ ranluxcl32(rst);
+ if((*rst).stepnr == 8)
+ ranluxcl32(rst);
+ if((*rst).stepnr == 12)
+ ranluxcl32(rst);
+ if((*rst).stepnr == 16)
+ ranluxcl32(rst);
+ if((*rst).stepnr == 20)
+ ranluxcl32(rst);
+}
+
+void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
+{
+ ranluxcl_state_t rst;
+
+ #ifdef RANLUXCL_USE_LEGACY_INITIALIZATION
+ //Using legacy initialization from original Fortan 77 implementation
+
+ //ins is scaled so that if the user makes another call somewhere else
+ //with ins + 1 there should be no overlap. Also adding one
+ //allows us to use ins = 0.
+ int k, maxWorkitems;
+
+ #ifdef RANLUXCL_USE_LEGACY_INITIALIZATION
+ maxWorkitems = RANLUXCL_USE_LEGACY_INITIALIZATION;
+ #else
+ maxWorkitems = RANLUXCL_NUMWORKITEMS;
+ #endif //RANLUXCL_USE_LEGACY_INITIALIZATION
+
+ int scaledins = ins * maxWorkitems + 1;
+
+ int js = scaledins + RANLUXCL_MYID;
+
+ //Make sure js is not too small (should really be an error)
+ if(js < 1)
+ js = 1;
+
+ #define IC 2147483563
+ #define ITWO24 16777216
+
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed01=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed02=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed03=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed04=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed05=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed06=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed07=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed08=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed09=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed10=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed11=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed12=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed13=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed14=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed15=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed16=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed17=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed18=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed19=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed20=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed21=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed22=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed23=(js%ITWO24)*RANLUXCL_TWOM24;
+ k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed24=(js%ITWO24)*RANLUXCL_TWOM24;
+
+ #undef IC
+ #undef ITWO24
+
+ #else //RANLUXCL_USE_LEGACY_INITIALIZATION
+ //Using default initialization
+
+ #define RANLUXCL_POW2_24 16777216
+ #define RANLUXCL_56 0x00FFFFFFFFFFFFFF
+ #define RANLUXCL_48 0x0000FFFFFFFFFFFF
+ #define RANLUXCL_40 0x000000FFFFFFFFFF
+ #define RANLUXCL_32 0x00000000FFFFFFFF
+ #define RANLUXCL_24 0x0000000000FFFFFF
+ #define RANLUXCL_16 0x000000000000FFFF
+ #define RANLUXCL_8 0x00000000000000FF
+
+ //We scale ins by (2^32)-1. As long as we never use more than (2^32)-1 work-items per
+ //NDRange the initial states should never be the same. We use a simple 64-bit LCG from
+ //the SPRNG library. We choose a single prime b, while in SPRNG for parallel applications
+ //different b give rise to different sequences in parallel (a feature not used here).
+ ulong x1, x2, x3;
+ ulong x = (ulong)RANLUXCL_MYID + (ulong)ins * 4294967295;
+ ulong a = 2862933555777941757;
+ ulong b = 3037000493;
+
+ //Logical shifts used so that all 64 bits of LCG output are used (24 bits per float),
+ //to be certain that all initial states are different.
+ x1 = x = ((a * x + b) & ULONG_MAX);
+ x2 = x = ((a * x + b) & ULONG_MAX);
+ x3 = x = ((a * x + b) & ULONG_MAX);
+ rst.seed01 = (float) (x1 >> 40) / (float)RANLUXCL_POW2_24;
+ rst.seed02 = (float) ((x1 & RANLUXCL_40) >> 16) / (float)RANLUXCL_POW2_24;
+ rst.seed03 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56)) / (float)RANLUXCL_POW2_24;
+ rst.seed04 = (float) ((x2 & RANLUXCL_56) >> 32) / (float)RANLUXCL_POW2_24;
+ rst.seed05 = (float) ((x2 & RANLUXCL_32) >> 8) / (float)RANLUXCL_POW2_24;
+ rst.seed06 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24;
+ rst.seed07 = (float) ((x3 & RANLUXCL_48) >> 24) / (float)RANLUXCL_POW2_24;
+ rst.seed08 = (float) (x3 & RANLUXCL_24) / (float)RANLUXCL_POW2_24;
+
+ x1 = x = ((a * x + b) & ULONG_MAX);
+ x2 = x = ((a * x + b) & ULONG_MAX);
+ x3 = x = ((a * x + b) & ULONG_MAX);
+ rst.seed09 = (float) (x1 >> 40) / (float)RANLUXCL_POW2_24;
+ rst.seed10 = (float) ((x1 & RANLUXCL_40) >> 16) / (float)RANLUXCL_POW2_24;
+ rst.seed11 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56)) / (float)RANLUXCL_POW2_24;
+ rst.seed12 = (float) ((x2 & RANLUXCL_56) >> 32) / (float)RANLUXCL_POW2_24;
+ rst.seed13 = (float) ((x2 & RANLUXCL_32) >> 8) / (float)RANLUXCL_POW2_24;
+ rst.seed14 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24;
+ rst.seed15 = (float) ((x3 & RANLUXCL_48) >> 24) / (float)RANLUXCL_POW2_24;
+ rst.seed16 = (float) (x3 & RANLUXCL_24) / (float)RANLUXCL_POW2_24;
+
+ x1 = x = ((a * x + b) & ULONG_MAX);
+ x2 = x = ((a * x + b) & ULONG_MAX);
+ x3 = x = ((a * x + b) & ULONG_MAX);
+ rst.seed17 = (float) (x1 >> 40) / (float)RANLUXCL_POW2_24;
+ rst.seed18 = (float) ((x1 & RANLUXCL_40) >> 16) / (float)RANLUXCL_POW2_24;
+ rst.seed19 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56)) / (float)RANLUXCL_POW2_24;
+ rst.seed20 = (float) ((x2 & RANLUXCL_56) >> 32) / (float)RANLUXCL_POW2_24;
+ rst.seed21 = (float) ((x2 & RANLUXCL_32) >> 8) / (float)RANLUXCL_POW2_24;
+ rst.seed22 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24;
+ rst.seed23 = (float) ((x3 & RANLUXCL_48) >> 24) / (float)RANLUXCL_POW2_24;
+ rst.seed24 = (float) (x3 & RANLUXCL_24) / (float)RANLUXCL_POW2_24;
+
+ #undef RANLUXCL_POW2_24
+ #undef RANLUXCL_56
+ #undef RANLUXCL_48
+ #undef RANLUXCL_40
+ #undef RANLUXCL_32
+ #undef RANLUXCL_24
+ #undef RANLUXCL_16
+ #undef RANLUXCL_8
+
+ #endif //RANLUXCL_USE_LEGACY_INITIALIZATION
+
+ rst.in24 = 0;
+ rst.stepnr = 0;
+ rst.carry = 0.0f;
+ if(rst.seed24 == 0.0f)
+ rst.carry = RANLUXCL_TWOM24;
+
+ #ifndef RANLUXCL_NO_WARMUP
+ //Warming up the generator, ensuring there are no initial correlations.
+ //16 is a "magic number". It is the number of times we must generate
+ //a batch of 24 numbers to ensure complete decorrelation.
+ for(int i=0; i<16; i++){
+ ranluxcl_onestep(rst.seed09, rst.seed10, &(rst.seed24), &(rst.carry));
+ ranluxcl_onestep(rst.seed08, rst.seed09, &(rst.seed23), &(rst.carry));
+ ranluxcl_onestep(rst.seed07, rst.seed08, &(rst.seed22), &(rst.carry));
+ ranluxcl_onestep(rst.seed06, rst.seed07, &(rst.seed21), &(rst.carry));
+ ranluxcl_onestep(rst.seed05, rst.seed06, &(rst.seed20), &(rst.carry));
+ ranluxcl_onestep(rst.seed04, rst.seed05, &(rst.seed19), &(rst.carry));
+ ranluxcl_onestep(rst.seed03, rst.seed04, &(rst.seed18), &(rst.carry));
+ ranluxcl_onestep(rst.seed02, rst.seed03, &(rst.seed17), &(rst.carry));
+ ranluxcl_onestep(rst.seed01, rst.seed02, &(rst.seed16), &(rst.carry));
+ ranluxcl_onestep(rst.seed24, rst.seed01, &(rst.seed15), &(rst.carry));
+ ranluxcl_onestep(rst.seed23, rst.seed24, &(rst.seed14), &(rst.carry));
+ ranluxcl_onestep(rst.seed22, rst.seed23, &(rst.seed13), &(rst.carry));
+ ranluxcl_onestep(rst.seed21, rst.seed22, &(rst.seed12), &(rst.carry));
+ ranluxcl_onestep(rst.seed20, rst.seed21, &(rst.seed11), &(rst.carry));
+ ranluxcl_onestep(rst.seed19, rst.seed20, &(rst.seed10), &(rst.carry));
+ ranluxcl_onestep(rst.seed18, rst.seed19, &(rst.seed09), &(rst.carry));
+ ranluxcl_onestep(rst.seed17, rst.seed18, &(rst.seed08), &(rst.carry));
+ ranluxcl_onestep(rst.seed16, rst.seed17, &(rst.seed07), &(rst.carry));
+ ranluxcl_onestep(rst.seed15, rst.seed16, &(rst.seed06), &(rst.carry));
+ ranluxcl_onestep(rst.seed14, rst.seed15, &(rst.seed05), &(rst.carry));
+ ranluxcl_onestep(rst.seed13, rst.seed14, &(rst.seed04), &(rst.carry));
+ ranluxcl_onestep(rst.seed12, rst.seed13, &(rst.seed03), &(rst.carry));
+ ranluxcl_onestep(rst.seed11, rst.seed12, &(rst.seed02), &(rst.carry));
+ ranluxcl_onestep(rst.seed10, rst.seed11, &(rst.seed01), &(rst.carry));
+ }
+ #endif //RANLUXCL_NO_WARMUP
+
+ //Upload the state
+ ranluxcl_upload_seed(&rst, ranluxcltab);
+}
+
+float4 ranluxcl32norm(ranluxcl_state_t *rst)
+{
+ //Returns a vector where each component is a normally
+ //distributed PRN centered on 0, with standard deviation
+ //1. Note: M_PI_F is an OpenCL macro for the value of pi.
+ //M_PI would be the 64-bit double version.
+
+ float4 U = ranluxcl32(rst);
+
+ float4 Z;
+ float R, phi;
+
+ R = sqrt(-2 * log(U.x));
+ phi = 2 * M_PI_F * U.y;
+ Z.x = R * cos(phi);
+ Z.y = R * sin(phi);
+
+ R = sqrt(-2 * log(U.z));
+ phi = 2 * M_PI_F * U.w;
+ Z.z = R * cos(phi);
+ Z.w = R * sin(phi);
+
+ return Z;
+}
+
+#ifdef RANLUXCL_SUPPORT_DOUBLE
+double4 ranluxcl64(ranluxcl_state_t *rst)
+{
+ double4 out;
+ float4 randvec;
+
+ //We know this value is caused by the never-zero part
+ //of the original algorithm, but we want to allow zero for
+ //the most significant bits in the double precision result.
+ randvec = ranluxcl32(rst);
+ if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
+ randvec.x = 0.0f;
+ if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
+ randvec.z = 0.0f;
+
+ out.x = (double)(randvec.x) + (double)(randvec.y) / 16777216;
+ out.y = (double)(randvec.z) + (double)(randvec.w) / 16777216;
+
+ randvec = ranluxcl32(rst);
+ if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
+ randvec.x = 0.0f;
+ if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
+ randvec.z = 0.0f;
+
+ out.z = (double)(randvec.x) + (double)(randvec.y) / 16777216;
+ out.w = (double)(randvec.z) + (double)(randvec.w) / 16777216;
+
+ return out;
+}
+
+double4 ranluxcl64norm(ranluxcl_state_t *rst)
+{
+ //Returns a vector where each component is a normally
+ //distributed PRN centered on 0, with standard deviation
+ //1.
+
+ double4 U = ranluxcl64(rst);
+
+ double4 Z;
+ double R, phi;
+
+ R = sqrt(-2 * log(U.x));
+ phi = 2 * M_PI * U.y;
+ Z.x = R * cos(phi);
+ Z.y = R * sin(phi);
+
+ R = sqrt(-2 * log(U.z));
+ phi = 2 * M_PI * U.w;
+ Z.z = R * cos(phi);
+ Z.w = R * sin(phi);
+
+ return Z;
+}
+#endif //RANLUXCL_SUPPORT_DOUBLE
+
+#undef RANLUXCL_TWOM24
+#undef RANLUXCL_TWOM12
+#undef RANLUXCL_NUMWORKITEMS
+#undef RANLUXCL_MYID
+#undef RANLUXCL_PLANAR
+#undef RANLUXCL_NOSKIP
+
+#endif //RANLUXCL_CL
diff --git a/test/test_array.py b/test/test_array.py
index 9854afdc3760e00f17b99bd70de7ed7330a054ab..2a939cedeaaa0a687d5da7329126c61c6f604a36 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -226,18 +226,39 @@ def test_random(ctx_factory):
context = ctx_factory()
queue = cl.CommandQueue(context)
- from pyopencl.clrandom import rand as clrand
+ from pyopencl.clrandom import RanluxGenerator
if has_double_support(context.devices[0]):
dtypes = [np.float32, np.float64]
else:
dtypes = [np.float32]
+ gen = RanluxGenerator(queue, 5120)
+
for dtype in dtypes:
- a = clrand(context, queue, (10, 100), dtype=dtype).get()
+ ran = gen.uniform(queue, (10007,), dtype)
+ assert (0 < ran.get()).all()
+ assert (ran.get() < 1).all()
+
+ gen.synchronize(queue)
+
+ ran = gen.uniform(queue, (10007,), dtype, a=4, b=7)
+ assert (4 < ran.get()).all()
+ assert (ran.get() < 7).all()
+
+ ran = gen.normal(queue, (10007,), dtype, mu=4, sigma=3)
+
+ dtypes = [np.int32]
+ for dtype in dtypes:
+ ran = gen.uniform(queue, (10000007,), dtype, a=200, b=300)
+ assert (200 <= ran.get()).all()
+ assert (ran.get() < 300).all()
+ #from matplotlib import pyplot as pt
+ #pt.hist(ran.get())
+ #pt.show()
+
+
- assert (0 <= a).all()
- assert (a < 1).all()
@pytools.test.mark_test.opencl