diff --git a/doc/source/array.rst b/doc/source/array.rst
index 48525e96e930c7d1a18bbc83f75028844ea7003d..7264d84ea351bcadc93b7c702c328c882b4f6fbe 100644
--- a/doc/source/array.rst
+++ b/doc/source/array.rst
@@ -361,11 +361,80 @@ Generating Arrays of Random Numbers
 
 .. module:: pyopencl.clrandom
 
+.. class:: RanluxGenerator(self, queue, num_work_items, luxury=4, seed=None, no_warmup=False, use_legacy_init=False, max_work_items=None)
+
+    :param queue: :class:`pyopencl.CommandQueue`, only used for initialization
+    :param luxury: the "luxury value" of the generator, and should be 0-4, where 0 is fastest
+        and 4 produces the best numbers. It can also be >=24, in which case it directly
+        sets the p-value of RANLUXCL.
+    :param num_work_items: is the number of generators to initialize, usually corresponding
+        to the number of work-items in the NDRange RANLUXCL will be used with.
+    :param max_work_items: should reflect the maximum number of work-items that will be used
+        on any parallel instance of RANLUXCL. So for instance if we are launching 5120
+        work-items on GPU1 and 10240 work-items on GPU2, GPU1's RANLUXCLTab would be
+        generated by calling ranluxcl_initialization with numWorkitems = 5120 while
+        GPU2's RANLUXCLTab would use numWorkitems = 10240. However maxWorkitems must
+        be at least 10240 for both GPU1 and GPU2, and it must be set to the same value
+        for both.
+
+    .. attribute:: state
+
+        A :class:`pyopencl.array.Array` containing the state of the generator.
+
+    .. attribute:: nskip
+
+        nskip is an integer which can (optionally) be defined in the kernel code
+        as RANLUXCL_NSKIP. If this is done the generator will be faster for luxury setting
+        0 and 1, or when the p-value is manually set to a multiple of 24.
+
+    .. method:: fill_uniform(ary, a=0, b=1, queue=None)
+
+        Fill *ary* with uniformly distributed random numbers in the interval
+        *(a, b)*, endpoints excluded.
+
+    .. method:: uniform(queue, shape, dtype, order="C", allocator=None, base=None, data=None, a=0, b=1)
+
+        Make a new empty array, apply :meth:`fill_uniform` to it.
+
+    .. method:: fill_normal(ary, mu=0, sigma=1, queue=None)
+
+        Fill *ary* with normally distributed numbers with mean *mu* and
+        standard deviation *sigma*.
+
+    .. method:: normal(queue, shape, dtype, order="C", allocator=None, base=None, data=None, mu=0, sigma=1)
+
+        Make a new empty array, apply :meth:`fill_normal` to it.
+
+    .. method:: synchronize(queue)
+
+        The generator gets inefficient when different work items invoke
+        the generator a differing number of times. This function
+        ensures efficiency.
+
 .. function:: rand(queue, shape, dtype)
 
     Return an array of `shape` filled with random values of `dtype`
     in the range [0,1).
 
+PyOpenCL now includes and uses the `RANLUXCL random number generator
+<https://bitbucket.org/ivarun/ranluxcl/>`_ by Ivar Ursin Nikolaisen.  In
+addition to being usable through the convenience functions above, it is
+available in any piece of code compiled through PyOpenCL by::
+
+    #include <pyopencl-ranluxcl.cl>
+
+The RANLUX generator is described in the following two articles. If you use the
+generator for scientific purposes, please consider citing them:
+
+* Martin Lüscher, A portable high-quality random number generator for lattice
+  field theory simulations, `Computer Physics Communications 79 (1994) 100-110
+  <http://dx.doi.org/10.1016/0010-4655(94)90232-1>`_
+
+* F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom
+  number generator of Lüscher, `Computer Physics Communications 79 (1994) 111-114
+  <http://dx.doi.org/10.1016/0010-4655(94)90233-X>`_
+
+
 Single-pass Custom Expression Evaluation
 ----------------------------------------
 
diff --git a/doc/source/misc.rst b/doc/source/misc.rst
index c69f95ccdede76632397ff54bb1e4fb8a16b216c..dfabd69b7f437f09eb038e2d54982c74d5d08262 100644
--- a/doc/source/misc.rst
+++ b/doc/source/misc.rst
@@ -79,10 +79,12 @@ Version 2011.2
   severe consequences on the execution time of :class:`pyopencl.array.Array`
   operations.
   Henrik Andresen at a `PyOpenCL workshop at DTU <http://gpulab.imm.dtu.dk/courses.html>`_
-  first noticed the timings
+  first noticed the strange timings.
 * All comparable PyOpenCL objects are now also hashable.
 * Add :func:`pyopencl.tools.context_dependent_memoize` to the documented
   functionality.
+* Base :mod:`pyopencl.clrandom` on `RANLUXCL <https://bitbucket.org/ivarun/ranluxcl>`_,
+  add functionality.
 
 Version 2011.1.2
 ----------------
@@ -299,6 +301,27 @@ implementation). These parts are licensed as follows:
     with software licensed exclusively under GPL2.  (Most software is licensed
     as GPL2 or later, in which case this is not an issue.)
 
+PyOpenCL includes the RANLUXCL random number generator:
+
+    Copyright (c) 2011 Ivar Ursin Nikolaisen
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy of this
+    software and associated documentation files (the "Software"), to deal in the Software
+    without restriction, including without limitation the rights to use, copy, modify,
+    merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+    permit persons to whom the Software is furnished to do so, subject to the following
+    conditions:
+
+    The above copyright notice and this permission notice shall be included in all copies
+    or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+    INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+    PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+    OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
 Frequently Asked Questions
 ==========================
 
diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index b1fcb788f240fc8c5b21a409942d464e14f54e00..e689051cf8f800a0b0f47fe392cfc80f8155b6a3 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -368,6 +368,39 @@ _add_functionality()
 
 
 
+# {{{ find pyopencl shipped source code
+
+def _find_pyopencl_include_path():
+    """Return the first existing directory that holds PyOpenCL's CL headers.
+
+    Raises RuntimeError, listing every candidate searched, if none exists.
+    """
+    import sys
+    from imp import find_module
+    from os.path import join, exists
+    pathname = find_module("pyopencl")[1]
+
+    # candidates relative to the package dir: src/cl and include/pyopencl
+    possible_include_paths = [
+            join(pathname, "..", "include", "pyopencl"),
+            join(pathname, "..", "src", "cl"),
+            join(pathname, "..", "..", "..", "src", "cl"),
+            join(pathname, "..", "..", "..", "..", "include", "pyopencl"),
+            join(pathname, "..", "..", "..", "include", "pyopencl"),
+            ]
+
+    # sys.platform is "linux2"/"linux3" on Python 2 but plain "linux" on 3.3+
+    if sys.platform.startswith("linux") or sys.platform == "darwin":
+        possible_include_paths.extend([join(sys.prefix, "include", "pyopencl"),
+            "/usr/include/pyopencl", "/usr/local/include/pyopencl"])
+
+    for inc_path in possible_include_paths:
+        if exists(inc_path):
+            return inc_path
+    raise RuntimeError("could not find path to PyOpenCL's CL header files,"
+            " searched in : %s" % '\n'.join(possible_include_paths))
+
+# }}}
 
 # {{{ Program (including caching support)
 
@@ -425,6 +458,11 @@ class Program(object):
                     "info attribute or as a kernel name" % attr)
 
     def build(self, options=[], devices=None, cache_dir=None):
+        if isinstance(options, str):
+            options = [options]
+
+        options = options + ["-I", _find_pyopencl_include_path()]
+
         if self._prg is not None:
             self._prg._build(options, devices)
         else:
diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py
index 2b13047eef792e23bd023d238f74aa03b604c41f..ab4805b20e85b1ed30dc88c8302573da61aa310d 100644
--- a/pyopencl/clrandom.py
+++ b/pyopencl/clrandom.py
@@ -1,235 +1,219 @@
 import pyopencl as cl
 import pyopencl.array as cl_array
-from pyopencl.tools import context_dependent_memoize
-
-
-
-
-md5_code = """
-/*
- **********************************************************************
- ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. **
- **                                                                  **
- ** License to copy and use this software is granted provided that   **
- ** it is identified as the "RSA Data Security, Inc. MD5 Message     **
- ** Digest Algorithm" in all material mentioning or referencing this **
- ** software or this function.                                       **
- **                                                                  **
- ** License is also granted to make and use derivative works         **
- ** provided that such works are identified as "derived from the RSA **
- ** Data Security, Inc. MD5 Message Digest Algorithm" in all         **
- ** material mentioning or referencing the derived work.             **
- **                                                                  **
- ** RSA Data Security, Inc. makes no representations concerning      **
- ** either the merchantability of this software or the suitability   **
- ** of this software for any particular purpose.  It is provided "as **
- ** is" without express or implied warranty of any kind.             **
- **                                                                  **
- ** These notices must be retained in any copies of any part of this **
- ** documentation and/or software.                                   **
- **********************************************************************
- */
-
-/* F, G and H are basic MD5 functions: selection, majority, parity */
-#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
-#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
-#define H(x, y, z) ((x) ^ (y) ^ (z))
-#define I(x, y, z) ((y) ^ ((x) | (~z)))
-
-/* ROTATE_LEFT rotates x left n bits */
-#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
-
-/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */
-/* Rotation is separate from addition to prevent recomputation */
-#define FF(a, b, c, d, x, s, ac) \
-  {(a) += F ((b), (c), (d)) + (x) + (ac); \
-   (a) = ROTATE_LEFT ((a), (s)); \
-   (a) += (b); \
-  }
-#define GG(a, b, c, d, x, s, ac) \
-  {(a) += G ((b), (c), (d)) + (x) + (ac); \
-   (a) = ROTATE_LEFT ((a), (s)); \
-   (a) += (b); \
-  }
-#define HH(a, b, c, d, x, s, ac) \
-  {(a) += H ((b), (c), (d)) + (x) + (ac); \
-   (a) = ROTATE_LEFT ((a), (s)); \
-   (a) += (b); \
-  }
-#define II(a, b, c, d, x, s, ac) \
-  {(a) += I ((b), (c), (d)) + (x) + (ac); \
-   (a) = ROTATE_LEFT ((a), (s)); \
-   (a) += (b); \
-  }
-
-#define X0 get_local_id(0)
-#define X1 get_local_id(1)
-#define X2 get_local_id(2)
-#define X3 get_group_id(0)
-#define X4 get_group_id(1)
-#define X5 get_group_id(2)
-#define X6 seed
-#define X7 i
-#define X8 n
-#define X9  get_local_size(0)
-#define X10 get_local_size(1)
-#define X11 get_local_size(2)
-#define X12 get_global_size(0)
-#define X13 get_global_size(1)
-#define X14 get_global_size(2)
-#define X15 0
-
-  unsigned int a = 0x67452301;
-  unsigned int b = 0xefcdab89;
-  unsigned int c = 0x98badcfe;
-  unsigned int d = 0x10325476;
-
-  /* Round 1 */
-#define S11 7
-#define S12 12
-#define S13 17
-#define S14 22
-  FF ( a, b, c, d, X0 , S11, 3614090360); /* 1 */
-  FF ( d, a, b, c, X1 , S12, 3905402710); /* 2 */
-  FF ( c, d, a, b, X2 , S13,  606105819); /* 3 */
-  FF ( b, c, d, a, X3 , S14, 3250441966); /* 4 */
-  FF ( a, b, c, d, X4 , S11, 4118548399); /* 5 */
-  FF ( d, a, b, c, X5 , S12, 1200080426); /* 6 */
-  FF ( c, d, a, b, X6 , S13, 2821735955); /* 7 */
-  FF ( b, c, d, a, X7 , S14, 4249261313); /* 8 */
-  FF ( a, b, c, d, X8 , S11, 1770035416); /* 9 */
-  FF ( d, a, b, c, X9 , S12, 2336552879); /* 10 */
-  FF ( c, d, a, b, X10, S13, 4294925233); /* 11 */
-  FF ( b, c, d, a, X11, S14, 2304563134); /* 12 */
-  FF ( a, b, c, d, X12, S11, 1804603682); /* 13 */
-  FF ( d, a, b, c, X13, S12, 4254626195); /* 14 */
-  FF ( c, d, a, b, X14, S13, 2792965006); /* 15 */
-  FF ( b, c, d, a, X15, S14, 1236535329); /* 16 */
-
-  /* Round 2 */
-#define S21 5
-#define S22 9
-#define S23 14
-#define S24 20
-  GG ( a, b, c, d, X1 , S21, 4129170786); /* 17 */
-  GG ( d, a, b, c, X6 , S22, 3225465664); /* 18 */
-  GG ( c, d, a, b, X11, S23,  643717713); /* 19 */
-  GG ( b, c, d, a, X0 , S24, 3921069994); /* 20 */
-  GG ( a, b, c, d, X5 , S21, 3593408605); /* 21 */
-  GG ( d, a, b, c, X10, S22,   38016083); /* 22 */
-  GG ( c, d, a, b, X15, S23, 3634488961); /* 23 */
-  GG ( b, c, d, a, X4 , S24, 3889429448); /* 24 */
-  GG ( a, b, c, d, X9 , S21,  568446438); /* 25 */
-  GG ( d, a, b, c, X14, S22, 3275163606); /* 26 */
-  GG ( c, d, a, b, X3 , S23, 4107603335); /* 27 */
-  GG ( b, c, d, a, X8 , S24, 1163531501); /* 28 */
-  GG ( a, b, c, d, X13, S21, 2850285829); /* 29 */
-  GG ( d, a, b, c, X2 , S22, 4243563512); /* 30 */
-  GG ( c, d, a, b, X7 , S23, 1735328473); /* 31 */
-  GG ( b, c, d, a, X12, S24, 2368359562); /* 32 */
-
-  /* Round 3 */
-#define S31 4
-#define S32 11
-#define S33 16
-#define S34 23
-  HH ( a, b, c, d, X5 , S31, 4294588738); /* 33 */
-  HH ( d, a, b, c, X8 , S32, 2272392833); /* 34 */
-  HH ( c, d, a, b, X11, S33, 1839030562); /* 35 */
-  HH ( b, c, d, a, X14, S34, 4259657740); /* 36 */
-  HH ( a, b, c, d, X1 , S31, 2763975236); /* 37 */
-  HH ( d, a, b, c, X4 , S32, 1272893353); /* 38 */
-  HH ( c, d, a, b, X7 , S33, 4139469664); /* 39 */
-  HH ( b, c, d, a, X10, S34, 3200236656); /* 40 */
-  HH ( a, b, c, d, X13, S31,  681279174); /* 41 */
-  HH ( d, a, b, c, X0 , S32, 3936430074); /* 42 */
-  HH ( c, d, a, b, X3 , S33, 3572445317); /* 43 */
-  HH ( b, c, d, a, X6 , S34,   76029189); /* 44 */
-  HH ( a, b, c, d, X9 , S31, 3654602809); /* 45 */
-  HH ( d, a, b, c, X12, S32, 3873151461); /* 46 */
-  HH ( c, d, a, b, X15, S33,  530742520); /* 47 */
-  HH ( b, c, d, a, X2 , S34, 3299628645); /* 48 */
-
-  /* Round 4 */
-#define S41 6
-#define S42 10
-#define S43 15
-#define S44 21
-  II ( a, b, c, d, X0 , S41, 4096336452); /* 49 */
-  II ( d, a, b, c, X7 , S42, 1126891415); /* 50 */
-  II ( c, d, a, b, X14, S43, 2878612391); /* 51 */
-  II ( b, c, d, a, X5 , S44, 4237533241); /* 52 */
-  II ( a, b, c, d, X12, S41, 1700485571); /* 53 */
-  II ( d, a, b, c, X3 , S42, 2399980690); /* 54 */
-  II ( c, d, a, b, X10, S43, 4293915773); /* 55 */
-  II ( b, c, d, a, X1 , S44, 2240044497); /* 56 */
-  II ( a, b, c, d, X8 , S41, 1873313359); /* 57 */
-  II ( d, a, b, c, X15, S42, 4264355552); /* 58 */
-  II ( c, d, a, b, X6 , S43, 2734768916); /* 59 */
-  II ( b, c, d, a, X13, S44, 1309151649); /* 60 */
-  II ( a, b, c, d, X4 , S41, 4149444226); /* 61 */
-  II ( d, a, b, c, X11, S42, 3174756917); /* 62 */
-  II ( c, d, a, b, X2 , S43,  718787259); /* 63 */
-  II ( b, c, d, a, X9 , S44, 3951481745); /* 64 */
-
-  a += 0x67452301;
-  b += 0xefcdab89;
-  c += 0x98badcfe;
-  d += 0x10325476;
-"""
+from pyopencl.tools import first_arg_dependent_memoize
+from pytools import memoize_method
 
 import numpy as np
 
 
 
 
-@context_dependent_memoize
-def get_rand_kernel(context, dtype):
-    from pyopencl.elementwise import get_elwise_kernel
-    if dtype == np.float32:
-        return get_elwise_kernel(context,
-            "float *dest, unsigned int seed",
-            md5_code + """
-            #define POW_2_M32 (1/4294967296.0f)
-            dest[i] = a*POW_2_M32;
-            if ((i += gsize) < n)
-                dest[i] = b*POW_2_M32;
-            if ((i += gsize) < n)
-                dest[i] = c*POW_2_M32;
-            if ((i += gsize) < n)
-                dest[i] = d*POW_2_M32;
-            """,
-            "md5_rng_float")
-    elif dtype == np.float64:
-        return get_elwise_kernel(context,
-            "double *dest, unsigned int seed",
-            md5_code + """
-            #define POW_2_M32 (1/4294967296.0)
-            #define POW_2_M64 (1/18446744073709551616.)
-
-            dest[i] = a*POW_2_M32 + b*POW_2_M64;
-
-            if ((i += gsize) < n)
+class RanluxGenerator(object):
+    def __init__(self, queue, num_work_items, luxury=4, seed=None,
+            no_warmup=False, use_legacy_init=False, max_work_items=None):
+        """Store the settings and seed one generator state per work item."""
+        if seed is None:
+            from time import time
+            # "x % 2<<30" parses as "(x % 2) << 30", i.e. only 0 or 2**30;
+            # parenthesize so the time-based seed is reduced modulo 2**31
+            seed = int(time()*1e6) % (2<<30)
+
+        self.context = queue.context
+        self.luxury = luxury
+        self.num_work_items = num_work_items
+        self.no_warmup = no_warmup
+        self.use_legacy_init = use_legacy_init
+        self.max_work_items = max_work_items
+
+        from pyopencl.characterize import has_double_support
+        self.support_double = has_double_support(queue.device)
+
+        prg = cl.Program(queue.context, """
+            %s
+
+            #include <pyopencl-ranluxcl.cl>
+
+            kernel void init_ranlux(unsigned seeds, global ranluxcl_state_t *ranluxcltab)
+            {
+              ranluxcl_initialization(seeds, ranluxcltab);
+            }
+            """ % self.generate_settings_defines()).build()
+        self.state = cl_array.empty(queue, (num_work_items, 112), dtype=np.uint8)
+        prg.init_ranlux(queue, (num_work_items,), None, np.uint32(seed),
+                self.state.data)
+
+    def generate_settings_defines(self, include_double_pragma=True):
+        """Return the preprocessor lines that configure RANLUXCL, newline-joined."""
+        defines = ["#define RANLUXCL_LUX %d" % self.luxury]
+
+        # the fp64 pragma (if any) goes ahead of the defines
+        if include_double_pragma and self.support_double:
+            defines.insert(0, "#pragma OPENCL EXTENSION cl_khr_fp64 : enable")
+
+        if self.no_warmup:
+            defines.append("#define RANLUXCL_NO_WARMUP")
+        if self.support_double:
+            defines.append("#define RANLUXCL_SUPPORT_DOUBLE")
+
+        # RANLUXCL_MAXWORKITEMS is only emitted together with legacy init
+        if self.use_legacy_init:
+            defines.append("#define RANLUXCL_USE_LEGACY_INITIALIZATION")
+        if self.use_legacy_init and self.max_work_items:
+            defines.append("#define RANLUXCL_MAXWORKITEMS %d" % self.max_work_items)
+
+        return "\n".join(defines)
+
+    @memoize_method
+    def get_gen_kernel(self, dtype, flavor=""):
+        """Return the (memoized) fill kernel for *dtype* and ranluxcl *flavor*."""
+        if dtype == np.float64:
+            bits = 64
+            c_type = "double"
+            rng_expr = "(shift + scale * gen)"
+        elif dtype == np.float32:
+            bits = 32
+            c_type = "float"
+            rng_expr = "(shift + scale * gen)"
+        elif dtype == np.int32:
+            assert flavor == ""  # int32 output only supports the plain flavor
+            bits = 32
+            c_type = "int"
+            rng_expr = ("(shift "
+                    "+ convert_int4(scale * gen) "
+                    "+ convert_int4((scale / (1<<24)) * gen))")
+        else:
+            raise TypeError("unsupported RNG data type '%s'" % dtype)
+
+        rl_flavor = "%d%s" % (bits, flavor)
+
+        src = """//CL//
+            %(defines)s
+
+            #include <pyopencl-ranluxcl.cl>
+
+            typedef %(output_t)s output_t;
+            typedef %(output_t)s4 output_vec_t;
+            #define NUM_WORKITEMS %(num_work_items)d
+            #define RANLUX_FUNC ranluxcl##%(rlflavor)s
+            #define GET_RANDOM_NUM(gen) %(rng_expr)s
+
+            kernel void generate(
+                global ranluxcl_state_t *ranluxcltab,
+                global output_t *output,
+                unsigned long out_size,
+                output_t scale,
+                output_t shift)
             {
-              dest[i] = c*POW_2_M32 + d*POW_2_M64;
+              ranluxcl_state_t ranluxclstate;
+              ranluxcl_download_seed(&ranluxclstate, ranluxcltab);
+
+              // output bulk
+              unsigned long idx = get_global_id(0)*4;
+              while (idx + 4 < out_size)
+              {
+                  vstore4(GET_RANDOM_NUM(RANLUX_FUNC(&ranluxclstate)), idx >> 2, output);
+                  idx += 4*NUM_WORKITEMS;
+              }
+
+              // output tail
+              output_vec_t tail_ran = GET_RANDOM_NUM(RANLUX_FUNC(&ranluxclstate));
+              if (idx < out_size)
+                output[idx] = tail_ran.x;
+              if (idx+1 < out_size)
+                output[idx+1] = tail_ran.y;
+              if (idx+2 < out_size)
+                output[idx+2] = tail_ran.z;
+              if (idx+3 < out_size)
+                output[idx+3] = tail_ran.w;
+
+              ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
             }
-            """,
-            "md5_rng_float")
-    elif dtype in [np.int32, np.uint32]:
-        return get_elwise_kernel(context,
-            "unsigned int *dest, unsigned int seed",
-            md5_code + """
-            dest[i] = a;
-            if ((i += gsize) < n)
-                dest[i] = b;
-            if ((i += gsize) < n)
-                dest[i] = c;
-            if ((i += gsize) < n)
-                dest[i] = d;
-            """,
-            "md5_rng_int")
+            """ % {
+                "defines": self.generate_settings_defines(),
+                "rlflavor": rl_flavor,
+                "output_t": c_type,
+                "num_work_items": self.num_work_items,
+                "rng_expr": rng_expr
+            }
+
+        prg = cl.Program(self.context, src).build()
+        knl = prg.generate
+        knl.set_scalar_arg_dtypes([None, None, np.uint64, dtype, dtype])  # out_size, scale, shift
+
+        return knl
+
+    def fill_uniform(self, ary, a=0, b=1, queue=None):
+        """Overwrite *ary* with uniform random numbers scaled to (a, b)."""
+        run_queue = ary.queue if queue is None else queue
+        gen_kernel = self.get_gen_kernel(ary.dtype, "")
+
+        gen_kernel(run_queue, (self.num_work_items,), None,
+                self.state.data, ary.data, ary.size, b-a, a)
+
+    def uniform(self, *args, **kwargs):
+        """Allocate an array with cl_array.empty and apply fill_uniform to it;
+        keywords *a*/*b* (default 0/1) are consumed here, not forwarded."""
+
+        lower, upper = kwargs.pop("a", 0), kwargs.pop("b", 1)
+        new_ary = cl_array.empty(*args, **kwargs)
+        self.fill_uniform(new_ary, a=lower, b=upper, queue=new_ary.queue)
+        return new_ary
+
+    def fill_normal(self, ary, mu=0, sigma=1, queue=None):
+        """Overwrite *ary* with normal random numbers (mean mu, std dev sigma)."""
+        run_queue = ary.queue if queue is None else queue
+        norm_kernel = self.get_gen_kernel(ary.dtype, "norm")
+        norm_kernel(run_queue, (self.num_work_items,), None,
+                self.state.data, ary.data, ary.size, sigma, mu)
+
+    def normal(self, *args, **kwargs):
+        """Allocate an array with cl_array.empty and apply fill_normal to it;
+        keywords *mu*/*sigma* (default 0/1) are consumed here, not forwarded."""
+
+        mean, std_dev = kwargs.pop("mu", 0), kwargs.pop("sigma", 1)
+        new_ary = cl_array.empty(*args, **kwargs)
+        self.fill_normal(new_ary, mu=mean, sigma=std_dev, queue=new_ary.queue)
+        return new_ary
+
+    @memoize_method
+    def get_sync_kernel(self):
+        """Compile and return the (memoized) ``sync`` kernel."""
+        template = """//CL//
+            %(defines)s
+
+            #include <pyopencl-ranluxcl.cl>
+
+            kernel void sync(
+                global ranluxcl_state_t *ranluxcltab)
+            {
+              ranluxcl_state_t ranluxclstate;
+              ranluxcl_download_seed(&ranluxclstate, ranluxcltab);
+              ranluxcl_synchronize(&ranluxclstate);
+              ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
+            }
+            """
+
+        settings = {"defines": self.generate_settings_defines()}
+        return cl.Program(self.context, template % settings).build().sync
+
+    def synchronize(self, queue):  # launch the sync kernel on *queue* over all generator states
+        self.get_sync_kernel()(queue, (self.num_work_items,), None, self.state.data)
+
+
+
+
+
+
+@first_arg_dependent_memoize
+def _get_generator(queue):  # memoized: one shared RanluxGenerator per queue
+    if queue.device.type == cl.device_type.CPU:
+        num_work_items = 8 * queue.device.max_compute_units
     else:
-        raise NotImplementedError
+        num_work_items = 64 * queue.device.max_compute_units
+
+    gen = RanluxGenerator(queue, num_work_items)
+    queue.finish()  # wait for the constructor's init kernel before handing the generator out
+    return gen
 
 
 
@@ -245,8 +229,9 @@ def rand(*args, **kwargs):
     def inner_rand(queue, shape, dtype):
         from pyopencl.array import Array
 
+        gen = _get_generator(queue)
         result = Array(queue, shape, dtype)
-        _rand(result, np.random.randint(2**31-1))
+        gen.fill_uniform(result)
         return result
 
     if isinstance(args[0], cl.Context):
@@ -259,27 +244,7 @@ def rand(*args, **kwargs):
 
     return inner_rand(*args, **kwargs)
 
-if __name__ == "__main__":
-    import sys
-    ctx = cl.create_some_context()
-    queue = cl.CommandQueue(ctx)
 
-    if "generate" in sys.argv[1:]:
-        N = 256
-        print N, "MB"
-        r = rand(ctx, queue, (N*2**18,), np.uint32)
-        print "generated"
-        r.get().tofile("random.dat")
-        print "written"
 
-    else:
-        from pylab import plot, show, subplot
-        N = 250
-        r1 = rand(ctx, queue, (N,), np.uint32)
-        r2 = rand(ctx, queue, (N,), np.int32)
-        r3 = rand(ctx, queue, (N,), np.float32)
-
-        subplot(131); plot( r1.get(),"x-")
-        subplot(132); plot( r2.get(),"x-")
-        subplot(133); plot( r3.get(),"x-")
-        show()
+
+# vim: filetype=pyopencl
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index fce7f7db12d0ebc8e253b7d42d7f3fbad4fb23df..986eb885d23d50dd436c7bd45623693698c77208 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -43,40 +43,42 @@ MemoryPool = cl.MemoryPool
 
 
 
-context_dependent_memoized_functions = []
+first_arg_dependent_memoized_functions = []
 
 
 
 
 @decorator
-def context_dependent_memoize(func, context, *args):
+def first_arg_dependent_memoize(func, context, *args):  # "context" is whatever the first argument is
     """Provides memoization for things that get created inside
     a context, i.e. mainly programs and kernels. Assumes that
     the first argument of the decorated function is the context.
     """
     try:
-        ctx_dict = func._pyopencl_ctx_dep_memoize_dic
+        ctx_dict = func._pyopencl_first_arg_dep_memoize_dic
     except AttributeError:
         # FIXME: This may keep contexts alive longer than desired.
         # But I guess since the memory in them is freed, who cares.
-        ctx_dict = func._pyopencl_ctx_dep_memoize_dic = {}
+        ctx_dict = func._pyopencl_first_arg_dep_memoize_dic = {}
 
     try:
         return ctx_dict[context][args]
     except KeyError:
-        context_dependent_memoized_functions.append(func)
+        first_arg_dependent_memoized_functions.append(func)  # registered (on every cache miss) for clear_context_caches()
         arg_dict = ctx_dict.setdefault(context, {})
         result = func(context, *args)
         arg_dict[args] = result
         return result
 
+context_dependent_memoize = first_arg_dependent_memoize
+
 
 
 
 def clear_context_caches():
-    for func in context_dependent_memoized_functions:
+    for func in first_arg_dependent_memoized_functions:
         try:
-            ctx_dict = func._pycuda_ctx_dep_memoize_dic
+            ctx_dict = func._pyopencl_first_arg_dep_memoize_dic
         except AttributeError:
             pass
         else:
diff --git a/setup.py b/setup.py
index 78a8700184a72b3343fa7d967784d2c25e2cfdf0..27e4c864f5f9e537e7c8e067350ac9cc85cdbc2c 100644
--- a/setup.py
+++ b/setup.py
@@ -224,6 +224,11 @@ def main():
                     ),
                 ],
 
+            data_files=[
+                ("include/pyopencl",
+                    glob.glob("src/cl/*.cl") + glob.glob("src/cl/*.h"))
+                ],
+
             # 2to3 invocation
             cmdclass={'build_py': build_py})
 
diff --git a/src/cl/pyopencl-ranluxcl.cl b/src/cl/pyopencl-ranluxcl.cl
new file mode 100644
index 0000000000000000000000000000000000000000..1a1a0e53b0f5d4d1a2036c7f86bd86c691d50ac9
--- /dev/null
+++ b/src/cl/pyopencl-ranluxcl.cl
@@ -0,0 +1,798 @@
+#ifndef RANLUXCL_CL
+#define RANLUXCL_CL
+
+/**** RANLUXCL v1.3.1 MODIFIED *****************************************************
+
+Implements the RANLUX generator of Martin Luscher, based on the Fortran 77
+implementation by Fred James. This OpenCL code is a complete implementation which 
+should perfectly replicate the numbers generated by the original Fortran 77
+implementation (if using the legacy initialization routine).
+
+***** QUICK USAGE DESCRIPTION ******************************************************
+
+1. Create an OpenCL buffer with room for at least 28 32-bit variables (112 bytes) per work-item.
+I.e., in C/C++: size_t buffSize = numWorkitems * 112;
+
+2. Pass the buffer and an unsigned integer seed <ins> to a kernel that launches the
+ranluxcl_initialization function. The seed <ins> can be any unsigned 32-bit integer,
+and must be different on different OpenCL devices/NDRanges to ensure different
+sequences. As long as the number of work-items on each device/NDRange is less than
+2^32 = 4294967296 all sequences will be different.
+An example initialization kernel would be:
+	#include "ranluxcl.cl"
+	kernel void Kernel_Ranluxcl_Init(private uint ins, global ranluxcl_state_t *ranluxcltab){
+		ranluxcl_initialization(ins, ranluxcltab);
+	}
+
+3. Now the generator is ready for use. Remember to download the seeds first, and
+upload them again when done. Example kernel that downloads seeds, generates a float4
+where each component is uniformly distributed between 0 and 1, end points not included,
+then uploads the seeds again:
+	#include "ranluxcl.cl"
+	kernel void Kernel_Example(global ranluxcl_state_t *ranluxcltab){
+		//ranluxclstate is a struct of 28 32-bit values (112 bytes)
+		//storing the state of the generator.
+		ranluxcl_state_t ranluxclstate;
+
+		//Download state into ranluxclstate struct.
+		ranluxcl_download_seed(&ranluxclstate, ranluxcltab);
+
+		//Generate a float4 with each component on (0,1),
+		//end points not included. We can call ranluxcl as many
+		//times as we like until we upload the state again.
+		float4 randomnr = ranluxcl32(&ranluxclstate);
+
+		//Upload state again so that we don't get the same
+		//numbers over again the next time we use ranluxcl.
+		ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
+	}
+
+***** MACROS ***********************************************************************
+
+The following macros can optionally be defined:
+
+RANLUXCL_LUX:
+Sets the luxury level of the generator. Should be 0-4, or if it is 24 or larger it
+sets the p-value of the generator (generally not needed). If this macro is not set
+then lux=4 is the default (highest quality). For many applications the high quality
+of lux=4 may not be needed. Indeed if two values (each value having 24 random bits) 
+are glued together to form a 48-bit value the generator passes all tests in the TestU01
+suite already with lux=2. See "TestU01: A C Library for Empirical Testing of Random 
+Number Generators" by PIERRE L'ECUYER and RICHARD SIMARD. SWB(224, 10, 24)[24, l] is
+RANLUX with two values glued together to create 48-bit numbers, and we see that it
+passes all tests already at luxury value 2.
+
+RANLUXCL_NO_WARMUP:
+Turns off the warmup functionality in ranluxcl_initialization. This macro should
+generally not be used, since the generators will initially be correlated if it is
+defined. The only advantage is that the numbers generated will exactly correspond
+to those of the original Fortran 77 implementation.
+
+RANLUXCL_SUPPORT_DOUBLE:
+Enables double precision functions. Please enable the OpenCL double precision
+extension yourself, usually by "#pragma OPENCL EXTENSION cl_khr_fp64 : enable".
+
+RANLUXCL_USE_LEGACY_INITIALIZATION:
+Uses exactly the same initialization routine as in the original Fortran 77 code,
+leading to the same sequences. If using legacy initialization there are some
+restrictions on what the seed <ins> can be, and it may also be necessary to define
+RANLUXCL_MAXWORKITEMS if several sequences are to be run in parallel.
+
+RANLUXCL_MAXWORKITEMS:
+When RANLUXCL_USE_LEGACY_INITIALIZATION is defined we may need this macro.
+If several OpenCL NDRanges will be running in parallel and the parallel sequences should
+be different then this macro should have a value equal or larger than the
+largest number of work-items in any of the parallel runs. The default is to use the
+current global size, so if all NDRanges are of the same size this need not be
+defined.
+	Each parallel instance must also have different seeds <ins>. For example if
+we are launching 5120 work-items on GPU1 and 10240 work-items on GPU2 we would use
+different seeds for the two generators, and RANLUXCL_MAXWORKITEMS must be defined to
+be at least 10240. If GPU1 and GPU2 had the same number of work-items this would not
+be necessary. 
+	An underestimate of the highest permissible seed <ins> is given by the smallest of:
+(<maxins> = 10^9 / <numWorkitems>) or (<maxins> = 10^9 / RANLUXCL_MAXWORKITEMS).
+Please make sure that <ins> is never higher than this since it could cause undetected
+problems. For example with 10240 work-items the highest permissible <ins> is about
+100 000.
+	Again note that this is only relevant when using the legacy
+initialization function enabled by RANLUXCL_USE_LEGACY_INITIALIZATION. When not using
+the legacy initialization this macro is effectively set to a very high value of 2^32-1.
+
+***** FUNCTIONS: INITIALIZATION ****************************************************
+
+The initialization function is defined as:
+void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
+Run once at the very beginning. ranluxcltab should be a buffer with space for
+112 byte per work-item in the NDRange. <ins> is the seed to the generator.
+For a given <ins> each work-item in the NDRange will generate a different sequence.
+If more than one NDRange is used in parallel then <ins> must be different for each
+NDRange to avoid identical sequences.
+
+***** FUNCTIONS: SEED UPLOAD/DOWNLOAD **********************************************
+
+The following two functions should be launched at the beginning and end of a kernel
+that uses ranluxcl to generate numbers, respectively:
+
+void ranluxcl_download_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab)
+Run at the beginning of a kernel to download ranluxcl state data
+
+void ranluxcl_upload_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab)
+Run at the end of a kernel to upload state data
+
+***** FUNCTIONS: GENERATION AND SYNCHRONIZATION ************************************
+
+float4 ranluxcl32(ranluxcl_state_t *rst)
+Run to generate a pseudo-random float4 where each component is a number between
+0 and 1, end points not included (meaning the number will never be exactly 0 or 1).
+
+double4 ranluxcl64(ranluxcl_state_t *rst)
+Double precision version of the above function. The preprocessor macro
+RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available.
+This function "glues" together two single-precision numbers to make one double
+precision number. Most of the work is still done in single precision, so the
+performance will be roughly halved regardless of the double precision performance
+of the hardware.
+
+float4 ranluxcl32norm(ranluxcl_state_t *rst)
+Run to generate a pseudo-random float4 where each component is normally distributed
+with mean 0 and standard deviation 1.
+
+double4 ranluxcl64norm(ranluxcl_state_t *rst)
+Double precision version of the above function. The preprocessor macro
+RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available.
+
+void ranluxcl_synchronize(ranluxcl_state_t *rst)
+Run to synchronize execution in case different work-items have made a different
+number of calls to ranluxcl. On SIMD machines this could lead to inefficient execution.
+ranluxcl_synchronize allows us to make sure all generators are SIMD-friendly again. Not
+needed if all work-items always call ranluxcl the same number of times.
+
+***** PERFORMANCE ******************************************************************
+
+For luxury setting 4, performance on AMD Cypress should be ~4.5*10^9 pseudorandom 
+values per second, when not downloading values to host memory (i.e. the values are 
+just generated, but not used for anything in particular).
+
+***** DESCRIPTION OF THE IMPLEMENTATION ********************************************
+
+This code closely follows the original Fortran 77 code (see credit section). Here
+the differences (and similarities) between RANLUXCL (this implementation) and the
+original RANLUX are discussed.
+
+The Fortran 77 implementation uses a simple LCG to initialize the generator, and
+so the same approach is taken here. If RANLUXCL is initialized with <ins> = 0 as
+seed, the first work-item behaves like the original RANLUX with seed equal 1, the
+second work-item as if with seed equal 2 and so on. If <ins> = 1 then the first
+work-item behaves like the original RANLUX with seed equal to <numWorkitems> + 1,
+and so on for higher <ins> so that we never have overlapping sequences. This is
+why the RANLUXCL_MAXWORKITEMS macro must be set if we have different NDRanges with
+a different number of work-items.
+
+RANLUX is based on chaos theory, and what we are actually doing when selecting
+a luxury value is setting how many values to skip over (causing decorrelation).
+The number of values to skip is controlled by the so-called p-value of the
+generator. After generating 24 values we skip p - 24 values until again generating
+24 values.
+
+This implementation is somewhat modified from the original fortran implementation 
+by F. James. Because of the way the OpenCL code is optimized with 4-component 
+32-bit float vectors, it is most convenient to always throw away some multiple 
+of 24 values (i.e. p is always a multiple of 24).
+
+However, there might be some resonances if we always throw away a multiple of
+the seeds table size. Therefore the implementation is slightly more intricate
+where p can be a multiple of 4 instead, at a cost to performance (only about 10%
+lower than the cleaner 24 values approach on AMD Cypress). These two approaches
+are termed planar and planar shift respectively. The idea for the planar approach
+comes from the following paper:
+Vadim Demchik, Pseudo-random number generators for Monte Carlo simulations on 
+Graphics Processing Units, arXiv:1003.1898v1 [hep-lat]
+
+Below the p-values for the original reference implementation are listed along with 
+those of the planar shift implementation. Suggested values for the planar approach 
+are also presented. When this function is called with RANLUXCL_LUX set to 0-4, the
+planar shift values are used. To use the pure planar approach (for some extra
+performance with likely undetectable quality decrease), set lux equal to the specific 
+p-value.
+
+Luxury setting (RANLUXCL_LUX):                   0   1   2   3   4
+Original fortran77 implementation by F. James:  24  48  97  223 389
+Planar (suggested):                             24  48  120 240 408
+Planar shift:                                   24  48  100 224 404
+
+Note that levels 0 and 1 are the same as in the original implementation for both
+planar and planar shift. Level 4 of planar shift where p=404 is the same as chosen 
+for luxury level 1 by Martin Luescher for his v3 version of RANLUX. Therefore if 
+it is considered important to only use "official" values, luxury settings 0, 1 or 
+4 of planar shift should be used. It is however unlikely that the other values are 
+bad, they just haven't been as extensively used and tested by others.
+
+Variable names are generally the same as in the fortran77 implementation, however 
+because of the way the generator is implemented, the i24 and j24 variables are 
+no longer needed.
+
+***** CREDIT ***********************************************************************
+
+I have been told by Fred James (the coder) that the original Fortran 77 
+implementation (which is the subject of the second paper below) is free to use and 
+share. Therefore I am using the MIT license (below). But most importantly please 
+always remember to give credit to the two articles by Martin Luscher and Fred James, 
+describing the generator and the Fortran 77 implementation on which this 
+implementation is based, respectively:
+
+Martin Luescher, A portable high-quality random number generator for lattice
+field theory simulations, Computer Physics Communications 79 (1994) 100-110
+
+F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom 
+number generator of Luescher, Computer Physics Communications 79 (1994) 111-114
+
+***** LICENSE **********************************************************************
+
+Copyright (c) 2011 Ivar Ursin Nikolaisen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+software and associated documentation files (the "Software"), to deal in the Software 
+without restriction, including without limitation the rights to use, copy, modify, 
+merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 
+permit persons to whom the Software is furnished to do so, subject to the following 
+conditions:
+
+The above copyright notice and this permission notice shall be included in all copies 
+or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 
+CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 
+OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+***************************************************************************************/
+
+typedef struct{ //Per-work-item generator state: 28 32-bit members (112 bytes)
+	float //The 24-entry seed table updated by ranluxcl_onestep
+		seed01, seed02, seed03, seed04,
+		seed05, seed06, seed07, seed08,
+		seed09, seed10, seed11, seed12,
+		seed13, seed14, seed15, seed16,
+		seed17, seed18, seed19, seed20,
+		seed21, seed22, seed23, seed24;
+	float carry; //Borrow for the next step (ranluxcl_onestep sets it to 0 or RANLUXCL_TWOM24)
+	float dummy; //Causes struct to be a multiple of 128 bits
+	int in24; //Values delivered since the last skip; only touched when RANLUXCL_PLANAR is not defined
+	int stepnr; //Position in the 24-step cycle (0, 4, ..., 20); selects the branch taken in ranluxcl32
+} ranluxcl_state_t;
+
+#define RANLUXCL_TWOM24 0.000000059604644775f //2^-24
+#define RANLUXCL_TWOM12 0.000244140625f //2^-12
+
+#ifdef RANLUXCL_LUX
+#if RANLUXCL_LUX < 0
+#error ranluxcl: lux must be zero or positive.
+#endif
+#else
+#define RANLUXCL_LUX 4 //Default to high quality
+#endif //RANLUXCL_LUX
+
+//Here the luxury values are defined: nskip is the number of values discarded after every 24 delivered (p - 24)
+#if RANLUXCL_LUX == 0
+#define RANLUXCL_NSKIP 0
+#elif RANLUXCL_LUX == 1
+#define RANLUXCL_NSKIP 24
+#elif RANLUXCL_LUX == 2
+#define RANLUXCL_NSKIP 76
+#elif RANLUXCL_LUX == 3
+#define RANLUXCL_NSKIP 200
+#elif RANLUXCL_LUX == 4
+#define RANLUXCL_NSKIP 380
+#else
+#define RANLUXCL_NSKIP (RANLUXCL_LUX - 24) //Any other value of RANLUXCL_LUX is taken as the p-value itself
+#endif //RANLUXCL_LUX == 0
+
+//Check that nskip is a permissible value (ranluxcl32 always advances four steps at a time)
+#if RANLUXCL_NSKIP % 4 != 0 
+#error nskip must be divisible by 4!
+#endif
+#if RANLUXCL_NSKIP < 24 && RANLUXCL_NSKIP != 0
+#error nskip must be either 0 or >= 24!
+#endif
+#if RANLUXCL_NSKIP < 0
+#error nskip is negative!
+#endif
+
+//Check if planar scheme is recovered (nskip is a multiple of 24, which includes 0)
+#if RANLUXCL_NSKIP % 24 == 0
+#define RANLUXCL_PLANAR
+#endif
+
+//Check if we will skip at all (when defined, the bulk-skip loop in ranluxcl32 is compiled out)
+#if RANLUXCL_NSKIP == 0
+#define RANLUXCL_NOSKIP
+#endif
+
+//Single-value global size and id: work-items linearized over three NDRange dimensions, dimension 0 varying fastest
+#define RANLUXCL_NUMWORKITEMS (get_global_size(0) * get_global_size(1) * get_global_size(2))
+#define RANLUXCL_MYID (get_global_id(0) + get_global_id(1) * get_global_size(0) + get_global_id(2) * get_global_size(0) * get_global_size(1))
+
+void ranluxcl_download_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab){
+	//Copy this work-item's entry of the global state table into *rst.
+	*rst = ranluxcltab[RANLUXCL_MYID];
+}
+
+void ranluxcl_upload_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab){
+	//Write *rst back to this work-item's entry of the global state table.
+	ranluxcltab[RANLUXCL_MYID] = *rst;
+}
+
+float ranluxcl_onestep(float sj24m1, float sj24, float *si24, float *carry){
+	//One subtract-with-borrow step: *si24 becomes sj24 - *si24 - *carry, plus 1 if that difference is negative.
+	float diff = sj24 - *si24 - *carry;
+	const int borrow = (diff < 0.0f);
+	if(borrow) diff += 1.0f;
+	*carry = borrow ? RANLUXCL_TWOM24 : 0.0f;
+	*si24 = diff;
+	//The stored value is also the output; below RANLUXCL_TWOM12 it is padded using sj24m1 and an exact 0 is replaced.
+	float result = diff;
+	if(diff < RANLUXCL_TWOM12){
+		result += RANLUXCL_TWOM24 * sj24m1;
+		if(result == 0.0f) result = RANLUXCL_TWOM24 * RANLUXCL_TWOM24;
+	}
+	return result;
+}
+
+float4 ranluxcl32(ranluxcl_state_t *rst)
+{
+	//ranluxcl32 returns a 4-component float vector where each component is uniformly distributed
+	//between 0-1, end points not included. Each call updates four seed table entries in *rst (more when a skip is triggered).
+
+	float4 out;
+
+	if((*rst).stepnr == 0){ //stepnr selects which four table entries this call updates
+		out.x = ranluxcl_onestep((*rst).seed09, (*rst).seed10, &((*rst).seed24), &((*rst).carry));
+		out.y = ranluxcl_onestep((*rst).seed08, (*rst).seed09, &((*rst).seed23), &((*rst).carry));
+		out.z = ranluxcl_onestep((*rst).seed07, (*rst).seed08, &((*rst).seed22), &((*rst).carry));
+		out.w = ranluxcl_onestep((*rst).seed06, (*rst).seed07, &((*rst).seed21), &((*rst).carry));
+		(*rst).stepnr += 4;
+	}
+
+	else if((*rst).stepnr == 4){
+		out.x = ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry));
+		out.y = ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry));
+		out.z = ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry));
+		out.w = ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry));
+		(*rst).stepnr += 4;
+	}
+
+	else if((*rst).stepnr == 8){
+		out.x = ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry));
+		out.y = ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry));
+		out.z = ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry));
+		out.w = ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry));
+		(*rst).stepnr += 4;
+	}
+
+	else if((*rst).stepnr == 12){
+		out.x = ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry));
+		out.y = ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry));
+		out.z = ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry));
+		out.w = ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry));
+		(*rst).stepnr += 4;
+	}
+
+	else if((*rst).stepnr == 16){
+		out.x = ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry));
+		out.y = ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry));
+		out.z = ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry));
+		out.w = ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry));
+		(*rst).stepnr += 4;
+	}
+
+	else if((*rst).stepnr == 20){
+		out.x = ranluxcl_onestep((*rst).seed13, (*rst).seed14, &((*rst).seed04), &((*rst).carry));
+		out.y = ranluxcl_onestep((*rst).seed12, (*rst).seed13, &((*rst).seed03), &((*rst).carry));
+		out.z = ranluxcl_onestep((*rst).seed11, (*rst).seed12, &((*rst).seed02), &((*rst).carry));
+		out.w = ranluxcl_onestep((*rst).seed10, (*rst).seed11, &((*rst).seed01), &((*rst).carry));
+		(*rst).stepnr = 0; //all 24 entries updated: wrap around
+
+//The below preprocessor directives are here to recover the simpler planar scheme when nskip is a multiple of 24.
+//For the most general planar shift approach, just ignore all #if's below.
+#ifndef RANLUXCL_PLANAR
+	}
+
+	(*&((*rst).in24)) += 4; //four more values delivered since the last skip
+	if((*&((*rst).in24)) == 24){
+		(*&((*rst).in24)) = 0;
+#endif //RANLUXCL_PLANAR
+
+		int initialskips = ((*rst).stepnr) ? (24 - (*rst).stepnr) : 0; //steps left in the current 24-cycle
+		int bulkskips = ((RANLUXCL_NSKIP - initialskips)/24) * 24; //whole 24-cycles
+		int remainingskips = RANLUXCL_NSKIP - initialskips - bulkskips; //steps of a final partial cycle
+
+//We know there won't be any initial skips in the planar scheme
+#ifndef RANLUXCL_PLANAR
+		//Do initial skips (lack of breaks in switch is intentional).
+		switch(initialskips){
+			case(20):
+				ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry));
+			case(16):
+				ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry));
+			case(12):
+				ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry));
+			case(8):
+				ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry));
+			case(4):
+				ranluxcl_onestep((*rst).seed13, (*rst).seed14, &((*rst).seed04), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed12, (*rst).seed13, &((*rst).seed03), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed11, (*rst).seed12, &((*rst).seed02), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed10, (*rst).seed11, &((*rst).seed01), &((*rst).carry));
+		}
+#endif //RANLUXCL_PLANAR
+
+//Also check if we will ever need to skip at all
+#ifndef RANLUXCL_NOSKIP
+		for(int i=0; i<bulkskips/24; i++){
+			ranluxcl_onestep((*rst).seed09, (*rst).seed10, &((*rst).seed24), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed08, (*rst).seed09, &((*rst).seed23), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed07, (*rst).seed08, &((*rst).seed22), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed06, (*rst).seed07, &((*rst).seed21), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed13, (*rst).seed14, &((*rst).seed04), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed12, (*rst).seed13, &((*rst).seed03), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed11, (*rst).seed12, &((*rst).seed02), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed10, (*rst).seed11, &((*rst).seed01), &((*rst).carry));
+		}
+#endif //RANLUXCL_NOSKIP
+
+//There also won't be any remaining skips in the planar scheme
+#ifndef RANLUXCL_PLANAR
+		//Do remaining skips
+		if(remainingskips){
+			ranluxcl_onestep((*rst).seed09, (*rst).seed10, &((*rst).seed24), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed08, (*rst).seed09, &((*rst).seed23), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed07, (*rst).seed08, &((*rst).seed22), &((*rst).carry));
+			ranluxcl_onestep((*rst).seed06, (*rst).seed07, &((*rst).seed21), &((*rst).carry));
+
+			if(remainingskips > 4){
+				ranluxcl_onestep((*rst).seed05, (*rst).seed06, &((*rst).seed20), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed04, (*rst).seed05, &((*rst).seed19), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed03, (*rst).seed04, &((*rst).seed18), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed02, (*rst).seed03, &((*rst).seed17), &((*rst).carry));
+			}
+
+			if(remainingskips > 8){
+				ranluxcl_onestep((*rst).seed01, (*rst).seed02, &((*rst).seed16), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed24, (*rst).seed01, &((*rst).seed15), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed23, (*rst).seed24, &((*rst).seed14), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed22, (*rst).seed23, &((*rst).seed13), &((*rst).carry));
+			}
+
+			if(remainingskips > 12){
+				ranluxcl_onestep((*rst).seed21, (*rst).seed22, &((*rst).seed12), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed20, (*rst).seed21, &((*rst).seed11), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed19, (*rst).seed20, &((*rst).seed10), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed18, (*rst).seed19, &((*rst).seed09), &((*rst).carry));
+			}
+
+			if(remainingskips > 16){
+				ranluxcl_onestep((*rst).seed17, (*rst).seed18, &((*rst).seed08), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed16, (*rst).seed17, &((*rst).seed07), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed15, (*rst).seed16, &((*rst).seed06), &((*rst).carry));
+				ranluxcl_onestep((*rst).seed14, (*rst).seed15, &((*rst).seed05), &((*rst).carry));
+			}
+		}
+#endif //RANLUXCL_PLANAR
+
+		//Initial skips brought stepnr down to 0. The bulk skips did only full cycles.
+		//Therefore stepnr is now equal to remainingskips.
+		(*rst).stepnr = remainingskips;
+	}
+
+	return out;
+}
+
+void ranluxcl_synchronize(ranluxcl_state_t *rst){
+	//Advances the generator by calling ranluxcl32 and discarding the values, so
+	//that it moves on to the start of a 24-value cycle. Work-items that have
+	//called ranluxcl32 a different number of times sit at different stepnr
+	//positions, take different branches inside ranluxcl32 and therefore run
+	//inefficiently on SIMD architectures like GPUs; calling this function in
+	//every work-item brings their execution back in step.
+	//Nothing is done if stepnr is already 0.
+	//
+	//The positions 4, 8, ..., 20 are tested once each in ascending order, so at
+	//most five calls to ranluxcl32 are made.
+	//NOTE(review): in the planar shift scheme a call can trigger a skip that sets
+	//stepnr to remainingskips, so stepnr is not guaranteed to be 0 on return -
+	//confirm that this is acceptable.
+
+	for(int pos = 4; pos <= 20; pos += 4){
+		if(rst->stepnr == pos)
+			ranluxcl32(rst);
+	}
+}
+
+void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
+{
+	ranluxcl_state_t rst;
+
+	#ifdef RANLUXCL_USE_LEGACY_INITIALIZATION
+	//Using legacy initialization from original Fortan 77 implementation
+
+	//ins is scaled so that if the user makes another call somewhere else
+	//with ins + 1 there should be no overlap. Also adding one
+	//allows us to use ins = 0.
+	int k, maxWorkitems;
+
+	#ifdef RANLUXCL_MAXWORKITEMS
+	maxWorkitems = RANLUXCL_MAXWORKITEMS; //User-supplied bound: largest work-item count of any parallel NDRange
+	#else
+	maxWorkitems = RANLUXCL_NUMWORKITEMS; //Default: the current global size
+	#endif //RANLUXCL_MAXWORKITEMS
+
+	int scaledins = ins * maxWorkitems + 1;
+
+	int js = scaledins + RANLUXCL_MYID;
+
+	//Make sure js is not too small (should really be an error)
+	if(js < 1)
+		js = 1;
+
+	#define IC 2147483563
+	#define ITWO24 16777216
+
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed01=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed02=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed03=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed04=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed05=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed06=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed07=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed08=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed09=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed10=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed11=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed12=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed13=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed14=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed15=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed16=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed17=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed18=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed19=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed20=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed21=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed22=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed23=(js%ITWO24)*RANLUXCL_TWOM24;
+	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.seed24=(js%ITWO24)*RANLUXCL_TWOM24;
+
+	#undef IC
+	#undef ITWO24
+
+	#else //RANLUXCL_USE_LEGACY_INITIALIZATION
+	//Using default initialization
+
+	#define RANLUXCL_POW2_24 16777216
+	#define RANLUXCL_56 0x00FFFFFFFFFFFFFF
+	#define RANLUXCL_48 0x0000FFFFFFFFFFFF
+	#define RANLUXCL_40 0x000000FFFFFFFFFF
+	#define RANLUXCL_32 0x00000000FFFFFFFF
+	#define RANLUXCL_24 0x0000000000FFFFFF
+	#define RANLUXCL_16 0x000000000000FFFF
+	#define RANLUXCL_8  0x00000000000000FF
+
+	//We scale ins by (2^32)-1. As long as we never use more than (2^32)-1 work-items per
+	//NDRange the initial states should never be the same. We use a simple 64-bit LCG from
+	//the SPRNG library. We choose a single prime b, while in SPRNG for parallel applications
+	//different b give rise to different sequences in parallel (a feature not used here).
+	ulong x1, x2, x3;
+	ulong x = (ulong)RANLUXCL_MYID + (ulong)ins * 4294967295;
+	ulong a = 2862933555777941757;
+	ulong b = 3037000493;
+
+	//Logical shifts used so that all 64 bits of LCG output are used (24 bits per float),
+	//to be certain that all initial states are different.
+	x1 = x = ((a * x + b) & ULONG_MAX);
+	x2 = x = ((a * x + b) & ULONG_MAX);
+	x3 = x = ((a * x + b) & ULONG_MAX);
+	rst.seed01 = (float)  (x1 >> 40)                              / (float)RANLUXCL_POW2_24;
+	rst.seed02 = (float) ((x1 & RANLUXCL_40) >> 16)               / (float)RANLUXCL_POW2_24;
+	rst.seed03 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56))  / (float)RANLUXCL_POW2_24;
+	rst.seed04 = (float) ((x2 & RANLUXCL_56) >> 32)               / (float)RANLUXCL_POW2_24;
+	rst.seed05 = (float) ((x2 & RANLUXCL_32) >> 8)                / (float)RANLUXCL_POW2_24;
+	rst.seed06 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24;
+	rst.seed07 = (float) ((x3 & RANLUXCL_48) >> 24)               / (float)RANLUXCL_POW2_24;
+	rst.seed08 = (float)  (x3 & RANLUXCL_24)                      / (float)RANLUXCL_POW2_24;
+
+	x1 = x = ((a * x + b) & ULONG_MAX);
+	x2 = x = ((a * x + b) & ULONG_MAX);
+	x3 = x = ((a * x + b) & ULONG_MAX);
+	rst.seed09 = (float)  (x1 >> 40)                              / (float)RANLUXCL_POW2_24;
+	rst.seed10 = (float) ((x1 & RANLUXCL_40) >> 16)               / (float)RANLUXCL_POW2_24;
+	rst.seed11 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56))  / (float)RANLUXCL_POW2_24;
+	rst.seed12 = (float) ((x2 & RANLUXCL_56) >> 32)               / (float)RANLUXCL_POW2_24;
+	rst.seed13 = (float) ((x2 & RANLUXCL_32) >> 8)                / (float)RANLUXCL_POW2_24;
+	rst.seed14 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24;
+	rst.seed15 = (float) ((x3 & RANLUXCL_48) >> 24)               / (float)RANLUXCL_POW2_24;
+	rst.seed16 = (float)  (x3 & RANLUXCL_24)                      / (float)RANLUXCL_POW2_24;
+
+	x1 = x = ((a * x + b) & ULONG_MAX);
+	x2 = x = ((a * x + b) & ULONG_MAX);
+	x3 = x = ((a * x + b) & ULONG_MAX);
+	rst.seed17 = (float)  (x1 >> 40)                              / (float)RANLUXCL_POW2_24;
+	rst.seed18 = (float) ((x1 & RANLUXCL_40) >> 16)               / (float)RANLUXCL_POW2_24;
+	rst.seed19 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56))  / (float)RANLUXCL_POW2_24;
+	rst.seed20 = (float) ((x2 & RANLUXCL_56) >> 32)               / (float)RANLUXCL_POW2_24;
+	rst.seed21 = (float) ((x2 & RANLUXCL_32) >> 8)                / (float)RANLUXCL_POW2_24;
+	rst.seed22 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24;
+	rst.seed23 = (float) ((x3 & RANLUXCL_48) >> 24)               / (float)RANLUXCL_POW2_24;
+	rst.seed24 = (float)  (x3 & RANLUXCL_24)                      / (float)RANLUXCL_POW2_24;
+
+	#undef RANLUXCL_POW2_24
+	#undef RANLUXCL_56
+	#undef RANLUXCL_48
+	#undef RANLUXCL_40
+	#undef RANLUXCL_32
+	#undef RANLUXCL_24
+	#undef RANLUXCL_16
+	#undef RANLUXCL_8
+
+	#endif //RANLUXCL_USE_LEGACY_INITIALIZATION
+
+	rst.in24 = 0;
+	rst.stepnr = 0;
+	rst.carry = 0.0f;
+	if(rst.seed24 == 0.0f)
+		rst.carry = RANLUXCL_TWOM24;
+
+	#ifndef RANLUXCL_NO_WARMUP
+	//Warming up the generator, ensuring there are no initial correlations.
+	//16 is a "magic number". It is the number of times we must generate
+	//a batch of 24 numbers to ensure complete decorrelation.
+	for(int i=0; i<16; i++){
+		ranluxcl_onestep(rst.seed09, rst.seed10, &(rst.seed24), &(rst.carry));
+		ranluxcl_onestep(rst.seed08, rst.seed09, &(rst.seed23), &(rst.carry));
+		ranluxcl_onestep(rst.seed07, rst.seed08, &(rst.seed22), &(rst.carry));
+		ranluxcl_onestep(rst.seed06, rst.seed07, &(rst.seed21), &(rst.carry));
+		ranluxcl_onestep(rst.seed05, rst.seed06, &(rst.seed20), &(rst.carry));
+		ranluxcl_onestep(rst.seed04, rst.seed05, &(rst.seed19), &(rst.carry));
+		ranluxcl_onestep(rst.seed03, rst.seed04, &(rst.seed18), &(rst.carry));
+		ranluxcl_onestep(rst.seed02, rst.seed03, &(rst.seed17), &(rst.carry));
+		ranluxcl_onestep(rst.seed01, rst.seed02, &(rst.seed16), &(rst.carry));
+		ranluxcl_onestep(rst.seed24, rst.seed01, &(rst.seed15), &(rst.carry));
+		ranluxcl_onestep(rst.seed23, rst.seed24, &(rst.seed14), &(rst.carry));
+		ranluxcl_onestep(rst.seed22, rst.seed23, &(rst.seed13), &(rst.carry));
+		ranluxcl_onestep(rst.seed21, rst.seed22, &(rst.seed12), &(rst.carry));
+		ranluxcl_onestep(rst.seed20, rst.seed21, &(rst.seed11), &(rst.carry));
+		ranluxcl_onestep(rst.seed19, rst.seed20, &(rst.seed10), &(rst.carry));
+		ranluxcl_onestep(rst.seed18, rst.seed19, &(rst.seed09), &(rst.carry));
+		ranluxcl_onestep(rst.seed17, rst.seed18, &(rst.seed08), &(rst.carry));
+		ranluxcl_onestep(rst.seed16, rst.seed17, &(rst.seed07), &(rst.carry));
+		ranluxcl_onestep(rst.seed15, rst.seed16, &(rst.seed06), &(rst.carry));
+		ranluxcl_onestep(rst.seed14, rst.seed15, &(rst.seed05), &(rst.carry));
+		ranluxcl_onestep(rst.seed13, rst.seed14, &(rst.seed04), &(rst.carry));
+		ranluxcl_onestep(rst.seed12, rst.seed13, &(rst.seed03), &(rst.carry));
+		ranluxcl_onestep(rst.seed11, rst.seed12, &(rst.seed02), &(rst.carry));
+		ranluxcl_onestep(rst.seed10, rst.seed11, &(rst.seed01), &(rst.carry));
+	}
+	#endif //RANLUXCL_NO_WARMUP
+
+	//Upload the state
+	ranluxcl_upload_seed(&rst, ranluxcltab);
+}
+
+float4 ranluxcl32norm(ranluxcl_state_t *rst)
+{
+	//Box-Muller transform: turns the four uniform numbers from one
+	//ranluxcl32 call into four normally distributed numbers (mean 0,
+	//standard deviation 1). Each pair of uniforms supplies a radius and
+	//an angle; the cosine/sine projections give two normal values.
+	//M_PI_F is the single precision pi constant of OpenCL C.
+	const float4 uniform = ranluxcl32(rst);
+	float4 normal;
+
+	//First pair: uniforms (x, y) produce normals (x, y).
+	const float radiusXY = sqrt(-2 * log(uniform.x));
+	const float angleXY = 2 * M_PI_F * uniform.y;
+	normal.x = radiusXY * cos(angleXY);
+	normal.y = radiusXY * sin(angleXY);
+
+	//Second pair: uniforms (z, w) produce normals (z, w).
+	const float radiusZW = sqrt(-2 * log(uniform.z));
+	const float angleZW = 2 * M_PI_F * uniform.w;
+	normal.z = radiusZW * cos(angleZW);
+	normal.w = radiusZW * sin(angleZW);
+
+	return normal;
+}
+
+#ifdef RANLUXCL_SUPPORT_DOUBLE
+double4 ranluxcl64(ranluxcl_state_t *rst)
+{
+	//Produces four doubles, each assembled from two floats of ranluxcl32:
+	//one float forms the high part and a second one, divided by 2^24
+	//(16777216), the low part. ranluxcl32 is called twice per invocation.
+	//A high part equal to RANLUXCL_TWOM24^2 is reset to zero first: that
+	//value stems from the never-zero part of the original algorithm, but
+	//zero must be allowed for the most significant bits of the result.
+
+	const float neverZero = RANLUXCL_TWOM24 * RANLUXCL_TWOM24;
+	double4 result;
+
+	//Components x and y come from the first batch of four floats.
+	float4 batch = ranluxcl32(rst);
+	batch.x = (batch.x == neverZero) ? 0.0f : batch.x;
+	batch.z = (batch.z == neverZero) ? 0.0f : batch.z;
+	result.x = (double)(batch.x) + (double)(batch.y) / 16777216;
+	result.y = (double)(batch.z) + (double)(batch.w) / 16777216;
+
+	//Components z and w come from the second batch of four floats.
+	batch = ranluxcl32(rst);
+	batch.x = (batch.x == neverZero) ? 0.0f : batch.x;
+	batch.z = (batch.z == neverZero) ? 0.0f : batch.z;
+	result.z = (double)(batch.x) + (double)(batch.y) / 16777216;
+	result.w = (double)(batch.z) + (double)(batch.w) / 16777216;
+
+	return result;
+}
+
+double4 ranluxcl64norm(ranluxcl_state_t *rst)
+{
+	//Double precision Box-Muller transform: maps the four uniform
+	//doubles from one ranluxcl64 call onto four normally distributed
+	//doubles (mean 0, standard deviation 1), two per uniform pair.
+	//M_PI is the double precision pi constant.
+	const double4 uniform = ranluxcl64(rst);
+	double4 normal;
+
+	//First pair: uniforms (x, y) produce normals (x, y).
+	const double radiusXY = sqrt(-2 * log(uniform.x));
+	const double angleXY = 2 * M_PI * uniform.y;
+	normal.x = radiusXY * cos(angleXY);
+	normal.y = radiusXY * sin(angleXY);
+
+	//Second pair: uniforms (z, w) produce normals (z, w).
+	const double radiusZW = sqrt(-2 * log(uniform.z));
+	const double angleZW = 2 * M_PI * uniform.w;
+	normal.z = radiusZW * cos(angleZW);
+	normal.w = radiusZW * sin(angleZW);
+
+	return normal;
+}
+#endif //RANLUXCL_SUPPORT_DOUBLE
+
+#undef RANLUXCL_TWOM24 //Cleanup: this and the following helper macros are undefined from here on
+#undef RANLUXCL_TWOM12
+#undef RANLUXCL_NUMWORKITEMS
+#undef RANLUXCL_MYID
+#undef RANLUXCL_PLANAR
+#undef RANLUXCL_NOSKIP
+
+#endif //RANLUXCL_CL
diff --git a/test/test_array.py b/test/test_array.py
index 9854afdc3760e00f17b99bd70de7ed7330a054ab..2a939cedeaaa0a687d5da7329126c61c6f604a36 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -226,18 +226,39 @@ def test_random(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
-    from pyopencl.clrandom import rand as clrand
+    from pyopencl.clrandom import RanluxGenerator
 
     if has_double_support(context.devices[0]):
         dtypes = [np.float32, np.float64]
     else:
         dtypes = [np.float32]
 
+    gen = RanluxGenerator(queue, 5120)
+
     for dtype in dtypes:
-        a = clrand(context, queue, (10, 100), dtype=dtype).get()
+        ran = gen.uniform(queue, (10007,), dtype)
+        assert (0 < ran.get()).all()
+        assert (ran.get() < 1).all()
+
+        gen.synchronize(queue)
+
+        ran = gen.uniform(queue, (10007,), dtype, a=4, b=7)
+        assert (4 < ran.get()).all()
+        assert (ran.get() < 7).all()
+
+        ran = gen.normal(queue, (10007,), dtype, mu=4, sigma=3)
+
+    dtypes = [np.int32]
+    for dtype in dtypes:
+        ran = gen.uniform(queue, (10000007,), dtype, a=200, b=300)
+        assert (200 <= ran.get()).all()
+        assert (ran.get() < 300).all()
+        #from matplotlib import pyplot as pt
+        #pt.hist(ran.get())
+        #pt.show()
+
+
 
-        assert (0 <= a).all()
-        assert (a < 1).all()
 
 
 @pytools.test.mark_test.opencl