diff --git a/LICENSE b/LICENSE index a273dcc5e460c2bd6bc9d9aa4c5709b930712029..a830ab0fde039982bfc6262f36f0539d566fc8a4 100644 --- a/LICENSE +++ b/LICENSE @@ -47,6 +47,38 @@ implementation). These parts are licensed as follows: with software licensed exclusively under GPL2. (Most software is licensed as GPL2 or later, in which case this is not an issue.) +PyOpenCL includes parts of the Random123 suite of random number generators: + + Copyright 2010-2012, D. E. Shaw Research. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ PyOpenCL includes the RANLUXCL random number generator: Copyright (c) 2011 Ivar Ursin Nikolaisen diff --git a/doc/array.rst b/doc/array.rst index 8c3256eb983bb9e78f39f3ee4a6128a59869c900..4f4b7ef66f5b050c8455d3d19ebc9b6c693bcbe6 100644 --- a/doc/array.rst +++ b/doc/array.rst @@ -292,14 +292,3 @@ Generating Arrays of Random Numbers ----------------------------------- .. automodule:: pyopencl.clrandom - - .. autoclass:: RanluxGenerator - - .. automethod:: fill_uniform - .. automethod:: uniform - .. automethod:: fill_normal - .. automethod:: normal - .. automethod:: synchronize - - .. autofunction:: rand - .. autofunction:: fill_rand diff --git a/doc/misc.rst b/doc/misc.rst index d6f3b530a02a871f475830dd88f968000a3e3977..eef15c1cf6b2ddab34303179bc73c9e19af4c3c7 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -118,6 +118,10 @@ Version 2016.2 This version is currently under development. You can get snapshots from PyOpenCL's `git repository `_ +* Deprecate RANLUXCL. It will be removed in the 2018.x series of PyOpenCL. +* Introduce Random123 random number generators. See :mod:`pyopencl.clrandom` + for more information. + Version 2016.1 -------------- diff --git a/pyopencl/cl/pyopencl-random123/array.h b/pyopencl/cl/pyopencl-random123/array.h new file mode 100644 index 0000000000000000000000000000000000000000..8af5a4f4520686e0bec582e80c901257ec287bad --- /dev/null +++ b/pyopencl/cl/pyopencl-random123/array.h @@ -0,0 +1,325 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _r123array_dot_h__ +#define _r123array_dot_h__ +#include "openclfeatures.h" + +#ifndef __cplusplus +#define CXXMETHODS(_N, W, T) +#define CXXOVERLOADS(_N, W, T) +#else + +#include +#include +#include +#include +#include +#include + +/** @defgroup arrayNxW The r123arrayNxW classes + + Each of the r123arrayNxW is a fixed size array of N W-bit unsigned integers. + It is functionally equivalent to the C++0x std::array, + but does not require C++0x features or libraries. + + In addition to meeting most of the requirements of a Container, + it also has a member function, incr(), which increments the zero-th + element and carrys overflows into higher indexed elements. Thus, + by using incr(), sequences of up to 2^(N*W) distinct values + can be produced. 
+ + If SSE is supported by the compiler, then the class + r123array1xm128i is also defined, in which the data member is an + array of one r123128i object. + + @cond HIDDEN_FROM_DOXYGEN +*/ + +template +inline R123_CUDA_DEVICE value_type assemble_from_u32(uint32_t *p32){ + value_type v=0; + for(size_t i=0; i<(3+sizeof(value_type))/4; ++i) + v |= ((value_type)(*p32++)) << (32*i); + return v; +} + +// Work-alike methods and typedefs modeled on std::array: +#define CXXMETHODS(_N, W, T) \ + typedef T value_type; \ + typedef T* iterator; \ + typedef const T* const_iterator; \ + typedef value_type& reference; \ + typedef const value_type& const_reference; \ + typedef size_t size_type; \ + typedef ptrdiff_t difference_type; \ + typedef T* pointer; \ + typedef const T* const_pointer; \ + typedef std::reverse_iterator reverse_iterator; \ + typedef std::reverse_iterator const_reverse_iterator; \ + /* Boost.array has static_size. C++11 specializes tuple_size */ \ + enum {static_size = _N}; \ + R123_CUDA_DEVICE reference operator[](size_type i){return v[i];} \ + R123_CUDA_DEVICE const_reference operator[](size_type i) const {return v[i];} \ + R123_CUDA_DEVICE reference at(size_type i){ if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \ + R123_CUDA_DEVICE const_reference at(size_type i) const { if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \ + R123_CUDA_DEVICE size_type size() const { return _N; } \ + R123_CUDA_DEVICE size_type max_size() const { return _N; } \ + R123_CUDA_DEVICE bool empty() const { return _N==0; }; \ + R123_CUDA_DEVICE iterator begin() { return &v[0]; } \ + R123_CUDA_DEVICE iterator end() { return &v[_N]; } \ + R123_CUDA_DEVICE const_iterator begin() const { return &v[0]; } \ + R123_CUDA_DEVICE const_iterator end() const { return &v[_N]; } \ + R123_CUDA_DEVICE const_iterator cbegin() const { return &v[0]; } \ + R123_CUDA_DEVICE const_iterator cend() const { return &v[_N]; 
} \ + R123_CUDA_DEVICE reverse_iterator rbegin(){ return reverse_iterator(end()); } \ + R123_CUDA_DEVICE const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } \ + R123_CUDA_DEVICE reverse_iterator rend(){ return reverse_iterator(begin()); } \ + R123_CUDA_DEVICE const_reverse_iterator rend() const{ return const_reverse_iterator(begin()); } \ + R123_CUDA_DEVICE const_reverse_iterator crbegin() const{ return const_reverse_iterator(cend()); } \ + R123_CUDA_DEVICE const_reverse_iterator crend() const{ return const_reverse_iterator(cbegin()); } \ + R123_CUDA_DEVICE pointer data(){ return &v[0]; } \ + R123_CUDA_DEVICE const_pointer data() const{ return &v[0]; } \ + R123_CUDA_DEVICE reference front(){ return v[0]; } \ + R123_CUDA_DEVICE const_reference front() const{ return v[0]; } \ + R123_CUDA_DEVICE reference back(){ return v[_N-1]; } \ + R123_CUDA_DEVICE const_reference back() const{ return v[_N-1]; } \ + R123_CUDA_DEVICE bool operator==(const r123array##_N##x##W& rhs) const{ \ + /* CUDA3 does not have std::equal */ \ + for (size_t i = 0; i < _N; ++i) \ + if (v[i] != rhs.v[i]) return false; \ + return true; \ + } \ + R123_CUDA_DEVICE bool operator!=(const r123array##_N##x##W& rhs) const{ return !(*this == rhs); } \ + /* CUDA3 does not have std::fill_n */ \ + R123_CUDA_DEVICE void fill(const value_type& val){ for (size_t i = 0; i < _N; ++i) v[i] = val; } \ + R123_CUDA_DEVICE void swap(r123array##_N##x##W& rhs){ \ + /* CUDA3 does not have std::swap_ranges */ \ + for (size_t i = 0; i < _N; ++i) { \ + T tmp = v[i]; \ + v[i] = rhs.v[i]; \ + rhs.v[i] = tmp; \ + } \ + } \ + R123_CUDA_DEVICE r123array##_N##x##W& incr(R123_ULONG_LONG n=1){ \ + /* This test is tricky because we're trying to avoid spurious \ + complaints about illegal shifts, yet still be compile-time \ + evaulated. 
*/ \ + if(sizeof(T)>((sizeof(T)3?3:0] is to silence \ + a spurious error from icpc \ + */ \ + ++v[_N>1?1:0]; \ + if(_N==2 || R123_BUILTIN_EXPECT(!!v[_N>1?1:0], 1)) return *this; \ + ++v[_N>2?2:0]; \ + if(_N==3 || R123_BUILTIN_EXPECT(!!v[_N>2?2:0], 1)) return *this; \ + ++v[_N>3?3:0]; \ + for(size_t i=4; i<_N; ++i){ \ + if( R123_BUILTIN_EXPECT(!!v[i-1], 1) ) return *this; \ + ++v[i]; \ + } \ + return *this; \ + } \ + /* seed(SeedSeq) would be a constructor if having a constructor */ \ + /* didn't cause headaches with defaults */ \ + template \ + R123_CUDA_DEVICE static r123array##_N##x##W seed(SeedSeq &ss){ \ + r123array##_N##x##W ret; \ + const size_t Ngen = _N*((3+sizeof(value_type))/4); \ + uint32_t u32[Ngen]; \ + uint32_t *p32 = &u32[0]; \ + ss.generate(&u32[0], &u32[Ngen]); \ + for(size_t i=0; i<_N; ++i){ \ + ret.v[i] = assemble_from_u32(p32); \ + p32 += (3+sizeof(value_type))/4; \ + } \ + return ret; \ + } \ +protected: \ + R123_CUDA_DEVICE r123array##_N##x##W& incr_carefully(R123_ULONG_LONG n){ \ + /* n may be greater than the maximum value of a single value_type */ \ + value_type vtn; \ + vtn = n; \ + v[0] += n; \ + const unsigned rshift = 8* ((sizeof(n)>sizeof(value_type))? sizeof(value_type) : 0); \ + for(size_t i=1; i<_N; ++i){ \ + if(rshift){ \ + n >>= rshift; \ + }else{ \ + n=0; \ + } \ + if( v[i-1] < vtn ) \ + ++n; \ + if( n==0 ) break; \ + vtn = n; \ + v[i] += n; \ + } \ + return *this; \ + } \ + + +// There are several tricky considerations for the insertion and extraction +// operators: +// - we would like to be able to print r123array16x8 as a sequence of 16 integers, +// not as 16 bytes. +// - we would like to be able to print r123array1xm128i. +// - we do not want an int conversion operator in r123m128i because it causes +// lots of ambiguity problems with automatic promotions. 
+// Solution: r123arrayinsertable and r123arrayextractable + +template +struct r123arrayinsertable{ + const T& v; + r123arrayinsertable(const T& t_) : v(t_) {} + friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable& t){ + return os << t.v; + } +}; + +template<> +struct r123arrayinsertable{ + const uint8_t& v; + r123arrayinsertable(const uint8_t& t_) : v(t_) {} + friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable& t){ + return os << (int)t.v; + } +}; + +template +struct r123arrayextractable{ + T& v; + r123arrayextractable(T& t_) : v(t_) {} + friend std::istream& operator>>(std::istream& is, r123arrayextractable& t){ + return is >> t.v; + } +}; + +template<> +struct r123arrayextractable{ + uint8_t& v; + r123arrayextractable(uint8_t& t_) : v(t_) {} + friend std::istream& operator>>(std::istream& is, r123arrayextractable& t){ + int i; + is >> i; + t.v = i; + return is; + } +}; + +#define CXXOVERLOADS(_N, W, T) \ + \ +inline std::ostream& operator<<(std::ostream& os, const r123array##_N##x##W& a){ \ + os << r123arrayinsertable(a.v[0]); \ + for(size_t i=1; i<_N; ++i) \ + os << " " << r123arrayinsertable(a.v[i]); \ + return os; \ +} \ + \ +inline std::istream& operator>>(std::istream& is, r123array##_N##x##W& a){ \ + for(size_t i=0; i<_N; ++i){ \ + r123arrayextractable x(a.v[i]); \ + is >> x; \ + } \ + return is; \ +} \ + \ +namespace r123{ \ + typedef r123array##_N##x##W Array##_N##x##W; \ +} + +#endif /* __cplusplus */ + +/* _r123array_tpl expands to a declaration of struct r123arrayNxW. + + In C, it's nothing more than a struct containing an array of N + objects of type T. + + In C++ it's the same, but endowed with an assortment of member + functions, typedefs and friends. In C++, r123arrayNxW looks a lot + like std::array, has most of the capabilities of a container, + and satisfies the requirements outlined in compat/Engine.hpp for + counter and key types. 
ArrayNxW, in the r123 namespace is + a typedef equivalent to r123arrayNxW. +*/ + +#define _r123array_tpl(_N, W, T) \ + /** @ingroup arrayNxW */ \ + /** @see arrayNxW */ \ +struct r123array##_N##x##W{ \ + T v[_N]; \ + CXXMETHODS(_N, W, T) \ +}; \ + \ +CXXOVERLOADS(_N, W, T) + +/** @endcond */ + +_r123array_tpl(1, 32, uint32_t) /* r123array1x32 */ +_r123array_tpl(2, 32, uint32_t) /* r123array2x32 */ +_r123array_tpl(4, 32, uint32_t) /* r123array4x32 */ +_r123array_tpl(8, 32, uint32_t) /* r123array8x32 */ + +_r123array_tpl(1, 64, uint64_t) /* r123array1x64 */ +_r123array_tpl(2, 64, uint64_t) /* r123array2x64 */ +_r123array_tpl(4, 64, uint64_t) /* r123array4x64 */ + +_r123array_tpl(16, 8, uint8_t) /* r123array16x8 for ARSsw, AESsw */ + +#if R123_USE_SSE +_r123array_tpl(1, m128i, r123m128i) /* r123array1x128i for ARSni, AESni */ +#endif + +/* In C++, it's natural to use sizeof(a::value_type), but in C it's + pretty convoluted to figure out the width of the value_type of an + r123arrayNxW: +*/ +#define R123_W(a) (8*sizeof(((a *)0)->v[0])) + +/** @namespace r123 + Most of the Random123 C++ API is contained in the r123 namespace. +*/ + +#endif + diff --git a/pyopencl/cl/pyopencl-random123/openclfeatures.h b/pyopencl/cl/pyopencl-random123/openclfeatures.h new file mode 100644 index 0000000000000000000000000000000000000000..af03d3092318c6c27f1a65ce8104c1609b1e66e1 --- /dev/null +++ b/pyopencl/cl/pyopencl-random123/openclfeatures.h @@ -0,0 +1,89 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef __openclfeatures_dot_hpp +#define __openclfeatures_dot_hpp + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#define R123_ASSERT(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 1 +#endif + +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +// XXX ATI APP SDK 2.4 clBuildProgram SEGVs if one uses uint64_t instead of +// ulong to mul_hi. And gets lots of complaints from stdint.h +// on some machines. +// But these typedefs mean we cannot include stdint.h with +// these headers? Do we need R123_64T, R123_32T, R123_8T? +typedef ulong uint64_t; +typedef uint uint32_t; +typedef uchar uint8_t; +#define UINT64_C(x) ((ulong)(x##UL)) + +#endif diff --git a/pyopencl/cl/pyopencl-random123/philox.cl b/pyopencl/cl/pyopencl-random123/philox.cl new file mode 100644 index 0000000000000000000000000000000000000000..c6fe4464bf17cc3b20cb346e77d9ac73ae650275 --- /dev/null +++ b/pyopencl/cl/pyopencl-random123/philox.cl @@ -0,0 +1,486 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _philox_dot_h_ +#define _philox_dot_h_ + +/** \cond HIDDEN_FROM_DOXYGEN */ + +#include "openclfeatures.h" +#include "array.h" + + +/* +// Macros _Foo_tpl are code generation 'templates' They define +// inline functions with names obtained by mangling Foo and the +// macro arguments. E.g., +// _mulhilo_tpl(32, uint32_t, uint64_t) +// expands to a definition of: +// mulhilo32(uint32_t, uint32_t, uint32_t *, uint32_t *) +// We then 'instantiate the template' to define +// several different functions, e.g., +// mulhilo32 +// mulhilo64 +// These functions will be visible to user code, and may +// also be used later in subsequent templates and definitions. + +// A template for mulhilo using a temporary of twice the word-width. 
+// Gcc figures out that this can be reduced to a single 'mul' instruction, +// despite the apparent use of double-wide variables, shifts, etc. It's +// obviously not guaranteed that all compilers will be that smart, so +// other implementations might be preferable, e.g., using an intrinsic +// or an asm block. On the other hand, for 32-bit multiplies, +// this *is* perfectly standard C99 - any C99 compiler should +// understand it and produce correct code. For 64-bit multiplies, +// it's only usable if the compiler recognizes that it can do +// arithmetic on a 128-bit type. That happens to be true for gcc on +// x86-64, and powerpc64 but not much else. +*/ +#define _mulhilo_dword_tpl(W, Word, Dword) \ +R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \ + Dword product = ((Dword)a)*((Dword)b); \ + *hip = product>>W; \ + return (Word)product; \ +} + +/* +// A template for mulhilo using gnu-style asm syntax. +// INSN can be "mulw", "mull" or "mulq". +// FIXME - porting to other architectures, we'll need still-more conditional +// branching here. Note that intrinsics are usually preferable. +*/ +#ifdef __powerpc__ +#define _mulhilo_asm_tpl(W, Word, INSN) \ +R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \ + Word dx = 0; \ + __asm__("\n\t" \ + INSN " %0,%1,%2\n\t" \ + : "=r"(dx) \ + : "r"(b), "r"(ax) \ + ); \ + *hip = dx; \ + return ax*b; \ +} +#else +#define _mulhilo_asm_tpl(W, Word, INSN) \ +R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \ + Word dx; \ + __asm__("\n\t" \ + INSN " %2\n\t" \ + : "=a"(ax), "=d"(dx) \ + : "r"(b), "0"(ax) \ + ); \ + *hip = dx; \ + return ax; \ +} +#endif /* __powerpc__ */ + +/* +// A template for mulhilo using MSVC-style intrinsics +// For example,_umul128 is an msvc intrinsic, c.f. 
+// http://msdn.microsoft.com/en-us/library/3dayytw9.aspx +*/ +#define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN) \ +R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \ + return INTRIN(a, b, hip); \ +} + +/* N.B. This really should be called _mulhilo_mulhi_intrin. It just + happens that CUDA was the first time we used the idiom. */ +#define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN) \ +R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \ + *hip = INTRIN(a, b); \ + return a*b; \ +} + +/* +// A template for mulhilo using only word-size operations and +// C99 operators (no adc, no mulhi). It +// requires four multiplies and a dozen or so shifts, adds +// and tests. It's not clear what this is good for, other than +// completeness. On 32-bit platforms, it could be used to +// implement philoxNx64, but on such platforms both the philoxNx32 +// and the threefryNx64 cbrngs are going to have much better +// performance. It is enabled below by R123_USE_MULHILO64_C99, +// but that is currently (Sep 2011) not set by any of the +// features/XXfeatures.h headers. It can, of course, be +// set with a compile-time -D option. +*/ +#define _mulhilo_c99_tpl(W, Word) \ +R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \ + const unsigned WHALF = W/2; \ + const Word LOMASK = ((((Word)1)<>WHALF; \ + Word alo = a& LOMASK; \ + Word bhi = b>>WHALF; \ + Word blo = b& LOMASK; \ + \ + Word ahbl = ahi*blo; \ + Word albh = alo*bhi; \ + \ + Word ahbl_albh = ((ahbl&LOMASK) + (albh&LOMASK)); \ + Word hi = ahi*bhi + (ahbl>>WHALF) + (albh>>WHALF); \ + hi += ahbl_albh >> WHALF; /* carry from the sum of lo(ahbl) + lo(albh) ) */ \ + /* carry from the sum with alo*blo */ \ + hi += ((lo >> WHALF) < (ahbl_albh&LOMASK)); \ + *hip = hi; \ + return lo; \ +} + +/* +// A template for mulhilo on a platform that can't do it +// We could put a C version here, but is it better to run *VERY* +// slowly or to just stop and force the user to find another CBRNG? 
+*/ +#define _mulhilo_fail_tpl(W, Word) \ +R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \ + R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); \ +} + +/* +// N.B. There's an MSVC intrinsic called _emul, +// which *might* compile into better code than +// _mulhilo_dword_tpl +*/ +#if R123_USE_MULHILO32_ASM +#ifdef __powerpc__ +_mulhilo_asm_tpl(32, uint32_t, "mulhwu") +#else +_mulhilo_asm_tpl(32, uint32_t, "mull") +#endif /* __powerpc__ */ +#else +_mulhilo_dword_tpl(32, uint32_t, uint64_t) +#endif + +#if R123_USE_PHILOX_64BIT +#if R123_USE_MULHILO64_ASM +#ifdef __powerpc64__ +_mulhilo_asm_tpl(64, uint64_t, "mulhdu") +#else +_mulhilo_asm_tpl(64, uint64_t, "mulq") +#endif /* __powerpc64__ */ +#elif R123_USE_MULHILO64_MSVC_INTRIN +_mulhilo_msvc_intrin_tpl(64, uint64_t, _umul128) +#elif R123_USE_MULHILO64_CUDA_INTRIN +_mulhilo_cuda_intrin_tpl(64, uint64_t, __umul64hi) +#elif R123_USE_MULHILO64_OPENCL_INTRIN +_mulhilo_cuda_intrin_tpl(64, uint64_t, mul_hi) +#elif R123_USE_MULHILO64_MULHI_INTRIN +_mulhilo_cuda_intrin_tpl(64, uint64_t, R123_MULHILO64_MULHI_INTRIN) +#elif R123_USE_GNU_UINT128 +_mulhilo_dword_tpl(64, uint64_t, __uint128_t) +#elif R123_USE_MULHILO64_C99 +_mulhilo_c99_tpl(64, uint64_t) +#else +_mulhilo_fail_tpl(64, uint64_t) +#endif +#endif + +/* +// The multipliers and Weyl constants are "hard coded". +// To change them, you can #define them with different +// values before #include-ing this file. +// This isn't terribly elegant, but it works for C as +// well as C++. 
A nice C++-only solution would be to +// use template parameters in the style of +*/ +#ifndef PHILOX_M2x64_0 +#define PHILOX_M2x64_0 R123_64BIT(0xD2B74407B1CE6E93) +#endif + +#ifndef PHILOX_M4x64_0 +#define PHILOX_M4x64_0 R123_64BIT(0xD2E7470EE14C6C93) +#endif + +#ifndef PHILOX_M4x64_1 +#define PHILOX_M4x64_1 R123_64BIT(0xCA5A826395121157) +#endif + +#ifndef PHILOX_M2x32_0 +#define PHILOX_M2x32_0 ((uint32_t)0xd256d193) +#endif + +#ifndef PHILOX_M4x32_0 +#define PHILOX_M4x32_0 ((uint32_t)0xD2511F53) +#endif +#ifndef PHILOX_M4x32_1 +#define PHILOX_M4x32_1 ((uint32_t)0xCD9E8D57) +#endif + +#ifndef PHILOX_W64_0 +#define PHILOX_W64_0 R123_64BIT(0x9E3779B97F4A7C15) /* golden ratio */ +#endif +#ifndef PHILOX_W64_1 +#define PHILOX_W64_1 R123_64BIT(0xBB67AE8584CAA73B) /* sqrt(3)-1 */ +#endif + +#ifndef PHILOX_W32_0 +#define PHILOX_W32_0 ((uint32_t)0x9E3779B9) +#endif +#ifndef PHILOX_W32_1 +#define PHILOX_W32_1 ((uint32_t)0xBB67AE85) +#endif + +#ifndef PHILOX2x32_DEFAULT_ROUNDS +#define PHILOX2x32_DEFAULT_ROUNDS 10 +#endif + +#ifndef PHILOX2x64_DEFAULT_ROUNDS +#define PHILOX2x64_DEFAULT_ROUNDS 10 +#endif + +#ifndef PHILOX4x32_DEFAULT_ROUNDS +#define PHILOX4x32_DEFAULT_ROUNDS 10 +#endif + +#ifndef PHILOX4x64_DEFAULT_ROUNDS +#define PHILOX4x64_DEFAULT_ROUNDS 10 +#endif + +/* The ignored fourth argument allows us to instantiate the + same macro regardless of N. 
*/ +#define _philox2xWround_tpl(W, T) \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key){ \ + T hi; \ + T lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &hi); \ + struct r123array2x##W out = {{hi^key.v[0]^ctr.v[1], lo}}; \ + return out; \ +} +#define _philox2xWbumpkey_tpl(W) \ +R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey( struct r123array1x##W key) { \ + key.v[0] += PHILOX_W##W##_0; \ + return key; \ +} + +#define _philox4xWround_tpl(W, T) \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key){ \ + T hi0; \ + T hi1; \ + T lo0 = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &hi0); \ + T lo1 = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &hi1); \ + struct r123array4x##W out = {{hi1^ctr.v[1]^key.v[0], lo1, \ + hi0^ctr.v[3]^key.v[1], lo0}}; \ + return out; \ +} + +#define _philox4xWbumpkey_tpl(W) \ +R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey( struct r123array2x##W key) { \ + key.v[0] += PHILOX_W##W##_0; \ + key.v[1] += PHILOX_W##W##_1; \ + return key; \ +} + +#define _philoxNxW_tpl(N, Nhalf, W, T) \ +/** @ingroup PhiloxNxW */ \ +enum r123_enum_philox##N##x##W { philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS }; \ +typedef struct r123array##N##x##W philox##N##x##W##_ctr_t; \ +typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t; \ +typedef struct r123array##Nhalf##x##W philox##N##x##W##_ukey_t; \ +R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit(philox##N##x##W##_ukey_t uk) { return uk; } \ 
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \ + R123_ASSERT(R<=16); \ + if(R>0){ ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>1){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>2){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>3){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>4){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>5){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>6){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>7){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>8){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>9){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>10){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>11){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>12){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>13){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>14){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>15){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + return ctr; \ +} + +_philox2xWbumpkey_tpl(32) +_philox4xWbumpkey_tpl(32) +_philox2xWround_tpl(32, uint32_t) /* philo2x32round */ +_philox4xWround_tpl(32, uint32_t) 
/* philo4x32round */ +/** \endcond */ +_philoxNxW_tpl(2, 1, 32, uint32_t) /* philox2x32bijection */ +_philoxNxW_tpl(4, 2, 32, uint32_t) /* philox4x32bijection */ +#if R123_USE_PHILOX_64BIT +/** \cond HIDDEN_FROM_DOXYGEN */ +_philox2xWbumpkey_tpl(64) +_philox4xWbumpkey_tpl(64) +_philox2xWround_tpl(64, uint64_t) /* philo2x64round */ +_philox4xWround_tpl(64, uint64_t) /* philo4x64round */ +/** \endcond */ +_philoxNxW_tpl(2, 1, 64, uint64_t) /* philox2x64bijection */ +_philoxNxW_tpl(4, 2, 64, uint64_t) /* philox4x64bijection */ +#endif /* R123_USE_PHILOX_64BIT */ + +#define philox2x32(c,k) philox2x32_R(philox2x32_rounds, c, k) +#define philox4x32(c,k) philox4x32_R(philox4x32_rounds, c, k) +#if R123_USE_PHILOX_64BIT +#define philox2x64(c,k) philox2x64_R(philox2x64_rounds, c, k) +#define philox4x64(c,k) philox4x64_R(philox4x64_rounds, c, k) +#endif /* R123_USE_PHILOX_64BIT */ + +#ifdef __cplusplus +#include + +/** \cond HIDDEN_FROM_DOXYGEN */ + +#define _PhiloxNxW_base_tpl(CType, KType, N, W) \ +namespace r123{ \ +template \ +struct Philox##N##x##W##_R{ \ + typedef CType ctr_type; \ + typedef KType key_type; \ + typedef KType ukey_type; \ + static const unsigned int rounds=ROUNDS; \ + inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ \ + R123_STATIC_ASSERT(ROUNDS<=16, "philox is only unrolled up to 16 rounds\n"); \ + return philox##N##x##W##_R(ROUNDS, ctr, key); \ + } \ +}; \ +typedef Philox##N##x##W##_R Philox##N##x##W; \ + } // namespace r123 +/** \endcond */ + +_PhiloxNxW_base_tpl(r123array2x32, r123array1x32, 2, 32) // Philox2x32_R +_PhiloxNxW_base_tpl(r123array4x32, r123array2x32, 4, 32) // Philox4x32_R +#if R123_USE_PHILOX_64BIT +_PhiloxNxW_base_tpl(r123array2x64, r123array1x64, 2, 64) // Philox2x64_R +_PhiloxNxW_base_tpl(r123array4x64, r123array2x64, 4, 64) // Philox4x64_R +#endif + +/* The _tpl macros don't quite work to do string-pasting inside comments. 
+ so we just write out the boilerplate documentation four times... */
+
+/**
+@defgroup PhiloxNxW Philox Classes and Typedefs
+
+The PhiloxNxW classes export the member functions, typedefs and
+operator overloads required by a @ref CBRNG "CBRNG" class.
+
+As described in
+"Parallel Random Numbers: As Easy as 1, 2, 3",
+the Philox family of counter-based RNGs uses integer multiplication, xor and permutation of W-bit words
+to scramble its N-word input key. Philox is a mnemonic for Product HI LO Xor.
+
+
+@class r123::Philox2x32_R
+@ingroup PhiloxNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Philox round
+function will be applied.
+
+As of November 2011, the authors know of no statistical flaws with
+ROUNDS=6 or more for Philox2x32.
+
+@typedef r123::Philox2x32
+@ingroup PhiloxNxW
+ Philox2x32 is equivalent to Philox2x32_R<10>. With 10 rounds,
+ Philox2x32 has a considerable safety margin over the minimum number
+ of rounds with no known statistical flaws, but still has excellent
+ performance.
+
+
+
+@class r123::Philox2x64_R
+@ingroup PhiloxNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Philox round
+function will be applied.
+
+As of September 2011, the authors know of no statistical flaws with
+ROUNDS=6 or more for Philox2x64.
+
+@typedef r123::Philox2x64
+@ingroup PhiloxNxW
+ Philox2x64 is equivalent to Philox2x64_R<10>. With 10 rounds,
+ Philox2x64 has a considerable safety margin over the minimum number
+ of rounds with no known statistical flaws, but still has excellent
+ performance.
+
+
+
+@class r123::Philox4x32_R
+@ingroup PhiloxNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+ +The template argument, ROUNDS, is the number of times the Philox round +function will be applied. + +In November 2011, the authors recorded some suspicious p-values (approximately 1.e-7) from +some very long (longer than the default BigCrush length) SimpPoker tests. Despite +the fact that even longer tests reverted to "passing" p-values, a cloud remains over +Philox4x32 with 7 rounds. The authors know of no statistical flaws with +ROUNDS=8 or more for Philox4x32. + +@typedef r123::Philox4x32 +@ingroup PhiloxNxW + Philox4x32 is equivalent to Philox4x32_R<10>. With 10 rounds, + Philox4x32 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. + + + +@class r123::Philox4x64_R +@ingroup PhiloxNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Philox round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=7 or more for Philox4x64. + +@typedef r123::Philox4x64 +@ingroup PhiloxNxW + Philox4x64 is equivalent to Philox4x64_R<10>. With 10 rounds, + Philox4x64 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. +*/ + +#endif /* __cplusplus */ + +#endif /* _philox_dot_h_ */ diff --git a/pyopencl/cl/pyopencl-random123/threefry.cl b/pyopencl/cl/pyopencl-random123/threefry.cl new file mode 100644 index 0000000000000000000000000000000000000000..1a83ddc1d32b05376a2278ad625ba2c5bc14bedb --- /dev/null +++ b/pyopencl/cl/pyopencl-random123/threefry.cl @@ -0,0 +1,864 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _threefry_dot_h_ +#define _threefry_dot_h_ +#include "openclfeatures.h" +#include "array.h" + +/** \cond HIDDEN_FROM_DOXYGEN */ +/* Significant parts of this file were copied from + from: + Skein_FinalRnd/ReferenceImplementation/skein.h + Skein_FinalRnd/ReferenceImplementation/skein_block.c + + in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip + + This file has been modified so that it may no longer perform its originally + intended function. 
If you're looking for a Skein or Threefish source code, + please consult the original file. + + The original file had the following header: +************************************************************************** +** +** Interface declarations and internal definitions for Skein hashing. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +*************************************************************************** + +*/ + +/* See comment at the top of philox.h for the macro pre-process + strategy. */ + +/* Rotation constants: */ +enum r123_enum_threefry64x4 { + /* These are the R_256 constants from the Threefish reference sources + with names changed to R_64x4... */ + R_64x4_0_0=14, R_64x4_0_1=16, + R_64x4_1_0=52, R_64x4_1_1=57, + R_64x4_2_0=23, R_64x4_2_1=40, + R_64x4_3_0= 5, R_64x4_3_1=37, + R_64x4_4_0=25, R_64x4_4_1=33, + R_64x4_5_0=46, R_64x4_5_1=12, + R_64x4_6_0=58, R_64x4_6_1=22, + R_64x4_7_0=32, R_64x4_7_1=32 +}; + +enum r123_enum_threefry64x2 { + /* + // Output from skein_rot_search: (srs64_B64-X1000) + // Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57 + // Start: Tue Mar 1 10:07:48 2011 + // rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format + */ + R_64x2_0_0=16, + R_64x2_1_0=42, + R_64x2_2_0=12, + R_64x2_3_0=31, + R_64x2_4_0=16, + R_64x2_5_0=32, + R_64x2_6_0=24, + R_64x2_7_0=21 + /* 4 rounds: minHW = 4 [ 4 4 4 4 ] + // 5 rounds: minHW = 8 [ 8 8 8 8 ] + // 6 rounds: minHW = 16 [ 16 16 16 16 ] + // 7 rounds: minHW = 32 [ 32 32 32 32 ] + // 8 rounds: minHW = 64 [ 64 64 64 64 ] + // 9 rounds: minHW = 64 [ 64 64 64 64 ] + //10 rounds: minHW = 64 [ 64 64 64 64 ] + //11 rounds: minHW = 64 [ 64 64 64 64 ] */ +}; + +enum r123_enum_threefry32x4 { + /* Output from skein_rot_search: (srs-B128-X5000.out) + // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. 
rounds = 8, minHW_or=28 + // Start: Mon Aug 24 22:41:36 2009 + // ... + // rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format */ + R_32x4_0_0=10, R_32x4_0_1=26, + R_32x4_1_0=11, R_32x4_1_1=21, + R_32x4_2_0=13, R_32x4_2_1=27, + R_32x4_3_0=23, R_32x4_3_1= 5, + R_32x4_4_0= 6, R_32x4_4_1=20, + R_32x4_5_0=17, R_32x4_5_1=11, + R_32x4_6_0=25, R_32x4_6_1=10, + R_32x4_7_0=18, R_32x4_7_1=20 + + /* 4 rounds: minHW = 3 [ 3 3 3 3 ] + // 5 rounds: minHW = 7 [ 7 7 7 7 ] + // 6 rounds: minHW = 12 [ 13 12 13 12 ] + // 7 rounds: minHW = 22 [ 22 23 22 23 ] + // 8 rounds: minHW = 31 [ 31 31 31 31 ] + // 9 rounds: minHW = 32 [ 32 32 32 32 ] + //10 rounds: minHW = 32 [ 32 32 32 32 ] + //11 rounds: minHW = 32 [ 32 32 32 32 ] */ + +}; + +enum r123_enum_threefry32x2 { + /* Output from skein_rot_search (srs32x2-X5000.out) + // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28 + // Start: Tue Jul 12 11:11:33 2011 + // rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. 
blkSize= 64].format */ + R_32x2_0_0=13, + R_32x2_1_0=15, + R_32x2_2_0=26, + R_32x2_3_0= 6, + R_32x2_4_0=17, + R_32x2_5_0=29, + R_32x2_6_0=16, + R_32x2_7_0=24 + + /* 4 rounds: minHW = 4 [ 4 4 4 4 ] + // 5 rounds: minHW = 6 [ 6 8 6 8 ] + // 6 rounds: minHW = 9 [ 9 12 9 12 ] + // 7 rounds: minHW = 16 [ 16 24 16 24 ] + // 8 rounds: minHW = 32 [ 32 32 32 32 ] + // 9 rounds: minHW = 32 [ 32 32 32 32 ] + //10 rounds: minHW = 32 [ 32 32 32 32 ] + //11 rounds: minHW = 32 [ 32 32 32 32 ] */ + }; + +enum r123_enum_threefry_wcnt { + WCNT2=2, + WCNT4=4 +}; +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N)); +R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N) +{ + return (x << (N & 63)) | (x >> ((64-N) & 63)); +} + +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N)); +R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N) +{ + return (x << (N & 31)) | (x >> ((32-N) & 31)); +} + +#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32)) +#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) +#define SKEIN_KS_PARITY32 0x1BD11BDA + +#ifndef THREEFRY2x32_DEFAULT_ROUNDS +#define THREEFRY2x32_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY2x64_DEFAULT_ROUNDS +#define THREEFRY2x64_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY4x32_DEFAULT_ROUNDS +#define THREEFRY4x32_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY4x64_DEFAULT_ROUNDS +#define THREEFRY4x64_DEFAULT_ROUNDS 20 +#endif + +#define _threefry2x_tpl(W) \ +typedef struct r123array2x##W threefry2x##W##_ctr_t; \ +typedef struct r123array2x##W threefry2x##W##_key_t; \ +typedef struct r123array2x##W threefry2x##W##_ukey_t; \ +R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, 
threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \ + threefry2x##W##_ctr_t X; \ + uint##W##_t ks[2+1]; \ + int i; /* avoid size_t to avoid need for stddef.h */ \ + R123_ASSERT(Nrounds<=32); \ + ks[2] = SKEIN_KS_PARITY##W; \ + for (i=0;i < 2; i++) \ + { \ + ks[i] = k.v[i]; \ + X.v[i] = in.v[i]; \ + ks[2] ^= k.v[i]; \ + } \ + \ + /* Insert initial key before round 0 */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; \ + \ + if(Nrounds>0){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>1){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>2){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>3){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>3){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; \ + X.v[1] += 1; /* X.v[2-1] += r */ \ + } \ + if(Nrounds>4){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>5){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>6){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>7){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>7){ \ + /* InjectKey(r=2) */ \ + X.v[0] += ks[2]; X.v[1] += ks[0]; \ + X.v[1] += 2; \ + } \ + if(Nrounds>8){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>9){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>10){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>11){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>11){ 
\ + /* InjectKey(r=3) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; \ + X.v[1] += 3; \ + } \ + if(Nrounds>12){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>13){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>14){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>15){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>15){ \ + /* InjectKey(r=4) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; \ + X.v[1] += 4; \ + } \ + if(Nrounds>16){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>17){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>18){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>19){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>19){ \ + /* InjectKey(r=5) */ \ + X.v[0] += ks[2]; X.v[1] += ks[0]; \ + X.v[1] += 5; \ + } \ + if(Nrounds>20){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>21){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>22){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>23){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>23){ \ + /* InjectKey(r=6) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; \ + X.v[1] += 6; \ + } \ + if(Nrounds>24){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>25){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>26){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>27){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + 
if(Nrounds>27){ \ + /* InjectKey(r=7) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; \ + X.v[1] += 7; \ + } \ + if(Nrounds>28){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>29){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>30){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>31){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>31){ \ + /* InjectKey(r=8) */ \ + X.v[0] += ks[2]; X.v[1] += ks[0]; \ + X.v[1] += 8; \ + } \ + return X; \ +} \ + /** @ingroup ThreefryNxW */ \ +enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \ + return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \ +} + + +#define _threefry4x_tpl(W) \ +typedef struct r123array4x##W threefry4x##W##_ctr_t; \ +typedef struct r123array4x##W threefry4x##W##_key_t; \ +typedef struct r123array4x##W threefry4x##W##_ukey_t; \ +R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \ + threefry4x##W##_ctr_t X; \ + uint##W##_t ks[4+1]; \ + int i; /* avoid size_t to avoid need for stddef.h */ \ + R123_ASSERT(Nrounds<=72); \ + ks[4] = SKEIN_KS_PARITY##W; \ + for (i=0;i < 4; i++) \ + { \ + ks[i] = k.v[i]; \ + X.v[i] = in.v[i]; \ + 
ks[4] ^= k.v[i]; \ + } \ + \ + /* Insert initial key before round 0 */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + \ + if(Nrounds>0){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>1){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>2){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>3){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>3){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 1; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>4){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>5){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>6){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>7){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>7){ \ + /* InjectKey(r=2) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 2; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>8){ 
\ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>9){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>10){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>11){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>11){ \ + /* InjectKey(r=3) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 3; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>12){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>13){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>14){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>15){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>15){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ + X.v[4-1] += 4; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>16){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= 
X.v[2]; \ + } \ + if(Nrounds>17){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>18){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>19){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>19){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + X.v[4-1] += 5; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>20){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>21){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>22){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>23){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>23){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 6; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>24){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>25){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = 
RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>26){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>27){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>27){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 7; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>28){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>29){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>30){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>31){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>31){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 8; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>32){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>33){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>34){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= 
X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>35){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>35){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ + X.v[4-1] += 9; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>36){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>37){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>38){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>39){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>39){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + X.v[4-1] += 10; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>40){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>41){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>42){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>43){ \ + X.v[0] += X.v[3]; X.v[3] = 
RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>43){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 11; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>44){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>45){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>46){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>47){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>47){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 12; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>48){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>49){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>50){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>51){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + 
if(Nrounds>51){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 13; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>52){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>53){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>54){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>55){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>55){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ + X.v[4-1] += 14; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>56){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>57){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>58){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>59){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>59){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + X.v[4-1] += 15; /* X.v[WCNT4-1] += r 
*/ \ + } \ + \ + if(Nrounds>60){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>61){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>62){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>63){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>63){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 16; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>64){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>65){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>66){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>67){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>67){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 17; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>68){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = 
RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>69){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>70){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>71){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>71){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 18; /* X.v[WCNT4-1] += r */ \ + } \ + \ + return X; \ +} \ + /** @ingroup ThreefryNxW */ \ +enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \ + return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \ +} +/** \endcond */ + +_threefry2x_tpl(64) +_threefry2x_tpl(32) +_threefry4x_tpl(64) +_threefry4x_tpl(32) + +/* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better + than a static inline function. Why? 
*/ +#define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k) +#define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k) +#define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k) +#define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k) + +#ifdef __cplusplus +/** \cond HIDDEN_FROM_DOXYGEN */ +#define _threefryNxWclass_tpl(NxW) \ +namespace r123{ \ +template \ + struct Threefry##NxW##_R{ \ + typedef threefry##NxW##_ctr_t ctr_type; \ + typedef threefry##NxW##_key_t key_type; \ + typedef threefry##NxW##_key_t ukey_type; \ + static const unsigned int rounds=R; \ + inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \ + R123_STATIC_ASSERT(R<=72, "threefry is only unrolled up to 72 rounds\n"); \ + return threefry##NxW##_R(R, ctr, key); \ + } \ +}; \ + typedef Threefry##NxW##_R Threefry##NxW; \ +} // namespace r123 + +/** \endcond */ + +_threefryNxWclass_tpl(2x32) +_threefryNxWclass_tpl(4x32) +_threefryNxWclass_tpl(2x64) +_threefryNxWclass_tpl(4x64) + +/* The _tpl macros don't quite work to do string-pasting inside comments. + so we just write out the boilerplate documentation four times... */ + +/** +@defgroup ThreefryNxW Threefry Classes and Typedefs + +The ThreefryNxW classes export the member functions, typedefs and +operator overloads required by a @ref CBRNG "CBRNG" class. + +As described in +Parallel Random Numbers: As Easy as 1, 2, 3 , +the Threefry family is closely related to the Threefish block cipher from + Skein Hash Function. +Threefry is \b not suitable for cryptographic use. + +Threefry uses integer addition, bitwise rotation, xor and permutation of words to randomize its output. + +@class r123::Threefry2x32_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. 
+ +As of September 2011, the authors know of no statistical flaws with +ROUNDS=13 or more for Threefry2x32. + +@typedef r123::Threefry2x32 +@ingroup ThreefryNxW + Threefry2x32 is equivalent to Threefry2x32_R<20>. With 20 rounds, + Threefry2x32 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. + +@class r123::Threefry2x64_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +In November 2011, the authors discovered that 13 rounds of +Threefry2x64 sequenced by strided, interleaved key and counter +increments failed a very long (longer than the default BigCrush +length) WeightDistrub test. At the same time, it was confirmed that +14 rounds passes much longer tests (up to 5x10^12 samples) of a +similar nature. The authors know of no statistical flaws with +ROUNDS=14 or more for Threefry2x64. + +@typedef r123::Threefry2x64 +@ingroup ThreefryNxW + Threefry2x64 is equivalent to Threefry2x64_R<20>. With 20 rounds, + Threefry2x64 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. + + + +@class r123::Threefry4x32_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=12 or more for Threefry4x32. + +@typedef r123::Threefry4x32 +@ingroup ThreefryNxW + Threefry4x32 is equivalent to Threefry4x32_R<20>. With 20 rounds, + Threefry4x32 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. 
+ + + +@class r123::Threefry4x64_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=12 or more for Threefry4x64. + +@typedef r123::Threefry4x64 +@ingroup ThreefryNxW + Threefry4x64 is equivalent to Threefry4x64_R<20>. With 20 rounds, + Threefry4x64 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. +*/ + +#endif + +#endif diff --git a/pyopencl/cl/pyopencl-ranluxcl.cl b/pyopencl/cl/pyopencl-ranluxcl.cl index a40cadf1e3f5e28a390eb796d7ac7aaeda47a48a..eac9e18de850cd94b2276475e8b9a5f3110e80da 100644 --- a/pyopencl/cl/pyopencl-ranluxcl.cl +++ b/pyopencl/cl/pyopencl-ranluxcl.cl @@ -1,3 +1,6 @@ +/* RanluxCL is deprecated in PyOpenCL and will be removed in the 2018.x + * versions of the package. */ + #ifndef RANLUXCL_CL #define RANLUXCL_CL diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py index 7d68800ffd973a3e7c6791a926a922cc6b08fc36..6d4b2222fd9843449753a760efcf83fdcf6716b2 100644 --- a/pyopencl/clrandom.py +++ b/pyopencl/clrandom.py @@ -1,8 +1,7 @@ # encoding: utf8 -from __future__ import division -from __future__ import absolute_import +from __future__ import division, absolute_import -__copyright__ = "Copyright (C) 2009 Andreas Kloeckner" +__copyright__ = "Copyright (C) 2009-16 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -28,27 +27,36 @@ THE SOFTWARE. # {{{ documentation __doc__ = u""" -PyOpenCL now includes and uses the `RANLUXCL random number generator -`_ by Ivar Ursin Nikolaisen. 
In -addition to being usable through the convenience functions above, it is -available in any piece of code compiled through PyOpenCL by:: +PyOpenCL now includes and uses some of the `Random123 random number generators +`_ by D.E. Shaw +Research. In addition to being usable through the convenience functions above, +they are available in any piece of code compiled through PyOpenCL by:: - #include + #include + #include -See the `source -`_ -for some documentation if you're planning on using RANLUXCL directly. +See the `Philox source +`_ +and the `Threefry source +`_ +for some documentation if you're planning on using Random123 directly. -The RANLUX generator is described in the following two articles. If you use the -generator for scientific purposes, please consider citing them: +.. note:: -* Martin Lüscher, A portable high-quality random number generator for lattice - field theory simulations, `Computer Physics Communications 79 (1994) 100-110 - `_ + PyOpenCL previously had documented support for the `RANLUXCL random number + generator `_ by Ivar Ursin + Nikolaisen. This support is now deprecated because of the general slowness + of these generators and will be removed from PyOpenCL in the 2018.x series. + All users are encouraged to switch to one of the Random123 genrators, + :class:`PhiloxGenerator` or :class:`ThreefryGenerator`. + +.. autoclass:: PhiloxGenerator + +.. autoclass:: ThreefryGenerator + +.. autofunction:: rand +.. autofunction:: fill_rand -* F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom - number generator of Lüscher, `Computer Physics Communications 79 (1994) 111-114 - `_ """ # }}} @@ -61,8 +69,14 @@ from pytools import memoize_method import numpy as np +# {{{ RanluxGenerator (deprecated) + class RanluxGenerator(object): """ + .. warning:: + + This class is deprecated, to be removed in PyOpenCL 2018.x. + .. versionadded:: 2011.2 .. 
attribute:: state @@ -390,20 +404,308 @@ class RanluxGenerator(object): self.get_sync_kernel()(queue, (self.num_work_items,), self.wg_size, self.state.data) +# }}} + + +# {{{ Random123 generators + +class Random123GeneratorBase(object): + """ + .. versionadded:: 2016.2 + + .. automethod:: fill_uniform + .. automethod:: uniform + .. automethod:: fill_normal + .. automethod:: normal + """ + + def __init__(self, context, key=None, counter=None): + int32_info = np.iinfo(np.int32) + + if key is None: + from random import randrange + key = [randrange(int(int32_info.min), int(int32_info.max)+1) + for i in range(self.key_length-1)] + if counter is None: + from random import randrange + counter = [randrange(int(int32_info.min), int(int32_info.max)+1) + for i in range(4)] + + self.context = context + self.key = key + self.counter = counter + + self.counter_max = int32_info.max + + @memoize_method + def get_gen_kernel(self, dtype, distribution): + size_multiplier = 1 + arg_dtype = dtype + + rng_key = (distribution, dtype) + + if rng_key in [("uniform", np.float64), ("normal", np.float64)]: + c_type = "double" + scale1_const = "((double) %r)" % (1/2**32) + scale2_const = "((double) %r)" % (1/2**64) + if distribution == "normal": + transform = "box_muller" + else: + transform = "" + + rng_expr = ( + "shift + scale * " + "%s( %s * convert_double4(gen)" + "+ %s * convert_double4(gen))" + % (transform, scale1_const, scale2_const)) + + counter_multiplier = 2 + + elif rng_key in [(dist, cmp_dtype) + for dist in ["normal", "uniform"] + for cmp_dtype in [ + np.float32, + cl.array.vec.float2, + cl.array.vec.float3, + cl.array.vec.float4, + ]]: + c_type = "float" + scale_const = "((float) %r)" % (1/2**32) + + if distribution == "normal": + transform = "box_muller" + else: + transform = "" + + rng_expr = ( + "shift + scale * %s(%s * convert_float4(gen))" + % (transform, scale_const)) + counter_multiplier = 1 + arg_dtype = np.float32 + try: + _, size_multiplier = 
cl.array.vec.type_to_scalar_and_count[dtype] + except KeyError: + pass + + elif rng_key == ("uniform", np.int32): + c_type = "int" + rng_expr = ( + "shift + convert_int4((convert_long4(gen) * scale) / %s)" + % (str(2**32)+"l") + ) + counter_multiplier = 1 + + elif rng_key == ("uniform", np.int64): + c_type = "long" + rng_expr = ( + "shift" + "+ convert_long4(gen) * (scale/two32) " + "+ ((convert_long4(gen) * scale) / two32)" + .replace("two32", (str(2**32)+"l"))) + counter_multiplier = 2 + + else: + raise TypeError( + "unsupported RNG distribution/data type combination '%s/%s'" + % rng_key) + + src = """//CL// + #include <%(header_name)s> + + typedef %(output_t)s output_t; + typedef %(output_t)s4 output_vec_t; + typedef %(gen_name)s_ctr_t ctr_t; + typedef %(gen_name)s_key_t key_t; + + uint4 gen_bits(key_t *key, ctr_t *ctr) + { + union { + ctr_t ctr_el; + uint4 vec_el; + } u; + + u.ctr_el = %(gen_name)s(*ctr, *key); + if (++ctr->v[0] == 0) + if (++ctr->v[1] == 0) + ++ctr->v[2]; + + return u.vec_el; + } + + #if %(include_box_muller)s + output_vec_t box_muller(output_vec_t x) + { + #define BOX_MULLER(I, COMPA, COMPB) \ + output_t r##I = sqrt(-2*log(x.COMPA)); \ + output_t c##I; \ + output_t s##I = sincos((output_t) (2*M_PI) * x.COMPB, &c##I); + + BOX_MULLER(0, x, y); + BOX_MULLER(1, z, w); + return (output_vec_t) (r0*c0, r0*s0, r1*c1, r1*s1); + } + #endif + + #define GET_RANDOM_NUM(gen) %(rng_expr)s + + kernel void generate( + int k1, + #if %(key_length)s > 2 + int k2, int k3, + #endif + int c0, int c1, int c2, int c3, + global output_t *output, + long out_size, + output_t scale, + output_t shift) + { + #if %(key_length)s == 2 + key_t k = {{get_global_id(0), k1}}; + #else + key_t k = {{get_global_id(0), k1, k2, k3}}; + #endif + + ctr_t c = {{c0, c1, c2, c3}}; + + // output bulk + unsigned long idx = get_global_id(0)*4; + while (idx + 4 < out_size) + { + *(global output_vec_t *) (output + idx) = + GET_RANDOM_NUM(gen_bits(&k, &c)); + idx += 4*get_global_size(0); + } + + 
// output tail + output_vec_t tail_ran = GET_RANDOM_NUM(gen_bits(&k, &c)); + if (idx < out_size) + output[idx] = tail_ran.x; + if (idx+1 < out_size) + output[idx+1] = tail_ran.y; + if (idx+2 < out_size) + output[idx+2] = tail_ran.z; + if (idx+3 < out_size) + output[idx+3] = tail_ran.w; + } + """ % { + "gen_name": self.generator_name, + "header_name": self.header_name, + "output_t": c_type, + "key_length": self.key_length, + "include_box_muller": int(distribution == "normal"), + "rng_expr": rng_expr + } + + prg = cl.Program(self.context, src).build() + knl = prg.generate + knl.set_scalar_arg_dtypes( + [np.int32] * (self.key_length - 1 + 4) + + [None, np.int64, arg_dtype, arg_dtype]) + + return knl, counter_multiplier, size_multiplier + + def _fill(self, distribution, ary, scale, shift, queue=None): + """Fill *ary* with uniformly distributed random numbers in the interval + *(a, b)*, endpoints excluded. + + :return: a :class:`pyopencl.Event` + """ + + if queue is None: + queue = ary.queue + + knl, counter_multiplier, size_multiplier = \ + self.get_gen_kernel(ary.dtype, distribution) + + args = self.key + self.counter + [ + ary.data, ary.size*size_multiplier, + scale, shift] + + n = ary.size + from pyopencl.array import splay + gsize, lsize = splay(queue, ary.size) + + evt = knl(queue, gsize, lsize, *args) + + self.counter[0] += n * counter_multiplier + c1_incr, self.counter[0] = divmod(self.counter[0], self.counter_max) + if c1_incr: + self.counter[1] += c1_incr + c2_incr, self.counter[1] = divmod(self.counter[1], self.counter_max) + self.counter[2] += c2_incr + + return evt + + def fill_uniform(self, ary, a=0, b=1, queue=None): + return self._fill("uniform", ary, + scale=(b-a), shift=a, queue=queue) + + def uniform(self, *args, **kwargs): + """Make a new empty array, apply :meth:`fill_uniform` to it. 
+ """ + a = kwargs.pop("a", 0) + b = kwargs.pop("b", 1) + + result = cl_array.empty(*args, **kwargs) + + self.fill_uniform(result, queue=result.queue, a=a, b=b) + return result + + def fill_normal(self, ary, mu=0, sigma=1, queue=None): + """Fill *ary* with normally distributed numbers with mean *mu* and + standard deviation *sigma*. + """ + + return self._fill("normal", ary, scale=sigma, shift=mu, queue=queue) + + def normal(self, *args, **kwargs): + """Make a new empty array, apply :meth:`fill_normal` to it. + """ + mu = kwargs.pop("mu", 0) + sigma = kwargs.pop("sigma", 1) + + result = cl_array.empty(*args, **kwargs) + + result.add_event( + self.fill_normal(result, queue=result.queue, mu=mu, sigma=sigma)) + return result + + +class PhiloxGenerator(Random123GeneratorBase): + header_name = "pyopencl-random123/philox.cl" + generator_name = "philox4x32" + key_length = 2 + + +class ThreefryGenerator(Random123GeneratorBase): + header_name = "pyopencl-random123/threefry.cl" + generator_name = "threefry4x32" + key_length = 4 + +# }}} + @first_arg_dependent_memoize -def _get_generator(queue, luxury=None): - gen = RanluxGenerator(queue, luxury=luxury) - queue.finish() +def _get_generator(context): + if context.devices[0].type & cl.device_type.CPU: + gen = PhiloxGenerator(context) + else: + gen = ThreefryGenerator(context) + return gen -def fill_rand(result, queue=None, luxury=4, a=0, b=1): +def fill_rand(result, queue=None, luxury=None, a=0, b=1): """Fill *result* with random values of `dtype` in the range [0,1). """ + if luxury is not None: + from warnings import warn + warn("Specifying the 'luxury' argument is deprecated and will stop being " + "supported in PyOpenCL 2018.x", stacklevel=2) + if queue is None: queue = result.queue - gen = _get_generator(queue, luxury=luxury) + gen = _get_generator(queue.context) gen.fill_uniform(result, a=a, b=b) @@ -412,8 +714,13 @@ def rand(queue, shape, dtype, luxury=None, a=0, b=1): in the range [a,b). 
""" + if luxury is not None: + from warnings import warn + warn("Specifying the 'luxury' argument is deprecated and will stop being " + "supported in PyOpenCL 2018.x", stacklevel=2) + from pyopencl.array import Array - gen = _get_generator(queue, luxury) + gen = _get_generator(queue.context) result = Array(queue, shape, dtype) result.add_event( gen.fill_uniform(result, a=a, b=b)) diff --git a/test/test_array.py b/test/test_array.py index ff2b94ca1413a27282876014023aa0107844461d..7b48a95410638d3ea0ab7d0fa9925de836add785 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -38,6 +38,8 @@ from pyopencl.tools import ( # noqa from pyopencl.characterize import has_double_support, has_struct_arg_count_bug from pyopencl.cffi_cl import _PYPY +from pyopencl.clrandom import RanluxGenerator, PhiloxGenerator, ThreefryGenerator + # {{{ helpers @@ -413,54 +415,79 @@ def test_divide_array(ctx_factory): # {{{ RNG -def test_random_float_in_range(ctx_factory): +@pytest.mark.parametrize("rng_class", + [RanluxGenerator, PhiloxGenerator, ThreefryGenerator]) +@pytest.mark.parametrize("ary_size", [300, 301, 302, 303, 10007]) +def test_random_float_in_range(ctx_factory, rng_class, ary_size, plot_hist=False): context = ctx_factory() queue = cl.CommandQueue(context) - from pyopencl.clrandom import RanluxGenerator - if has_double_support(context.devices[0]): dtypes = [np.float32, np.float64] else: dtypes = [np.float32] - gen = RanluxGenerator(queue, 5120) + if rng_class is RanluxGenerator: + gen = rng_class(queue, 5120) + else: + gen = rng_class(context) + + for dtype in dtypes: + print(dtype) + ran = cl_array.zeros(queue, ary_size, dtype) + gen.fill_uniform(ran) + + if plot_hist: + import matplotlib.pyplot as pt + pt.hist(ran.get(), 30) + pt.show() - for ary_size in [300, 301, 302, 303, 10007]: - for dtype in dtypes: - ran = cl_array.zeros(queue, ary_size, dtype) - gen.fill_uniform(ran) - assert (0 < ran.get()).all() - assert (ran.get() < 1).all() + assert (0 < ran.get()).all() + 
assert (ran.get() < 1).all() + if rng_class is RanluxGenerator: gen.synchronize(queue) - ran = cl_array.zeros(queue, ary_size, dtype) - gen.fill_uniform(ran, a=4, b=7) - assert (4 < ran.get()).all() - assert (ran.get() < 7).all() + ran = cl_array.zeros(queue, ary_size, dtype) + gen.fill_uniform(ran, a=4, b=7) + assert (4 < ran.get()).all() + assert (ran.get() < 7).all() - ran = gen.normal(queue, (10007,), dtype, mu=4, sigma=3) + ran = gen.normal(queue, ary_size, dtype, mu=10, sigma=3) + + if plot_hist: + import matplotlib.pyplot as pt + pt.hist(ran.get(), 30) + pt.show() @pytest.mark.parametrize("dtype", [np.int32, np.int64]) -def test_random_int_in_range(ctx_factory, dtype): +@pytest.mark.parametrize("rng_class", + [RanluxGenerator, PhiloxGenerator, ThreefryGenerator]) +def test_random_int_in_range(ctx_factory, rng_class, dtype, plot_hist=False): context = ctx_factory() queue = cl.CommandQueue(context) - from pyopencl.clrandom import RanluxGenerator - gen = RanluxGenerator(queue, 5120) + if rng_class is RanluxGenerator: + gen = rng_class(queue, 5120) + else: + gen = rng_class(context) + + # if (dtype == np.int64 + # and context.devices[0].platform.vendor.startswith("Advanced Micro")): + # pytest.xfail("AMD miscompiles 64-bit RNG math") + + ran = gen.uniform(queue, (10000007,), dtype, a=200, b=300).get() + assert (200 <= ran).all() + assert (ran < 300).all() - if (dtype == np.int64 - and context.devices[0].platform.vendor.startswith("Advanced Micro")): - pytest.xfail("AMD miscompiles 64-bit RNG math") + print(np.min(ran), np.max(ran)) + assert np.max(ran) > 295 - ran = gen.uniform(queue, (10000007,), dtype, a=200, b=300) - assert (200 <= ran.get()).all() - assert (ran.get() < 300).all() - #from matplotlib import pyplot as pt - #pt.hist(ran.get()) - #pt.show() + if plot_hist: + from matplotlib import pyplot as pt + pt.hist(ran) + pt.show() # }}}