diff --git a/LICENSE b/LICENSE index a273dcc5e460c2bd6bc9d9aa4c5709b930712029..a830ab0fde039982bfc6262f36f0539d566fc8a4 100644 --- a/LICENSE +++ b/LICENSE @@ -47,6 +47,38 @@ implementation). These parts are licensed as follows: with software licensed exclusively under GPL2. (Most software is licensed as GPL2 or later, in which case this is not an issue.) +PyOpenCL includes parts of the Random123 suite of random number generators: + + Copyright 2010-2012, D. E. Shaw Research. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ PyOpenCL includes the RANLUXCL random number generator: Copyright (c) 2011 Ivar Ursin Nikolaisen diff --git a/doc/array.rst b/doc/array.rst index 8c3256eb983bb9e78f39f3ee4a6128a59869c900..4f4b7ef66f5b050c8455d3d19ebc9b6c693bcbe6 100644 --- a/doc/array.rst +++ b/doc/array.rst @@ -292,14 +292,3 @@ Generating Arrays of Random Numbers ----------------------------------- .. automodule:: pyopencl.clrandom - - .. autoclass:: RanluxGenerator - - .. automethod:: fill_uniform - .. automethod:: uniform - .. automethod:: fill_normal - .. automethod:: normal - .. automethod:: synchronize - - .. autofunction:: rand - .. autofunction:: fill_rand diff --git a/doc/misc.rst b/doc/misc.rst index d6f3b530a02a871f475830dd88f968000a3e3977..eef15c1cf6b2ddab34303179bc73c9e19af4c3c7 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -118,6 +118,10 @@ Version 2016.2 This version is currently under development. You can get snapshots from PyOpenCL's `git repository `_ +* Deprecate RANLUXCL. It will be removed in the 2018.x series of PyOpenCL. +* Introduce Random123 random number generators. See :mod:`pyopencl.clrandom` + for more information. + Version 2016.1 -------------- diff --git a/pyopencl/cl/pyopencl-random123/array.h b/pyopencl/cl/pyopencl-random123/array.h new file mode 100644 index 0000000000000000000000000000000000000000..8af5a4f4520686e0bec582e80c901257ec287bad --- /dev/null +++ b/pyopencl/cl/pyopencl-random123/array.h @@ -0,0 +1,325 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _r123array_dot_h__ +#define _r123array_dot_h__ +#include "openclfeatures.h" + +#ifndef __cplusplus +#define CXXMETHODS(_N, W, T) +#define CXXOVERLOADS(_N, W, T) +#else + +#include +#include +#include +#include +#include +#include + +/** @defgroup arrayNxW The r123arrayNxW classes + + Each of the r123arrayNxW is a fixed size array of N W-bit unsigned integers. + It is functionally equivalent to the C++0x std::array, + but does not require C++0x features or libraries. + + In addition to meeting most of the requirements of a Container, + it also has a member function, incr(), which increments the zero-th + element and carrys overflows into higher indexed elements. Thus, + by using incr(), sequences of up to 2^(N*W) distinct values + can be produced. 
+ + If SSE is supported by the compiler, then the class + r123array1xm128i is also defined, in which the data member is an + array of one r123128i object. + + @cond HIDDEN_FROM_DOXYGEN +*/ + +template +inline R123_CUDA_DEVICE value_type assemble_from_u32(uint32_t *p32){ + value_type v=0; + for(size_t i=0; i<(3+sizeof(value_type))/4; ++i) + v |= ((value_type)(*p32++)) << (32*i); + return v; +} + +// Work-alike methods and typedefs modeled on std::array: +#define CXXMETHODS(_N, W, T) \ + typedef T value_type; \ + typedef T* iterator; \ + typedef const T* const_iterator; \ + typedef value_type& reference; \ + typedef const value_type& const_reference; \ + typedef size_t size_type; \ + typedef ptrdiff_t difference_type; \ + typedef T* pointer; \ + typedef const T* const_pointer; \ + typedef std::reverse_iterator reverse_iterator; \ + typedef std::reverse_iterator const_reverse_iterator; \ + /* Boost.array has static_size. C++11 specializes tuple_size */ \ + enum {static_size = _N}; \ + R123_CUDA_DEVICE reference operator[](size_type i){return v[i];} \ + R123_CUDA_DEVICE const_reference operator[](size_type i) const {return v[i];} \ + R123_CUDA_DEVICE reference at(size_type i){ if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \ + R123_CUDA_DEVICE const_reference at(size_type i) const { if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \ + R123_CUDA_DEVICE size_type size() const { return _N; } \ + R123_CUDA_DEVICE size_type max_size() const { return _N; } \ + R123_CUDA_DEVICE bool empty() const { return _N==0; }; \ + R123_CUDA_DEVICE iterator begin() { return &v[0]; } \ + R123_CUDA_DEVICE iterator end() { return &v[_N]; } \ + R123_CUDA_DEVICE const_iterator begin() const { return &v[0]; } \ + R123_CUDA_DEVICE const_iterator end() const { return &v[_N]; } \ + R123_CUDA_DEVICE const_iterator cbegin() const { return &v[0]; } \ + R123_CUDA_DEVICE const_iterator cend() const { return &v[_N]; 
} \ + R123_CUDA_DEVICE reverse_iterator rbegin(){ return reverse_iterator(end()); } \ + R123_CUDA_DEVICE const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } \ + R123_CUDA_DEVICE reverse_iterator rend(){ return reverse_iterator(begin()); } \ + R123_CUDA_DEVICE const_reverse_iterator rend() const{ return const_reverse_iterator(begin()); } \ + R123_CUDA_DEVICE const_reverse_iterator crbegin() const{ return const_reverse_iterator(cend()); } \ + R123_CUDA_DEVICE const_reverse_iterator crend() const{ return const_reverse_iterator(cbegin()); } \ + R123_CUDA_DEVICE pointer data(){ return &v[0]; } \ + R123_CUDA_DEVICE const_pointer data() const{ return &v[0]; } \ + R123_CUDA_DEVICE reference front(){ return v[0]; } \ + R123_CUDA_DEVICE const_reference front() const{ return v[0]; } \ + R123_CUDA_DEVICE reference back(){ return v[_N-1]; } \ + R123_CUDA_DEVICE const_reference back() const{ return v[_N-1]; } \ + R123_CUDA_DEVICE bool operator==(const r123array##_N##x##W& rhs) const{ \ + /* CUDA3 does not have std::equal */ \ + for (size_t i = 0; i < _N; ++i) \ + if (v[i] != rhs.v[i]) return false; \ + return true; \ + } \ + R123_CUDA_DEVICE bool operator!=(const r123array##_N##x##W& rhs) const{ return !(*this == rhs); } \ + /* CUDA3 does not have std::fill_n */ \ + R123_CUDA_DEVICE void fill(const value_type& val){ for (size_t i = 0; i < _N; ++i) v[i] = val; } \ + R123_CUDA_DEVICE void swap(r123array##_N##x##W& rhs){ \ + /* CUDA3 does not have std::swap_ranges */ \ + for (size_t i = 0; i < _N; ++i) { \ + T tmp = v[i]; \ + v[i] = rhs.v[i]; \ + rhs.v[i] = tmp; \ + } \ + } \ + R123_CUDA_DEVICE r123array##_N##x##W& incr(R123_ULONG_LONG n=1){ \ + /* This test is tricky because we're trying to avoid spurious \ + complaints about illegal shifts, yet still be compile-time \ + evaulated. 
*/ \ + if(sizeof(T)>((sizeof(T)3?3:0] is to silence \ + a spurious error from icpc \ + */ \ + ++v[_N>1?1:0]; \ + if(_N==2 || R123_BUILTIN_EXPECT(!!v[_N>1?1:0], 1)) return *this; \ + ++v[_N>2?2:0]; \ + if(_N==3 || R123_BUILTIN_EXPECT(!!v[_N>2?2:0], 1)) return *this; \ + ++v[_N>3?3:0]; \ + for(size_t i=4; i<_N; ++i){ \ + if( R123_BUILTIN_EXPECT(!!v[i-1], 1) ) return *this; \ + ++v[i]; \ + } \ + return *this; \ + } \ + /* seed(SeedSeq) would be a constructor if having a constructor */ \ + /* didn't cause headaches with defaults */ \ + template \ + R123_CUDA_DEVICE static r123array##_N##x##W seed(SeedSeq &ss){ \ + r123array##_N##x##W ret; \ + const size_t Ngen = _N*((3+sizeof(value_type))/4); \ + uint32_t u32[Ngen]; \ + uint32_t *p32 = &u32[0]; \ + ss.generate(&u32[0], &u32[Ngen]); \ + for(size_t i=0; i<_N; ++i){ \ + ret.v[i] = assemble_from_u32(p32); \ + p32 += (3+sizeof(value_type))/4; \ + } \ + return ret; \ + } \ +protected: \ + R123_CUDA_DEVICE r123array##_N##x##W& incr_carefully(R123_ULONG_LONG n){ \ + /* n may be greater than the maximum value of a single value_type */ \ + value_type vtn; \ + vtn = n; \ + v[0] += n; \ + const unsigned rshift = 8* ((sizeof(n)>sizeof(value_type))? sizeof(value_type) : 0); \ + for(size_t i=1; i<_N; ++i){ \ + if(rshift){ \ + n >>= rshift; \ + }else{ \ + n=0; \ + } \ + if( v[i-1] < vtn ) \ + ++n; \ + if( n==0 ) break; \ + vtn = n; \ + v[i] += n; \ + } \ + return *this; \ + } \ + + +// There are several tricky considerations for the insertion and extraction +// operators: +// - we would like to be able to print r123array16x8 as a sequence of 16 integers, +// not as 16 bytes. +// - we would like to be able to print r123array1xm128i. +// - we do not want an int conversion operator in r123m128i because it causes +// lots of ambiguity problems with automatic promotions. 
+// Solution: r123arrayinsertable and r123arrayextractable + +template +struct r123arrayinsertable{ + const T& v; + r123arrayinsertable(const T& t_) : v(t_) {} + friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable& t){ + return os << t.v; + } +}; + +template<> +struct r123arrayinsertable{ + const uint8_t& v; + r123arrayinsertable(const uint8_t& t_) : v(t_) {} + friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable& t){ + return os << (int)t.v; + } +}; + +template +struct r123arrayextractable{ + T& v; + r123arrayextractable(T& t_) : v(t_) {} + friend std::istream& operator>>(std::istream& is, r123arrayextractable& t){ + return is >> t.v; + } +}; + +template<> +struct r123arrayextractable{ + uint8_t& v; + r123arrayextractable(uint8_t& t_) : v(t_) {} + friend std::istream& operator>>(std::istream& is, r123arrayextractable& t){ + int i; + is >> i; + t.v = i; + return is; + } +}; + +#define CXXOVERLOADS(_N, W, T) \ + \ +inline std::ostream& operator<<(std::ostream& os, const r123array##_N##x##W& a){ \ + os << r123arrayinsertable(a.v[0]); \ + for(size_t i=1; i<_N; ++i) \ + os << " " << r123arrayinsertable(a.v[i]); \ + return os; \ +} \ + \ +inline std::istream& operator>>(std::istream& is, r123array##_N##x##W& a){ \ + for(size_t i=0; i<_N; ++i){ \ + r123arrayextractable x(a.v[i]); \ + is >> x; \ + } \ + return is; \ +} \ + \ +namespace r123{ \ + typedef r123array##_N##x##W Array##_N##x##W; \ +} + +#endif /* __cplusplus */ + +/* _r123array_tpl expands to a declaration of struct r123arrayNxW. + + In C, it's nothing more than a struct containing an array of N + objects of type T. + + In C++ it's the same, but endowed with an assortment of member + functions, typedefs and friends. In C++, r123arrayNxW looks a lot + like std::array, has most of the capabilities of a container, + and satisfies the requirements outlined in compat/Engine.hpp for + counter and key types. 
ArrayNxW, in the r123 namespace is + a typedef equivalent to r123arrayNxW. +*/ + +#define _r123array_tpl(_N, W, T) \ + /** @ingroup arrayNxW */ \ + /** @see arrayNxW */ \ +struct r123array##_N##x##W{ \ + T v[_N]; \ + CXXMETHODS(_N, W, T) \ +}; \ + \ +CXXOVERLOADS(_N, W, T) + +/** @endcond */ + +_r123array_tpl(1, 32, uint32_t) /* r123array1x32 */ +_r123array_tpl(2, 32, uint32_t) /* r123array2x32 */ +_r123array_tpl(4, 32, uint32_t) /* r123array4x32 */ +_r123array_tpl(8, 32, uint32_t) /* r123array8x32 */ + +_r123array_tpl(1, 64, uint64_t) /* r123array1x64 */ +_r123array_tpl(2, 64, uint64_t) /* r123array2x64 */ +_r123array_tpl(4, 64, uint64_t) /* r123array4x64 */ + +_r123array_tpl(16, 8, uint8_t) /* r123array16x8 for ARSsw, AESsw */ + +#if R123_USE_SSE +_r123array_tpl(1, m128i, r123m128i) /* r123array1x128i for ARSni, AESni */ +#endif + +/* In C++, it's natural to use sizeof(a::value_type), but in C it's + pretty convoluted to figure out the width of the value_type of an + r123arrayNxW: +*/ +#define R123_W(a) (8*sizeof(((a *)0)->v[0])) + +/** @namespace r123 + Most of the Random123 C++ API is contained in the r123 namespace. +*/ + +#endif + diff --git a/pyopencl/cl/pyopencl-random123/openclfeatures.h b/pyopencl/cl/pyopencl-random123/openclfeatures.h new file mode 100644 index 0000000000000000000000000000000000000000..af03d3092318c6c27f1a65ce8104c1609b1e66e1 --- /dev/null +++ b/pyopencl/cl/pyopencl-random123/openclfeatures.h @@ -0,0 +1,89 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef __openclfeatures_dot_hpp +#define __openclfeatures_dot_hpp + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#define R123_ASSERT(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 1 +#endif + +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +// XXX ATI APP SDK 2.4 clBuildProgram SEGVs if one uses uint64_t instead of +// ulong to mul_hi. And gets lots of complaints from stdint.h +// on some machines. +// But these typedefs mean we cannot include stdint.h with +// these headers? Do we need R123_64T, R123_32T, R123_8T? +typedef ulong uint64_t; +typedef uint uint32_t; +typedef uchar uint8_t; +#define UINT64_C(x) ((ulong)(x##UL)) + +#endif diff --git a/pyopencl/cl/pyopencl-random123/philox.cl b/pyopencl/cl/pyopencl-random123/philox.cl new file mode 100644 index 0000000000000000000000000000000000000000..c6fe4464bf17cc3b20cb346e77d9ac73ae650275 --- /dev/null +++ b/pyopencl/cl/pyopencl-random123/philox.cl @@ -0,0 +1,486 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _philox_dot_h_ +#define _philox_dot_h_ + +/** \cond HIDDEN_FROM_DOXYGEN */ + +#include "openclfeatures.h" +#include "array.h" + + +/* +// Macros _Foo_tpl are code generation 'templates' They define +// inline functions with names obtained by mangling Foo and the +// macro arguments. E.g., +// _mulhilo_tpl(32, uint32_t, uint64_t) +// expands to a definition of: +// mulhilo32(uint32_t, uint32_t, uint32_t *, uint32_t *) +// We then 'instantiate the template' to define +// several different functions, e.g., +// mulhilo32 +// mulhilo64 +// These functions will be visible to user code, and may +// also be used later in subsequent templates and definitions. + +// A template for mulhilo using a temporary of twice the word-width. 
+// Gcc figures out that this can be reduced to a single 'mul' instruction, +// despite the apparent use of double-wide variables, shifts, etc. It's +// obviously not guaranteed that all compilers will be that smart, so +// other implementations might be preferable, e.g., using an intrinsic +// or an asm block. On the other hand, for 32-bit multiplies, +// this *is* perfectly standard C99 - any C99 compiler should +// understand it and produce correct code. For 64-bit multiplies, +// it's only usable if the compiler recognizes that it can do +// arithmetic on a 128-bit type. That happens to be true for gcc on +// x86-64, and powerpc64 but not much else. +*/ +#define _mulhilo_dword_tpl(W, Word, Dword) \ +R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \ + Dword product = ((Dword)a)*((Dword)b); \ + *hip = product>>W; \ + return (Word)product; \ +} + +/* +// A template for mulhilo using gnu-style asm syntax. +// INSN can be "mulw", "mull" or "mulq". +// FIXME - porting to other architectures, we'll need still-more conditional +// branching here. Note that intrinsics are usually preferable. +*/ +#ifdef __powerpc__ +#define _mulhilo_asm_tpl(W, Word, INSN) \ +R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \ + Word dx = 0; \ + __asm__("\n\t" \ + INSN " %0,%1,%2\n\t" \ + : "=r"(dx) \ + : "r"(b), "r"(ax) \ + ); \ + *hip = dx; \ + return ax*b; \ +} +#else +#define _mulhilo_asm_tpl(W, Word, INSN) \ +R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \ + Word dx; \ + __asm__("\n\t" \ + INSN " %2\n\t" \ + : "=a"(ax), "=d"(dx) \ + : "r"(b), "0"(ax) \ + ); \ + *hip = dx; \ + return ax; \ +} +#endif /* __powerpc__ */ + +/* +// A template for mulhilo using MSVC-style intrinsics +// For example,_umul128 is an msvc intrinsic, c.f. 
+// http://msdn.microsoft.com/en-us/library/3dayytw9.aspx +*/ +#define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN) \ +R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \ + return INTRIN(a, b, hip); \ +} + +/* N.B. This really should be called _mulhilo_mulhi_intrin. It just + happens that CUDA was the first time we used the idiom. */ +#define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN) \ +R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \ + *hip = INTRIN(a, b); \ + return a*b; \ +} + +/* +// A template for mulhilo using only word-size operations and +// C99 operators (no adc, no mulhi). It +// requires four multiplies and a dozen or so shifts, adds +// and tests. It's not clear what this is good for, other than +// completeness. On 32-bit platforms, it could be used to +// implement philoxNx64, but on such platforms both the philoxNx32 +// and the threefryNx64 cbrngs are going to have much better +// performance. It is enabled below by R123_USE_MULHILO64_C99, +// but that is currently (Sep 2011) not set by any of the +// features/XXfeatures.h headers. It can, of course, be +// set with a compile-time -D option. +*/ +#define _mulhilo_c99_tpl(W, Word) \ +R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \ + const unsigned WHALF = W/2; \ + const Word LOMASK = ((((Word)1)<>WHALF; \ + Word alo = a& LOMASK; \ + Word bhi = b>>WHALF; \ + Word blo = b& LOMASK; \ + \ + Word ahbl = ahi*blo; \ + Word albh = alo*bhi; \ + \ + Word ahbl_albh = ((ahbl&LOMASK) + (albh&LOMASK)); \ + Word hi = ahi*bhi + (ahbl>>WHALF) + (albh>>WHALF); \ + hi += ahbl_albh >> WHALF; /* carry from the sum of lo(ahbl) + lo(albh) ) */ \ + /* carry from the sum with alo*blo */ \ + hi += ((lo >> WHALF) < (ahbl_albh&LOMASK)); \ + *hip = hi; \ + return lo; \ +} + +/* +// A template for mulhilo on a platform that can't do it +// We could put a C version here, but is it better to run *VERY* +// slowly or to just stop and force the user to find another CBRNG? 
+*/ +#define _mulhilo_fail_tpl(W, Word) \ +R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \ + R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); \ +} + +/* +// N.B. There's an MSVC intrinsic called _emul, +// which *might* compile into better code than +// _mulhilo_dword_tpl +*/ +#if R123_USE_MULHILO32_ASM +#ifdef __powerpc__ +_mulhilo_asm_tpl(32, uint32_t, "mulhwu") +#else +_mulhilo_asm_tpl(32, uint32_t, "mull") +#endif /* __powerpc__ */ +#else +_mulhilo_dword_tpl(32, uint32_t, uint64_t) +#endif + +#if R123_USE_PHILOX_64BIT +#if R123_USE_MULHILO64_ASM +#ifdef __powerpc64__ +_mulhilo_asm_tpl(64, uint64_t, "mulhdu") +#else +_mulhilo_asm_tpl(64, uint64_t, "mulq") +#endif /* __powerpc64__ */ +#elif R123_USE_MULHILO64_MSVC_INTRIN +_mulhilo_msvc_intrin_tpl(64, uint64_t, _umul128) +#elif R123_USE_MULHILO64_CUDA_INTRIN +_mulhilo_cuda_intrin_tpl(64, uint64_t, __umul64hi) +#elif R123_USE_MULHILO64_OPENCL_INTRIN +_mulhilo_cuda_intrin_tpl(64, uint64_t, mul_hi) +#elif R123_USE_MULHILO64_MULHI_INTRIN +_mulhilo_cuda_intrin_tpl(64, uint64_t, R123_MULHILO64_MULHI_INTRIN) +#elif R123_USE_GNU_UINT128 +_mulhilo_dword_tpl(64, uint64_t, __uint128_t) +#elif R123_USE_MULHILO64_C99 +_mulhilo_c99_tpl(64, uint64_t) +#else +_mulhilo_fail_tpl(64, uint64_t) +#endif +#endif + +/* +// The multipliers and Weyl constants are "hard coded". +// To change them, you can #define them with different +// values before #include-ing this file. +// This isn't terribly elegant, but it works for C as +// well as C++. 
A nice C++-only solution would be to +// use template parameters in the style of +*/ +#ifndef PHILOX_M2x64_0 +#define PHILOX_M2x64_0 R123_64BIT(0xD2B74407B1CE6E93) +#endif + +#ifndef PHILOX_M4x64_0 +#define PHILOX_M4x64_0 R123_64BIT(0xD2E7470EE14C6C93) +#endif + +#ifndef PHILOX_M4x64_1 +#define PHILOX_M4x64_1 R123_64BIT(0xCA5A826395121157) +#endif + +#ifndef PHILOX_M2x32_0 +#define PHILOX_M2x32_0 ((uint32_t)0xd256d193) +#endif + +#ifndef PHILOX_M4x32_0 +#define PHILOX_M4x32_0 ((uint32_t)0xD2511F53) +#endif +#ifndef PHILOX_M4x32_1 +#define PHILOX_M4x32_1 ((uint32_t)0xCD9E8D57) +#endif + +#ifndef PHILOX_W64_0 +#define PHILOX_W64_0 R123_64BIT(0x9E3779B97F4A7C15) /* golden ratio */ +#endif +#ifndef PHILOX_W64_1 +#define PHILOX_W64_1 R123_64BIT(0xBB67AE8584CAA73B) /* sqrt(3)-1 */ +#endif + +#ifndef PHILOX_W32_0 +#define PHILOX_W32_0 ((uint32_t)0x9E3779B9) +#endif +#ifndef PHILOX_W32_1 +#define PHILOX_W32_1 ((uint32_t)0xBB67AE85) +#endif + +#ifndef PHILOX2x32_DEFAULT_ROUNDS +#define PHILOX2x32_DEFAULT_ROUNDS 10 +#endif + +#ifndef PHILOX2x64_DEFAULT_ROUNDS +#define PHILOX2x64_DEFAULT_ROUNDS 10 +#endif + +#ifndef PHILOX4x32_DEFAULT_ROUNDS +#define PHILOX4x32_DEFAULT_ROUNDS 10 +#endif + +#ifndef PHILOX4x64_DEFAULT_ROUNDS +#define PHILOX4x64_DEFAULT_ROUNDS 10 +#endif + +/* The ignored fourth argument allows us to instantiate the + same macro regardless of N. 
*/ +#define _philox2xWround_tpl(W, T) \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key){ \ + T hi; \ + T lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &hi); \ + struct r123array2x##W out = {{hi^key.v[0]^ctr.v[1], lo}}; \ + return out; \ +} +#define _philox2xWbumpkey_tpl(W) \ +R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey( struct r123array1x##W key) { \ + key.v[0] += PHILOX_W##W##_0; \ + return key; \ +} + +#define _philox4xWround_tpl(W, T) \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key){ \ + T hi0; \ + T hi1; \ + T lo0 = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &hi0); \ + T lo1 = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &hi1); \ + struct r123array4x##W out = {{hi1^ctr.v[1]^key.v[0], lo1, \ + hi0^ctr.v[3]^key.v[1], lo0}}; \ + return out; \ +} + +#define _philox4xWbumpkey_tpl(W) \ +R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey( struct r123array2x##W key) { \ + key.v[0] += PHILOX_W##W##_0; \ + key.v[1] += PHILOX_W##W##_1; \ + return key; \ +} + +#define _philoxNxW_tpl(N, Nhalf, W, T) \ +/** @ingroup PhiloxNxW */ \ +enum r123_enum_philox##N##x##W { philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS }; \ +typedef struct r123array##N##x##W philox##N##x##W##_ctr_t; \ +typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t; \ +typedef struct r123array##Nhalf##x##W philox##N##x##W##_ukey_t; \ +R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit(philox##N##x##W##_ukey_t uk) { return uk; } \ 
+R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \ + R123_ASSERT(R<=16); \ + if(R>0){ ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>1){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>2){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>3){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>4){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>5){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>6){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>7){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>8){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>9){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>10){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>11){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>12){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>13){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>14){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + if(R>15){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ + return ctr; \ +} + +_philox2xWbumpkey_tpl(32) +_philox4xWbumpkey_tpl(32) +_philox2xWround_tpl(32, uint32_t) /* philo2x32round */ +_philox4xWround_tpl(32, uint32_t) 
/* philo4x32round */ +/** \endcond */ +_philoxNxW_tpl(2, 1, 32, uint32_t) /* philox2x32bijection */ +_philoxNxW_tpl(4, 2, 32, uint32_t) /* philox4x32bijection */ +#if R123_USE_PHILOX_64BIT +/** \cond HIDDEN_FROM_DOXYGEN */ +_philox2xWbumpkey_tpl(64) +_philox4xWbumpkey_tpl(64) +_philox2xWround_tpl(64, uint64_t) /* philo2x64round */ +_philox4xWround_tpl(64, uint64_t) /* philo4x64round */ +/** \endcond */ +_philoxNxW_tpl(2, 1, 64, uint64_t) /* philox2x64bijection */ +_philoxNxW_tpl(4, 2, 64, uint64_t) /* philox4x64bijection */ +#endif /* R123_USE_PHILOX_64BIT */ + +#define philox2x32(c,k) philox2x32_R(philox2x32_rounds, c, k) +#define philox4x32(c,k) philox4x32_R(philox4x32_rounds, c, k) +#if R123_USE_PHILOX_64BIT +#define philox2x64(c,k) philox2x64_R(philox2x64_rounds, c, k) +#define philox4x64(c,k) philox4x64_R(philox4x64_rounds, c, k) +#endif /* R123_USE_PHILOX_64BIT */ + +#ifdef __cplusplus +#include + +/** \cond HIDDEN_FROM_DOXYGEN */ + +#define _PhiloxNxW_base_tpl(CType, KType, N, W) \ +namespace r123{ \ +template \ +struct Philox##N##x##W##_R{ \ + typedef CType ctr_type; \ + typedef KType key_type; \ + typedef KType ukey_type; \ + static const unsigned int rounds=ROUNDS; \ + inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ \ + R123_STATIC_ASSERT(ROUNDS<=16, "philox is only unrolled up to 16 rounds\n"); \ + return philox##N##x##W##_R(ROUNDS, ctr, key); \ + } \ +}; \ +typedef Philox##N##x##W##_R Philox##N##x##W; \ + } // namespace r123 +/** \endcond */ + +_PhiloxNxW_base_tpl(r123array2x32, r123array1x32, 2, 32) // Philox2x32_R +_PhiloxNxW_base_tpl(r123array4x32, r123array2x32, 4, 32) // Philox4x32_R +#if R123_USE_PHILOX_64BIT +_PhiloxNxW_base_tpl(r123array2x64, r123array1x64, 2, 64) // Philox2x64_R +_PhiloxNxW_base_tpl(r123array4x64, r123array2x64, 4, 64) // Philox4x64_R +#endif + +/* The _tpl macros don't quite work to do string-pasting inside comments. 
+ so we just write out the boilerplate documentation four times... */
+
+/**
+@defgroup PhiloxNxW Philox Classes and Typedefs
+
+The PhiloxNxW classes export the member functions, typedefs and
+operator overloads required by a @ref CBRNG "CBRNG" class.
+
+As described in
+"Parallel Random Numbers: As Easy as 1, 2, 3",
+the Philox family of counter-based RNGs uses integer multiplication, xor and permutation of W-bit words
+to scramble its N-word input key. Philox is a mnemonic for Product HI LO Xor.
+
+
+@class r123::Philox2x32_R
+@ingroup PhiloxNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Philox round
+function will be applied.
+
+As of November 2011, the authors know of no statistical flaws with
+ROUNDS=6 or more for Philox2x32.
+
+@typedef r123::Philox2x32
+@ingroup PhiloxNxW
+ Philox2x32 is equivalent to Philox2x32_R<10>. With 10 rounds,
+ Philox2x32 has a considerable safety margin over the minimum number
+ of rounds with no known statistical flaws, but still has excellent
+ performance.
+
+
+
+@class r123::Philox2x64_R
+@ingroup PhiloxNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Philox round
+function will be applied.
+
+As of September 2011, the authors know of no statistical flaws with
+ROUNDS=6 or more for Philox2x64.
+
+@typedef r123::Philox2x64
+@ingroup PhiloxNxW
+ Philox2x64 is equivalent to Philox2x64_R<10>. With 10 rounds,
+ Philox2x64 has a considerable safety margin over the minimum number
+ of rounds with no known statistical flaws, but still has excellent
+ performance.
+
+
+
+@class r123::Philox4x32_R
+@ingroup PhiloxNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+ +The template argument, ROUNDS, is the number of times the Philox round +function will be applied. + +In November 2011, the authors recorded some suspicious p-values (approximately 1.e-7) from +some very long (longer than the default BigCrush length) SimpPoker tests. Despite +the fact that even longer tests reverted to "passing" p-values, a cloud remains over +Philox4x32 with 7 rounds. The authors know of no statistical flaws with +ROUNDS=8 or more for Philox4x32. + +@typedef r123::Philox4x32 +@ingroup PhiloxNxW + Philox4x32 is equivalent to Philox4x32_R<10>. With 10 rounds, + Philox4x32 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. + + + +@class r123::Philox4x64_R +@ingroup PhiloxNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Philox round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=7 or more for Philox4x64. + +@typedef r123::Philox4x64 +@ingroup PhiloxNxW + Philox4x64 is equivalent to Philox4x64_R<10>. With 10 rounds, + Philox4x64 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. +*/ + +#endif /* __cplusplus */ + +#endif /* _philox_dot_h_ */ diff --git a/pyopencl/cl/pyopencl-random123/threefry.cl b/pyopencl/cl/pyopencl-random123/threefry.cl new file mode 100644 index 0000000000000000000000000000000000000000..1a83ddc1d32b05376a2278ad625ba2c5bc14bedb --- /dev/null +++ b/pyopencl/cl/pyopencl-random123/threefry.cl @@ -0,0 +1,864 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _threefry_dot_h_ +#define _threefry_dot_h_ +#include "openclfeatures.h" +#include "array.h" + +/** \cond HIDDEN_FROM_DOXYGEN */ +/* Significant parts of this file were copied from + from: + Skein_FinalRnd/ReferenceImplementation/skein.h + Skein_FinalRnd/ReferenceImplementation/skein_block.c + + in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip + + This file has been modified so that it may no longer perform its originally + intended function. 
If you're looking for a Skein or Threefish source code, + please consult the original file. + + The original file had the following header: +************************************************************************** +** +** Interface declarations and internal definitions for Skein hashing. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +*************************************************************************** + +*/ + +/* See comment at the top of philox.h for the macro pre-process + strategy. */ + +/* Rotation constants: */ +enum r123_enum_threefry64x4 { + /* These are the R_256 constants from the Threefish reference sources + with names changed to R_64x4... */ + R_64x4_0_0=14, R_64x4_0_1=16, + R_64x4_1_0=52, R_64x4_1_1=57, + R_64x4_2_0=23, R_64x4_2_1=40, + R_64x4_3_0= 5, R_64x4_3_1=37, + R_64x4_4_0=25, R_64x4_4_1=33, + R_64x4_5_0=46, R_64x4_5_1=12, + R_64x4_6_0=58, R_64x4_6_1=22, + R_64x4_7_0=32, R_64x4_7_1=32 +}; + +enum r123_enum_threefry64x2 { + /* + // Output from skein_rot_search: (srs64_B64-X1000) + // Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57 + // Start: Tue Mar 1 10:07:48 2011 + // rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format + */ + R_64x2_0_0=16, + R_64x2_1_0=42, + R_64x2_2_0=12, + R_64x2_3_0=31, + R_64x2_4_0=16, + R_64x2_5_0=32, + R_64x2_6_0=24, + R_64x2_7_0=21 + /* 4 rounds: minHW = 4 [ 4 4 4 4 ] + // 5 rounds: minHW = 8 [ 8 8 8 8 ] + // 6 rounds: minHW = 16 [ 16 16 16 16 ] + // 7 rounds: minHW = 32 [ 32 32 32 32 ] + // 8 rounds: minHW = 64 [ 64 64 64 64 ] + // 9 rounds: minHW = 64 [ 64 64 64 64 ] + //10 rounds: minHW = 64 [ 64 64 64 64 ] + //11 rounds: minHW = 64 [ 64 64 64 64 ] */ +}; + +enum r123_enum_threefry32x4 { + /* Output from skein_rot_search: (srs-B128-X5000.out) + // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. 
rounds = 8, minHW_or=28 + // Start: Mon Aug 24 22:41:36 2009 + // ... + // rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format */ + R_32x4_0_0=10, R_32x4_0_1=26, + R_32x4_1_0=11, R_32x4_1_1=21, + R_32x4_2_0=13, R_32x4_2_1=27, + R_32x4_3_0=23, R_32x4_3_1= 5, + R_32x4_4_0= 6, R_32x4_4_1=20, + R_32x4_5_0=17, R_32x4_5_1=11, + R_32x4_6_0=25, R_32x4_6_1=10, + R_32x4_7_0=18, R_32x4_7_1=20 + + /* 4 rounds: minHW = 3 [ 3 3 3 3 ] + // 5 rounds: minHW = 7 [ 7 7 7 7 ] + // 6 rounds: minHW = 12 [ 13 12 13 12 ] + // 7 rounds: minHW = 22 [ 22 23 22 23 ] + // 8 rounds: minHW = 31 [ 31 31 31 31 ] + // 9 rounds: minHW = 32 [ 32 32 32 32 ] + //10 rounds: minHW = 32 [ 32 32 32 32 ] + //11 rounds: minHW = 32 [ 32 32 32 32 ] */ + +}; + +enum r123_enum_threefry32x2 { + /* Output from skein_rot_search (srs32x2-X5000.out) + // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28 + // Start: Tue Jul 12 11:11:33 2011 + // rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. 
blkSize= 64].format */ + R_32x2_0_0=13, + R_32x2_1_0=15, + R_32x2_2_0=26, + R_32x2_3_0= 6, + R_32x2_4_0=17, + R_32x2_5_0=29, + R_32x2_6_0=16, + R_32x2_7_0=24 + + /* 4 rounds: minHW = 4 [ 4 4 4 4 ] + // 5 rounds: minHW = 6 [ 6 8 6 8 ] + // 6 rounds: minHW = 9 [ 9 12 9 12 ] + // 7 rounds: minHW = 16 [ 16 24 16 24 ] + // 8 rounds: minHW = 32 [ 32 32 32 32 ] + // 9 rounds: minHW = 32 [ 32 32 32 32 ] + //10 rounds: minHW = 32 [ 32 32 32 32 ] + //11 rounds: minHW = 32 [ 32 32 32 32 ] */ + }; + +enum r123_enum_threefry_wcnt { + WCNT2=2, + WCNT4=4 +}; +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N)); +R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N) +{ + return (x << (N & 63)) | (x >> ((64-N) & 63)); +} + +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N)); +R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N) +{ + return (x << (N & 31)) | (x >> ((32-N) & 31)); +} + +#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32)) +#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) +#define SKEIN_KS_PARITY32 0x1BD11BDA + +#ifndef THREEFRY2x32_DEFAULT_ROUNDS +#define THREEFRY2x32_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY2x64_DEFAULT_ROUNDS +#define THREEFRY2x64_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY4x32_DEFAULT_ROUNDS +#define THREEFRY4x32_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY4x64_DEFAULT_ROUNDS +#define THREEFRY4x64_DEFAULT_ROUNDS 20 +#endif + +#define _threefry2x_tpl(W) \ +typedef struct r123array2x##W threefry2x##W##_ctr_t; \ +typedef struct r123array2x##W threefry2x##W##_key_t; \ +typedef struct r123array2x##W threefry2x##W##_ukey_t; \ +R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, 
threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \ + threefry2x##W##_ctr_t X; \ + uint##W##_t ks[2+1]; \ + int i; /* avoid size_t to avoid need for stddef.h */ \ + R123_ASSERT(Nrounds<=32); \ + ks[2] = SKEIN_KS_PARITY##W; \ + for (i=0;i < 2; i++) \ + { \ + ks[i] = k.v[i]; \ + X.v[i] = in.v[i]; \ + ks[2] ^= k.v[i]; \ + } \ + \ + /* Insert initial key before round 0 */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; \ + \ + if(Nrounds>0){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>1){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>2){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>3){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>3){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; \ + X.v[1] += 1; /* X.v[2-1] += r */ \ + } \ + if(Nrounds>4){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>5){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>6){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>7){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>7){ \ + /* InjectKey(r=2) */ \ + X.v[0] += ks[2]; X.v[1] += ks[0]; \ + X.v[1] += 2; \ + } \ + if(Nrounds>8){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>9){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>10){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>11){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>11){ 
\ + /* InjectKey(r=3) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; \ + X.v[1] += 3; \ + } \ + if(Nrounds>12){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>13){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>14){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>15){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>15){ \ + /* InjectKey(r=4) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; \ + X.v[1] += 4; \ + } \ + if(Nrounds>16){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>17){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>18){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>19){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>19){ \ + /* InjectKey(r=5) */ \ + X.v[0] += ks[2]; X.v[1] += ks[0]; \ + X.v[1] += 5; \ + } \ + if(Nrounds>20){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>21){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>22){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>23){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>23){ \ + /* InjectKey(r=6) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; \ + X.v[1] += 6; \ + } \ + if(Nrounds>24){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>25){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>26){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>27){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + 
if(Nrounds>27){ \ + /* InjectKey(r=7) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; \ + X.v[1] += 7; \ + } \ + if(Nrounds>28){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>29){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>30){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>31){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>31){ \ + /* InjectKey(r=8) */ \ + X.v[0] += ks[2]; X.v[1] += ks[0]; \ + X.v[1] += 8; \ + } \ + return X; \ +} \ + /** @ingroup ThreefryNxW */ \ +enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \ + return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \ +} + + +#define _threefry4x_tpl(W) \ +typedef struct r123array4x##W threefry4x##W##_ctr_t; \ +typedef struct r123array4x##W threefry4x##W##_key_t; \ +typedef struct r123array4x##W threefry4x##W##_ukey_t; \ +R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \ + threefry4x##W##_ctr_t X; \ + uint##W##_t ks[4+1]; \ + int i; /* avoid size_t to avoid need for stddef.h */ \ + R123_ASSERT(Nrounds<=72); \ + ks[4] = SKEIN_KS_PARITY##W; \ + for (i=0;i < 4; i++) \ + { \ + ks[i] = k.v[i]; \ + X.v[i] = in.v[i]; \ + 
ks[4] ^= k.v[i]; \ + } \ + \ + /* Insert initial key before round 0 */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + \ + if(Nrounds>0){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>1){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>2){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>3){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>3){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 1; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>4){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>5){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>6){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>7){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>7){ \ + /* InjectKey(r=2) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 2; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>8){ 
\ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>9){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>10){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>11){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>11){ \ + /* InjectKey(r=3) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 3; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>12){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>13){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>14){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>15){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>15){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ + X.v[4-1] += 4; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>16){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= 
X.v[2]; \ + } \ + if(Nrounds>17){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>18){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>19){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>19){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + X.v[4-1] += 5; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>20){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>21){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>22){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>23){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>23){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 6; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>24){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>25){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = 
RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>26){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>27){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>27){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 7; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>28){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>29){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>30){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>31){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>31){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 8; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>32){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>33){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>34){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= 
X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>35){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>35){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ + X.v[4-1] += 9; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>36){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>37){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>38){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>39){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>39){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + X.v[4-1] += 10; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>40){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>41){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>42){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>43){ \ + X.v[0] += X.v[3]; X.v[3] = 
RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>43){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 11; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>44){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>45){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>46){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>47){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>47){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 12; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>48){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>49){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>50){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>51){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + 
if(Nrounds>51){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 13; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>52){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>53){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>54){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>55){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>55){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ + X.v[4-1] += 14; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>56){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>57){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>58){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>59){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>59){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + X.v[4-1] += 15; /* X.v[WCNT4-1] += r 
*/ \ + } \ + \ + if(Nrounds>60){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>61){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>62){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>63){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>63){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 16; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>64){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>65){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>66){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>67){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>67){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 17; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>68){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = 
RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>69){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>70){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>71){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>71){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 18; /* X.v[WCNT4-1] += r */ \ + } \ + \ + return X; \ +} \ + /** @ingroup ThreefryNxW */ \ +enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \ + return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \ +} +/** \endcond */ + +_threefry2x_tpl(64) +_threefry2x_tpl(32) +_threefry4x_tpl(64) +_threefry4x_tpl(32) + +/* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better + than a static inline function. Why? 
*/ +#define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k) +#define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k) +#define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k) +#define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k) + +#ifdef __cplusplus +/** \cond HIDDEN_FROM_DOXYGEN */ +#define _threefryNxWclass_tpl(NxW) \ +namespace r123{ \ +template \ + struct Threefry##NxW##_R{ \ + typedef threefry##NxW##_ctr_t ctr_type; \ + typedef threefry##NxW##_key_t key_type; \ + typedef threefry##NxW##_key_t ukey_type; \ + static const unsigned int rounds=R; \ + inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \ + R123_STATIC_ASSERT(R<=72, "threefry is only unrolled up to 72 rounds\n"); \ + return threefry##NxW##_R(R, ctr, key); \ + } \ +}; \ + typedef Threefry##NxW##_R Threefry##NxW; \ +} // namespace r123 + +/** \endcond */ + +_threefryNxWclass_tpl(2x32) +_threefryNxWclass_tpl(4x32) +_threefryNxWclass_tpl(2x64) +_threefryNxWclass_tpl(4x64) + +/* The _tpl macros don't quite work to do string-pasting inside comments. + so we just write out the boilerplate documentation four times... */ + +/** +@defgroup ThreefryNxW Threefry Classes and Typedefs + +The ThreefryNxW classes export the member functions, typedefs and +operator overloads required by a @ref CBRNG "CBRNG" class. + +As described in +Parallel Random Numbers: As Easy as 1, 2, 3 , +the Threefry family is closely related to the Threefish block cipher from + Skein Hash Function. +Threefry is \b not suitable for cryptographic use. + +Threefry uses integer addition, bitwise rotation, xor and permutation of words to randomize its output. + +@class r123::Threefry2x32_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. 
+ +As of September 2011, the authors know of no statistical flaws with +ROUNDS=13 or more for Threefry2x32. + +@typedef r123::Threefry2x32 +@ingroup ThreefryNxW + Threefry2x32 is equivalent to Threefry2x32_R<20>. With 20 rounds, + Threefry2x32 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. + +@class r123::Threefry2x64_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +In November 2011, the authors discovered that 13 rounds of +Threefry2x64 sequenced by strided, interleaved key and counter +increments failed a very long (longer than the default BigCrush +length) WeightDistrub test. At the same time, it was confirmed that +14 rounds passes much longer tests (up to 5x10^12 samples) of a +similar nature. The authors know of no statistical flaws with +ROUNDS=14 or more for Threefry2x64. + +@typedef r123::Threefry2x64 +@ingroup ThreefryNxW + Threefry2x64 is equivalent to Threefry2x64_R<20>. With 20 rounds, + Threefry2x64 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. + + + +@class r123::Threefry4x32_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=12 or more for Threefry4x32. + +@typedef r123::Threefry4x32 +@ingroup ThreefryNxW + Threefry4x32 is equivalent to Threefry4x32_R<20>. With 20 rounds, + Threefry4x32 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. 
+ + + +@class r123::Threefry4x64_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=12 or more for Threefry4x64. + +@typedef r123::Threefry4x64 +@ingroup ThreefryNxW + Threefry4x64 is equivalent to Threefry4x64_R<20>. With 20 rounds, + Threefry4x64 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. +*/ + +#endif + +#endif diff --git a/pyopencl/cl/pyopencl-ranluxcl.cl b/pyopencl/cl/pyopencl-ranluxcl.cl index a40cadf1e3f5e28a390eb796d7ac7aaeda47a48a..eac9e18de850cd94b2276475e8b9a5f3110e80da 100644 --- a/pyopencl/cl/pyopencl-ranluxcl.cl +++ b/pyopencl/cl/pyopencl-ranluxcl.cl @@ -1,3 +1,6 @@ +/* RanluxCL is deprecated in PyOpenCL and will be removed in the 2018.x + * versions of the package. */ + #ifndef RANLUXCL_CL #define RANLUXCL_CL diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py index 7d68800ffd973a3e7c6791a926a922cc6b08fc36..6d4b2222fd9843449753a760efcf83fdcf6716b2 100644 --- a/pyopencl/clrandom.py +++ b/pyopencl/clrandom.py @@ -1,8 +1,7 @@ # encoding: utf8 -from __future__ import division -from __future__ import absolute_import +from __future__ import division, absolute_import -__copyright__ = "Copyright (C) 2009 Andreas Kloeckner" +__copyright__ = "Copyright (C) 2009-16 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -28,27 +27,36 @@ THE SOFTWARE. # {{{ documentation __doc__ = u""" -PyOpenCL now includes and uses the `RANLUXCL random number generator -`_ by Ivar Ursin Nikolaisen. 
In -addition to being usable through the convenience functions above, it is -available in any piece of code compiled through PyOpenCL by:: +PyOpenCL now includes and uses some of the `Random123 random number generators +`_ by D.E. Shaw +Research. In addition to being usable through the convenience functions above, +they are available in any piece of code compiled through PyOpenCL by:: - #include + #include + #include -See the `source -`_ -for some documentation if you're planning on using RANLUXCL directly. +See the `Philox source +`_ +and the `Threefry source +`_ +for some documentation if you're planning on using Random123 directly. -The RANLUX generator is described in the following two articles. If you use the -generator for scientific purposes, please consider citing them: +.. note:: -* Martin Lüscher, A portable high-quality random number generator for lattice - field theory simulations, `Computer Physics Communications 79 (1994) 100-110 - `_ + PyOpenCL previously had documented support for the `RANLUXCL random number + generator `_ by Ivar Ursin + Nikolaisen. This support is now deprecated because of the general slowness + of these generators and will be removed from PyOpenCL in the 2018.x series. + All users are encouraged to switch to one of the Random123 genrators, + :class:`PhiloxGenerator` or :class:`ThreefryGenerator`. + +.. autoclass:: PhiloxGenerator + +.. autoclass:: ThreefryGenerator + +.. autofunction:: rand +.. autofunction:: fill_rand -* F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom - number generator of Lüscher, `Computer Physics Communications 79 (1994) 111-114 - `_ """ # }}} @@ -61,8 +69,14 @@ from pytools import memoize_method import numpy as np +# {{{ RanluxGenerator (deprecated) + class RanluxGenerator(object): """ + .. warning:: + + This class is deprecated, to be removed in PyOpenCL 2018.x. + .. versionadded:: 2011.2 .. 
attribute:: state @@ -390,20 +404,308 @@ class RanluxGenerator(object): self.get_sync_kernel()(queue, (self.num_work_items,), self.wg_size, self.state.data) +# }}} + + +# {{{ Random123 generators + +class Random123GeneratorBase(object): + """ + .. versionadded:: 2016.2 + + .. automethod:: fill_uniform + .. automethod:: uniform + .. automethod:: fill_normal + .. automethod:: normal + """ + + def __init__(self, context, key=None, counter=None): + int32_info = np.iinfo(np.int32) + + if key is None: + from random import randrange + key = [randrange(int(int32_info.min), int(int32_info.max)+1) + for i in range(self.key_length-1)] + if counter is None: + from random import randrange + counter = [randrange(int(int32_info.min), int(int32_info.max)+1) + for i in range(4)] + + self.context = context + self.key = key + self.counter = counter + + self.counter_max = int32_info.max + + @memoize_method + def get_gen_kernel(self, dtype, distribution): + size_multiplier = 1 + arg_dtype = dtype + + rng_key = (distribution, dtype) + + if rng_key in [("uniform", np.float64), ("normal", np.float64)]: + c_type = "double" + scale1_const = "((double) %r)" % (1/2**32) + scale2_const = "((double) %r)" % (1/2**64) + if distribution == "normal": + transform = "box_muller" + else: + transform = "" + + rng_expr = ( + "shift + scale * " + "%s( %s * convert_double4(gen)" + "+ %s * convert_double4(gen))" + % (transform, scale1_const, scale2_const)) + + counter_multiplier = 2 + + elif rng_key in [(dist, cmp_dtype) + for dist in ["normal", "uniform"] + for cmp_dtype in [ + np.float32, + cl.array.vec.float2, + cl.array.vec.float3, + cl.array.vec.float4, + ]]: + c_type = "float" + scale_const = "((float) %r)" % (1/2**32) + + if distribution == "normal": + transform = "box_muller" + else: + transform = "" + + rng_expr = ( + "shift + scale * %s(%s * convert_float4(gen))" + % (transform, scale_const)) + counter_multiplier = 1 + arg_dtype = np.float32 + try: + _, size_multiplier = 
cl.array.vec.type_to_scalar_and_count[dtype] + except KeyError: + pass + + elif rng_key == ("uniform", np.int32): + c_type = "int" + rng_expr = ( + "shift + convert_int4((convert_long4(gen) * scale) / %s)" + % (str(2**32)+"l") + ) + counter_multiplier = 1 + + elif rng_key == ("uniform", np.int64): + c_type = "long" + rng_expr = ( + "shift" + "+ convert_long4(gen) * (scale/two32) " + "+ ((convert_long4(gen) * scale) / two32)" + .replace("two32", (str(2**32)+"l"))) + counter_multiplier = 2 + + else: + raise TypeError( + "unsupported RNG distribution/data type combination '%s/%s'" + % rng_key) + + src = """//CL// + #include <%(header_name)s> + + typedef %(output_t)s output_t; + typedef %(output_t)s4 output_vec_t; + typedef %(gen_name)s_ctr_t ctr_t; + typedef %(gen_name)s_key_t key_t; + + uint4 gen_bits(key_t *key, ctr_t *ctr) + { + union { + ctr_t ctr_el; + uint4 vec_el; + } u; + + u.ctr_el = %(gen_name)s(*ctr, *key); + if (++ctr->v[0] == 0) + if (++ctr->v[1] == 0) + ++ctr->v[2]; + + return u.vec_el; + } + + #if %(include_box_muller)s + output_vec_t box_muller(output_vec_t x) + { + #define BOX_MULLER(I, COMPA, COMPB) \ + output_t r##I = sqrt(-2*log(x.COMPA)); \ + output_t c##I; \ + output_t s##I = sincos((output_t) (2*M_PI) * x.COMPB, &c##I); + + BOX_MULLER(0, x, y); + BOX_MULLER(1, z, w); + return (output_vec_t) (r0*c0, r0*s0, r1*c1, r1*s1); + } + #endif + + #define GET_RANDOM_NUM(gen) %(rng_expr)s + + kernel void generate( + int k1, + #if %(key_length)s > 2 + int k2, int k3, + #endif + int c0, int c1, int c2, int c3, + global output_t *output, + long out_size, + output_t scale, + output_t shift) + { + #if %(key_length)s == 2 + key_t k = {{get_global_id(0), k1}}; + #else + key_t k = {{get_global_id(0), k1, k2, k3}}; + #endif + + ctr_t c = {{c0, c1, c2, c3}}; + + // output bulk + unsigned long idx = get_global_id(0)*4; + while (idx + 4 < out_size) + { + *(global output_vec_t *) (output + idx) = + GET_RANDOM_NUM(gen_bits(&k, &c)); + idx += 4*get_global_size(0); + } + + 
// output tail + output_vec_t tail_ran = GET_RANDOM_NUM(gen_bits(&k, &c)); + if (idx < out_size) + output[idx] = tail_ran.x; + if (idx+1 < out_size) + output[idx+1] = tail_ran.y; + if (idx+2 < out_size) + output[idx+2] = tail_ran.z; + if (idx+3 < out_size) + output[idx+3] = tail_ran.w; + } + """ % { + "gen_name": self.generator_name, + "header_name": self.header_name, + "output_t": c_type, + "key_length": self.key_length, + "include_box_muller": int(distribution == "normal"), + "rng_expr": rng_expr + } + + prg = cl.Program(self.context, src).build() + knl = prg.generate + knl.set_scalar_arg_dtypes( + [np.int32] * (self.key_length - 1 + 4) + + [None, np.int64, arg_dtype, arg_dtype]) + + return knl, counter_multiplier, size_multiplier + + def _fill(self, distribution, ary, scale, shift, queue=None): + """Fill *ary* with uniformly distributed random numbers in the interval + *(a, b)*, endpoints excluded. + + :return: a :class:`pyopencl.Event` + """ + + if queue is None: + queue = ary.queue + + knl, counter_multiplier, size_multiplier = \ + self.get_gen_kernel(ary.dtype, distribution) + + args = self.key + self.counter + [ + ary.data, ary.size*size_multiplier, + scale, shift] + + n = ary.size + from pyopencl.array import splay + gsize, lsize = splay(queue, ary.size) + + evt = knl(queue, gsize, lsize, *args) + + self.counter[0] += n * counter_multiplier + c1_incr, self.counter[0] = divmod(self.counter[0], self.counter_max) + if c1_incr: + self.counter[1] += c1_incr + c2_incr, self.counter[1] = divmod(self.counter[1], self.counter_max) + self.counter[2] += c2_incr + + return evt + + def fill_uniform(self, ary, a=0, b=1, queue=None): + return self._fill("uniform", ary, + scale=(b-a), shift=a, queue=queue) + + def uniform(self, *args, **kwargs): + """Make a new empty array, apply :meth:`fill_uniform` to it. 
+ """ + a = kwargs.pop("a", 0) + b = kwargs.pop("b", 1) + + result = cl_array.empty(*args, **kwargs) + + self.fill_uniform(result, queue=result.queue, a=a, b=b) + return result + + def fill_normal(self, ary, mu=0, sigma=1, queue=None): + """Fill *ary* with normally distributed numbers with mean *mu* and + standard deviation *sigma*. + """ + + return self._fill("normal", ary, scale=sigma, shift=mu, queue=queue) + + def normal(self, *args, **kwargs): + """Make a new empty array, apply :meth:`fill_normal` to it. + """ + mu = kwargs.pop("mu", 0) + sigma = kwargs.pop("sigma", 1) + + result = cl_array.empty(*args, **kwargs) + + result.add_event( + self.fill_normal(result, queue=result.queue, mu=mu, sigma=sigma)) + return result + + +class PhiloxGenerator(Random123GeneratorBase): + header_name = "pyopencl-random123/philox.cl" + generator_name = "philox4x32" + key_length = 2 + + +class ThreefryGenerator(Random123GeneratorBase): + header_name = "pyopencl-random123/threefry.cl" + generator_name = "threefry4x32" + key_length = 4 + +# }}} + @first_arg_dependent_memoize -def _get_generator(queue, luxury=None): - gen = RanluxGenerator(queue, luxury=luxury) - queue.finish() +def _get_generator(context): + if context.devices[0].type & cl.device_type.CPU: + gen = PhiloxGenerator(context) + else: + gen = ThreefryGenerator(context) + return gen -def fill_rand(result, queue=None, luxury=4, a=0, b=1): +def fill_rand(result, queue=None, luxury=None, a=0, b=1): """Fill *result* with random values of `dtype` in the range [0,1). """ + if luxury is not None: + from warnings import warn + warn("Specifying the 'luxury' argument is deprecated and will stop being " + "supported in PyOpenCL 2018.x", stacklevel=2) + if queue is None: queue = result.queue - gen = _get_generator(queue, luxury=luxury) + gen = _get_generator(queue.context) gen.fill_uniform(result, a=a, b=b) @@ -412,8 +714,13 @@ def rand(queue, shape, dtype, luxury=None, a=0, b=1): in the range [a,b). 
""" + if luxury is not None: + from warnings import warn + warn("Specifying the 'luxury' argument is deprecated and will stop being " + "supported in PyOpenCL 2018.x", stacklevel=2) + from pyopencl.array import Array - gen = _get_generator(queue, luxury) + gen = _get_generator(queue.context) result = Array(queue, shape, dtype) result.add_event( gen.fill_uniform(result, a=a, b=b)) diff --git a/test/test_array.py b/test/test_array.py index ff2b94ca1413a27282876014023aa0107844461d..7b48a95410638d3ea0ab7d0fa9925de836add785 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -38,6 +38,8 @@ from pyopencl.tools import ( # noqa from pyopencl.characterize import has_double_support, has_struct_arg_count_bug from pyopencl.cffi_cl import _PYPY +from pyopencl.clrandom import RanluxGenerator, PhiloxGenerator, ThreefryGenerator + # {{{ helpers @@ -413,54 +415,79 @@ def test_divide_array(ctx_factory): # {{{ RNG -def test_random_float_in_range(ctx_factory): +@pytest.mark.parametrize("rng_class", + [RanluxGenerator, PhiloxGenerator, ThreefryGenerator]) +@pytest.mark.parametrize("ary_size", [300, 301, 302, 303, 10007]) +def test_random_float_in_range(ctx_factory, rng_class, ary_size, plot_hist=False): context = ctx_factory() queue = cl.CommandQueue(context) - from pyopencl.clrandom import RanluxGenerator - if has_double_support(context.devices[0]): dtypes = [np.float32, np.float64] else: dtypes = [np.float32] - gen = RanluxGenerator(queue, 5120) + if rng_class is RanluxGenerator: + gen = rng_class(queue, 5120) + else: + gen = rng_class(context) + + for dtype in dtypes: + print(dtype) + ran = cl_array.zeros(queue, ary_size, dtype) + gen.fill_uniform(ran) + + if plot_hist: + import matplotlib.pyplot as pt + pt.hist(ran.get(), 30) + pt.show() - for ary_size in [300, 301, 302, 303, 10007]: - for dtype in dtypes: - ran = cl_array.zeros(queue, ary_size, dtype) - gen.fill_uniform(ran) - assert (0 < ran.get()).all() - assert (ran.get() < 1).all() + assert (0 < ran.get()).all() + 
assert (ran.get() < 1).all() + if rng_class is RanluxGenerator: gen.synchronize(queue) - ran = cl_array.zeros(queue, ary_size, dtype) - gen.fill_uniform(ran, a=4, b=7) - assert (4 < ran.get()).all() - assert (ran.get() < 7).all() + ran = cl_array.zeros(queue, ary_size, dtype) + gen.fill_uniform(ran, a=4, b=7) + assert (4 < ran.get()).all() + assert (ran.get() < 7).all() - ran = gen.normal(queue, (10007,), dtype, mu=4, sigma=3) + ran = gen.normal(queue, ary_size, dtype, mu=10, sigma=3) + + if plot_hist: + import matplotlib.pyplot as pt + pt.hist(ran.get(), 30) + pt.show() @pytest.mark.parametrize("dtype", [np.int32, np.int64]) -def test_random_int_in_range(ctx_factory, dtype): +@pytest.mark.parametrize("rng_class", + [RanluxGenerator, PhiloxGenerator, ThreefryGenerator]) +def test_random_int_in_range(ctx_factory, rng_class, dtype, plot_hist=False): context = ctx_factory() queue = cl.CommandQueue(context) - from pyopencl.clrandom import RanluxGenerator - gen = RanluxGenerator(queue, 5120) + if rng_class is RanluxGenerator: + gen = rng_class(queue, 5120) + else: + gen = rng_class(context) + + # if (dtype == np.int64 + # and context.devices[0].platform.vendor.startswith("Advanced Micro")): + # pytest.xfail("AMD miscompiles 64-bit RNG math") + + ran = gen.uniform(queue, (10000007,), dtype, a=200, b=300).get() + assert (200 <= ran).all() + assert (ran < 300).all() - if (dtype == np.int64 - and context.devices[0].platform.vendor.startswith("Advanced Micro")): - pytest.xfail("AMD miscompiles 64-bit RNG math") + print(np.min(ran), np.max(ran)) + assert np.max(ran) > 295 - ran = gen.uniform(queue, (10000007,), dtype, a=200, b=300) - assert (200 <= ran.get()).all() - assert (ran.get() < 300).all() - #from matplotlib import pyplot as pt - #pt.hist(ran.get()) - #pt.show() + if plot_hist: + from matplotlib import pyplot as pt + pt.hist(ran) + pt.show() # }}}