Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
P
pyopencl
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Andreas Klöckner
pyopencl
Commits
7c83a521
Commit
7c83a521
authored
12 years ago
by
Andreas Klöckner
Browse files
Options
Downloads
Patches
Plain Diff
Update RANLUX.
parent
3f06ec73
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/cl/pyopencl-ranluxcl.cl
+157
-93
157 additions, 93 deletions
src/cl/pyopencl-ranluxcl.cl
with
157 additions
and
93 deletions
src/cl/pyopencl-ranluxcl.cl
+
157
−
93
View file @
7c83a521
...
...
@@ -10,8 +10,8 @@ which should perfectly replicate the numbers generated by the original Fortran
*****
QUICK
USAGE
DESCRIPTION
**************************************************
1.
Create
an
OpenCL
buffer
with
room
for
at
least
28
32-bit
variables
(
112
by
te
)
.
I.e.,
in
C/C++:
size_t
buffSize
=
numWorkitems
*
112
;
1.
Create
an
OpenCL
buffer
with
room
for
at
least
28
32-bit
variables
(
112
byte
)
per
work-i
te
m
.
I.e.,
in
C/C++:
size_t
buffSize
=
numWorkitems
*
112
;
2.
Pass
the
buffer
and
an
unsigned
integer
seed
<ins>
to
a
kernel
that
launches
the
ranluxcl_initialization
function.
The
seed
<ins>
can
be
any
unsigned
32-bit
...
...
@@ -33,8 +33,7 @@ points not included, then uploads the seeds again:
#
include
"ranluxcl.cl"
kernel
void
Kernel_Example
(
global
ranluxcl_state_t
*ranluxcltab
)
{
//ranluxclstate
is
a
struct
of
7
float4
variables
//storing
the
state
of
the
generator.
//ranluxclstate
stores
the
state
of
the
generator.
ranluxcl_state_t
ranluxclstate
;
//Download
state
into
ranluxclstate
struct.
...
...
@@ -354,7 +353,7 @@ void ranluxcl_download_seed(ranluxcl_state_t *rst,
void
ranluxcl_upload_seed
(
ranluxcl_state_t
*rst,
global
ranluxcl_state_t
*ranluxcltab
)
{
ranluxcltab[RANLUXCL_MYID]
=
(
*rst
)
;
ranluxcltab[RANLUXCL_MYID]
=
(
*rst
)
;
}
/*
...
...
@@ -581,90 +580,24 @@ void ranluxcl_synchronize(ranluxcl_state_t *rst)
ranluxcl32
(
rst
)
;
}
void
ranluxcl_initialization
(
uint
ins,
global
ranluxcl_state_t
*ranluxcltab
)
/*
*
Uses
a
64-bit
xorshift
PRNG
by
George
Marsaglia
to
initialize
the
generator.
*
*
This
function
can
be
used
instead
of
ranluxcl_initialization
if
manual
*
control
of
the
seed
of
each
generator
is
desired.
x
must
be
unique
for
each
*
time
this
function
is
called,
and
*ranluxcltab
should
point
to
the
specific
*
entry
in
the
table
to
be
initialized.
Compare
this
to
ranluxcl_initialization
*
where
ins
needs
only
be
unique
for
each
NDRange,
and
*ranluxcltab
points
*
to
the
base
address
of
the
table
for
the
entire
NDRange.
Also
note
that
*
depending
on
what
you
are
doing
the
ranluxcl_upload_seed
and
*
ranluxcl_download_seed
functions
may
not
do
what
you
want,
so
make
sure
*
you
know
what
you
are
doing!
*/
void
ranluxcl_init
(
ulong
x,
global
ranluxcl_state_t
*ranluxcltab
)
{
ranluxcl_state_t
rst
;
#
ifdef
RANLUXCL_USE_LEGACY_INITIALIZATION
//Using
legacy
initialization
from
original
Fortan
77
implementation
//ins
is
scaled
so
that
if
the
user
makes
another
call
somewhere
else
//with
ins
+
1
there
should
be
no
overlap.
Also
adding
one
//allows
us
to
use
ins
=
0.
int
k,
maxWorkitems
;
#
ifdef
RANLUXCL_MAXWORKITEMS
maxWorkitems
=
RANLUXCL_MAXWORKITEMS
;
#
else
maxWorkitems
=
RANLUXCL_NUMWORKITEMS
;
#
endif
//RANLUXCL_MAXWORKITEMS
int
scaledins
=
ins
*
maxWorkitems
+
1
;
int
js
=
scaledins
+
RANLUXCL_MYID
;
//Make
sure
js
is
not
too
small
(
should
really
be
an
error
)
if
(
js
<
1
)
js
=
1
;
#
define
IC
2147483563
#
define
ITWO24
16777216
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s01=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s02=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s03=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s04=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s05=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s06=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s07=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s08=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s09=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s10=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s11=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s12=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s13=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s14=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s15=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s16=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s17=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s18=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s19=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s20=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s21=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s22=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s23=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s24=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
#
undef
IC
#
undef
ITWO24
#
else
//RANLUXCL_USE_LEGACY_INITIALIZATION
//Using
default
initialization
#
define
RANLUXCL_POW2_24
16777216
#
define
RANLUXCL_56
0x00FFFFFFFFFFFFFF
#
define
RANLUXCL_48
0x0000FFFFFFFFFFFF
...
...
@@ -674,13 +607,7 @@ void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
#
define
RANLUXCL_16
0x000000000000FFFF
#
define
RANLUXCL_8
0x00000000000000FF
//We
scale
ins
by
(
2^32
)
-1.
As
long
as
we
never
use
more
than
(
2^32
)
-1
//work-items
per
NDRange
the
initial
states
should
never
be
the
same.
//We
use
a
simple
64-bit
xorshift
RNG
by
George
Marsaglia,
which
has
a
//period
of
2^64
-
1.
ulong
x1,
x2,
x3
;
ulong
x
=
(
ulong
)
RANLUXCL_MYID
+
(
ulong
)
ins
*
((
ulong
)
UINT_MAX
+
1
)
;
//Logical
shifts
used
so
that
all
64
bits
of
output
are
used
(
24
bits
//per
float
)
,
to
be
certain
that
all
initial
states
are
different.
...
...
@@ -753,7 +680,128 @@ void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
#
undef
RANLUXCL_16
#
undef
RANLUXCL_8
#
endif
//RANLUXCL_USE_LEGACY_INITIALIZATION
rst.in24
=
0
;
rst.stepnr
=
0
;
rst.carry
=
0.0f
;
if
(
rst.s24
==
0.0f
)
rst.carry
=
RANLUXCL_TWOM24
;
#
ifndef
RANLUXCL_NO_WARMUP
//Warming
up
the
generator,
ensuring
there
are
no
initial
correlations.
//16
is
a
"magic number"
.
It
is
the
number
of
times
we
must
generate
//a
batch
of
24
numbers
to
ensure
complete
decorrelation,
however
it
//seems
like
it
is
necessary
to
double
this
for
the
special
case
when
//the
generator
is
initialized
to
all
zeros.
for
(
int
i=0
; i<16 * 2; i++){
ranluxcl_os
(
rst.s09,
rst.s10,
&
(
rst.s24
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s08,
rst.s09,
&
(
rst.s23
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s07,
rst.s08,
&
(
rst.s22
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s06,
rst.s07,
&
(
rst.s21
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s05,
rst.s06,
&
(
rst.s20
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s04,
rst.s05,
&
(
rst.s19
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s03,
rst.s04,
&
(
rst.s18
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s02,
rst.s03,
&
(
rst.s17
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s01,
rst.s02,
&
(
rst.s16
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s24,
rst.s01,
&
(
rst.s15
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s23,
rst.s24,
&
(
rst.s14
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s22,
rst.s23,
&
(
rst.s13
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s21,
rst.s22,
&
(
rst.s12
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s20,
rst.s21,
&
(
rst.s11
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s19,
rst.s20,
&
(
rst.s10
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s18,
rst.s19,
&
(
rst.s09
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s17,
rst.s18,
&
(
rst.s08
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s16,
rst.s17,
&
(
rst.s07
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s15,
rst.s16,
&
(
rst.s06
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s14,
rst.s15,
&
(
rst.s05
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s13,
rst.s14,
&
(
rst.s04
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s12,
rst.s13,
&
(
rst.s03
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s11,
rst.s12,
&
(
rst.s02
)
,
&
(
rst.carry
))
;
ranluxcl_os
(
rst.s10,
rst.s11,
&
(
rst.s01
)
,
&
(
rst.carry
))
;
}
#
endif
//RANLUXCL_NO_WARMUP
//Upload
the
state
*ranluxcltab
=
rst
;
}
void
ranluxcl_init_legacy
(
uint
ins,
global
ranluxcl_state_t
*ranluxcltab
)
{
//Using
legacy
initialization
from
original
Fortan
77
implementation
//ins
is
scaled
so
that
if
the
user
makes
another
call
somewhere
else
//with
ins
+
1
there
should
be
no
overlap.
Also
adding
one
//allows
us
to
use
ins
=
0.
int
k,
maxWorkitems
;
ranluxcl_state_t
rst
;
#
ifdef
RANLUXCL_MAXWORKITEMS
maxWorkitems
=
RANLUXCL_MAXWORKITEMS
;
#
else
maxWorkitems
=
RANLUXCL_NUMWORKITEMS
;
#
endif
//RANLUXCL_MAXWORKITEMS
int
scaledins
=
ins
*
maxWorkitems
+
1
;
int
js
=
scaledins
+
RANLUXCL_MYID
;
//Make
sure
js
is
not
too
small
(
should
really
be
an
error
)
if
(
js
<
1
)
js
=
1
;
#
define
IC
2147483563
#
define
ITWO24
16777216
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s01=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s02=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s03=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s04=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s05=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s06=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s07=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s08=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s09=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s10=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s11=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s12=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s13=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s14=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s15=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s16=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s17=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s18=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s19=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s20=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s21=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s22=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s23=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
k
=
js/53668
; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
rst.s24=
(
js%ITWO24
)
*RANLUXCL_TWOM24
;
#
undef
IC
#
undef
ITWO24
rst.in24
=
0
;
rst.stepnr
=
0
;
...
...
@@ -797,6 +845,22 @@ void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
ranluxcl_upload_seed
(
&rst,
ranluxcltab
)
;
}
void
ranluxcl_initialization
(
uint
ins,
global
ranluxcl_state_t
*ranluxcltab
)
{
#
ifdef
RANLUXCL_USE_LEGACY_INITIALIZATION
ranluxcl_init_legacy
(
ins,
ranluxcltab
)
;
#
else
//
Not
RANLUXCL_USE_LEGACY_INITIALIZATION
//
We
scale
ins
by
2^32.
As
long
as
we
never
use
more
than
(
2^32
)
-1
//
work-items
per
NDRange
the
initial
states
should
never
be
the
same.
ulong
x
=
(
ulong
)
RANLUXCL_MYID
+
(
ulong
)
ins
*
((
ulong
)
UINT_MAX
+
1
)
;
ranluxcl_init
(
x,
ranluxcltab
+
RANLUXCL_MYID
)
;
#
endif
//
RANLUXCL_USE_LEGACY_INITIALIZATION
}
float4
ranluxcl32norm
(
ranluxcl_state_t
*rst
)
{
//Returns
a
vector
where
each
component
is
a
normally
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment