pyopencl / Commits / 017fe9d3

Commit 017fe9d3, authored 14 years ago by Andreas Klöckner
Fix slowness of matrix-multiply example.
Parent: 9740b3c2
Showing 1 changed file: examples/matrix-multiply.py, 52 additions and 56 deletions.
@@ -2,7 +2,7 @@
 from __future__ import division

-kernel_code = """
+KERNEL_CODE = """

 // Thread block size
 #define BLOCK_SIZE %(block_size)d
@@ -36,8 +36,8 @@ kernel_code = """
  * Device code.
  */

-#define AS(i, j) As[i + j * BLOCK_SIZE]
-#define BS(i, j) Bs[i + j * BLOCK_SIZE]
+#define AS(j, i) As[i + j * BLOCK_SIZE]
+#define BS(j, i) Bs[i + j * BLOCK_SIZE]

 ////////////////////////////////////////////////////////////////////////////////
 //! Matrix multiplication on the device: C = A * B
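Note on the AS/BS hunk above: swapping the macro parameters from (i, j) to (j, i) leaves the expansion As[i + j * BLOCK_SIZE] intact but transposes every call site, so AS(a, b) now expands to As[b + a * BLOCK_SIZE]. Assuming the kernel indexes the tiles as AS(ty, k)/BS(k, tx), in the NVIDIA-SDK style this example is ported from, the second argument becomes the unit-stride one. A small illustration (not part of the commit):

BLOCK_SIZE = 16

def as_old(i, j):   # old: #define AS(i, j) As[i + j * BLOCK_SIZE]
    return i + j * BLOCK_SIZE

def as_new(j, i):   # new: #define AS(j, i) As[i + j * BLOCK_SIZE]
    return i + j * BLOCK_SIZE

# Varying the second argument, as in AS(0, k):
print [as_old(0, k) for k in range(4)]   # [0, 16, 32, 48] -- stride 16
print [as_new(0, k) for k in range(4)]   # [0, 1, 2, 3]    -- unit stride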
@@ -45,9 +45,11 @@ kernel_code = """
 ////////////////////////////////////////////////////////////////////////////////
 __kernel __attribute__((reqd_work_group_size(16,16,1)))
 void
-matrixMul( __global float* C, __global float* A, __global float* B,
-           __local float* As, __local float* Bs)
+matrixMul( __global float* C, __global float* A, __global float* B)
 {
+    __local float As[BLOCK_SIZE*BLOCK_SIZE];
+    __local float Bs[BLOCK_SIZE*BLOCK_SIZE];
+
     // Block index
     int bx = get_group_id(0);
     int by = get_group_id(1);
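The signature change above is the core of the fix: the tiles As and Bs become fixed-size __local arrays inside the kernel, so the host no longer allocates and passes them on every launch. A minimal sketch of what this means at the call site, written against the same old-style PyOpenCL calling convention this example uses (kernel names and sizes here are made up for illustration; current PyOpenCL passes local_size positionally instead):

import numpy
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

prg = cl.Program(ctx, """
    // Variant A (old style): scratch arrives as a __local pointer argument.
    __kernel void scratch_arg(__global float *out, __local float *tmp)
    {
        tmp[get_local_id(0)] = 1.0f;
        barrier(CLK_LOCAL_MEM_FENCE);
        out[get_global_id(0)] = tmp[get_local_id(0)];
    }
    // Variant B (new style): scratch is a fixed-size array in the kernel body.
    __kernel void scratch_static(__global float *out)
    {
        __local float tmp[16];
        tmp[get_local_id(0)] = 1.0f;
        barrier(CLK_LOCAL_MEM_FENCE);
        out[get_global_id(0)] = tmp[get_local_id(0)];
    }
    """).build()

out = numpy.empty(16, numpy.float32)
d_out = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, size=out.nbytes)

# Variant A: the host must size the __local scratch in bytes per launch ...
prg.scratch_arg(queue, (16,), d_out, cl.LocalMemory(4*16),
        local_size=(16,))
# ... Variant B needs no extra argument, as in the new matrixMul.
prg.scratch_static(queue, (16,), d_out, local_size=(16,))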
@@ -73,7 +75,7 @@ matrixMul( __global float* C, __global float* A, __global float* B,
     // Csub is used to store the element of the block sub-matrix
     // that is computed by the thread
-    float Csub = 0;
+    float Csub = 0.0f;

     // Loop over all the sub-matrices of A and B
     // required to compute the block sub-matrix
@@ -119,14 +121,13 @@ block_size = 16
 ctx = cl.create_some_context()
 for dev in ctx.devices:
     assert dev.local_mem_size > 0

 queue = cl.CommandQueue(ctx,
         properties=cl.command_queue_properties.PROFILING_ENABLE)
 #queue = cl.CommandQueue(ctx)

 if False:
     a_height = 4096
     #a_height = 1024
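One thing the unchanged context above does quietly: the queue is created with PROFILING_ENABLE, which is what makes the event.profile timestamps read in the timing section below valid. A short reminder of the pattern (a sketch, not part of the commit):

import pyopencl as cl

ctx = cl.create_some_context()
# Without PROFILING_ENABLE, reading event.profile.* raises an error; with
# it, each event records device-side timestamps as integer nanoseconds.
queue = cl.CommandQueue(ctx,
        properties=cl.command_queue_properties.PROFILING_ENABLE)
# Typical use, as in the benchmark below:
#   event = kernel(queue, ...); event.wait()
#   seconds = (event.profile.end - event.profile.start) * 1e-9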
@@ -144,39 +145,33 @@ elif False:
 else:
     # CL SDK
-    a_height = 50 * block_size
-    a_width = 100 * block_size
-    b_height = a_width
-    b_width = 50 * block_size
-
-h_a = numpy.random.rand(a_height, a_width).astype(numpy.float32)
-h_b = numpy.random.rand(a_width, a_height).astype(numpy.float32)
-h_c = numpy.empty((a_height, a_height)).astype(numpy.float32)
-print h_a.shape, h_b.shape
-
-mf = cl.mem_flags
+    a_width = 50 * block_size
+    a_height = 100 * block_size
+    b_width = 50 * block_size
+    b_height = a_width
+    c_width = b_width
+    c_height = a_height
+
+h_a = numpy.random.rand(a_height, a_width).astype(numpy.float32)
+h_b = numpy.random.rand(b_height, b_width).astype(numpy.float32)
+h_c = numpy.empty((c_height, c_width)).astype(numpy.float32)

 kernel_params = {"block_size": block_size,
-        "w_a": a_width, "h_a": a_height, "w_b": a_height}
+        "w_a": a_width, "h_a": a_height, "w_b": b_width}

-prg = cl.Program(ctx, kernel_code % kernel_params,
+prg = cl.Program(ctx, KERNEL_CODE % kernel_params,
         ).build(options="-cl-mad-enable -cl-fast-relaxed-math")

 kernel = prg.matrixMul
 #print prg.binaries[0]

-#def __call__(self, queue, tgt, src, shape):
-#    w, h = shape
 assert a_width % block_size == 0
 assert a_height % block_size == 0
 assert b_width % block_size == 0
-# kernel(queue, (w, h), tgt, src, numpy.uint32(w), numpy.uint32(h))
-# __call__(queue, a_t_buf, a_buf, source.shape)
-# args: queue, domain, *args
+
+# transfer host -> device -----------------------------------------------------
+mf = cl.mem_flags

 t1 = time()
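The size block above straightens out the dimension bookkeeping: for C = A*B, A's width is the shared reduction dimension (it must equal B's height), and C is (a_height x b_width). It also fixes the template parameter "w_b", which was previously set to a_height and only happened to be numerically correct because the old sizes made a_height equal to b_width. A quick numpy check of the new sizes (illustrative only, not part of the commit):

import numpy

block_size = 16
a_width = 50 * block_size     # = 800, the shared (reduction) dimension
a_height = 100 * block_size   # = 1600
b_width = 50 * block_size     # = 800
b_height = a_width
c_width, c_height = b_width, a_height

a = numpy.zeros((a_height, a_width), numpy.float32)
b = numpy.zeros((b_height, b_width), numpy.float32)
assert numpy.dot(a, b).shape == (c_height, c_width)   # (1600, 800)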
@@ -186,56 +181,57 @@ d_c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=h_c.nbytes)
 push_time = time()-t1

-# warmup
-event = kernel(queue, (a_height, a_height), d_c_buf, d_a_buf, d_b_buf,
-        cl.LocalMemory(4 * block_size**2),
-        cl.LocalMemory(4 * block_size**2),
-        local_size=(block_size, block_size))
-event.wait()
+# warmup ----------------------------------------------------------------------
+for i in range(5):
+    event = kernel(queue, h_c.shape, d_c_buf, d_a_buf, d_b_buf,
+            local_size=(block_size, block_size))
+
+event.wait()
+queue.finish()

+# actual benchmark ------------------------------------------------------------
 t1 = time()
 count = 20
 for i in range(count):
-    event = kernel(queue, (a_height, a_height), d_c_buf, d_a_buf, d_b_buf,
-            cl.LocalMemory(4 * block_size**2),
-            cl.LocalMemory(4 * block_size**2),
+    event = kernel(queue, h_c.shape, d_c_buf, d_a_buf, d_b_buf,
             local_size=(block_size, block_size))

 event.wait()

-gpu_time = (time()-t1)/count
+gpu_time = time()-t1

+# transfer device -> host -----------------------------------------------------
 t1 = time()
 cl.enqueue_read_buffer(queue, d_c_buf, h_c).wait()
 pull_time = time()-t1

-ans1 = h_c
-
+# timing output ---------------------------------------------------------------
 gpu_total_time = gpu_time+push_time+pull_time

-print "GPU (s) total: ", gpu_total_time
-print "PUSH", push_time
-print "PULL", pull_time
-print "COMPUTE", gpu_time/count
-print "COMPUTE2", (event.profile.end-event.profile.start)*1e-9
+print "GPU push+compute+pull total [s]:", gpu_total_time
+print "GPU push [s]:", push_time
+print "GPU pull [s]:", pull_time
+print "GPU compute (host-timed) [s]:", gpu_time
+print "GPU compute (event-timed) [s]: ", (event.profile.end-event.profile.start)*1e-9

 gflop = h_c.size * (a_width * 2.) / (1000**3.)
-gflops = gflop / (gpu_time/count)
+gflops = gflop / gpu_time

-print "gflops:", gflops
-
-do_cpu = False
-if do_cpu:
-    t1 = time()
-    ans2 = numpy.dot(h_a, h_b)
-    cpu_time = time()-t1
-    print "CPU (s)", cpu_time
-    print "GPU speedup:", cpu_time/gpu_total_time
-    print "GPU==CPU:", numpy.allclose(ans1, ans2)
+print
+print "GFlops/s:", gflops
+
+# cpu comparison --------------------------------------------------------------
+t1 = time()
+h_c_cpu = numpy.dot(h_a, h_b)
+cpu_time = time()-t1
+
+print
+print "GPU==CPU:", numpy.allclose(h_c, h_c_cpu)
+print
+print "CPU time (s)", cpu_time
+print
+print "GPU speedup (with transfer):", cpu_time/gpu_total_time
+print "GPU speedup (without transfer):", cpu_time/gpu_time
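For reference, the FLOP accounting in the hunk above, worked through with the sizes this example sets; the factor of 2 counts one multiply and one add per accumulated term (a one-off illustration, not part of the commit):

h_c_size = 1600 * 800                 # entries of C (c_height * c_width)
a_width = 800                         # terms accumulated per entry
gflop = h_c_size * (a_width * 2.) / (1000**3.)
print gflop                           # 2.048 GFlop per matrix multiply

# The event-timed figure relies on the PROFILING_ENABLE queue set up
# earlier: the profile counters are integer nanoseconds, hence the
# * 1e-9 to get seconds in "GPU compute (event-timed) [s]".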