Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
L
loopy
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Andreas Klöckner
loopy
Commits
2513ee98
Commit
2513ee98
authored
13 years ago
by
Andreas Klöckner
Browse files
Options
Downloads
Patches
Plain Diff
Automatically find idempotent instructions. Minor tweaks.
parent
6f140ea6
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
MEMO
+5
-1
5 additions, 1 deletion
MEMO
loopy/__init__.py
+3
-7
3 additions, 7 deletions
loopy/__init__.py
loopy/kernel.py
+12
-6
12 additions, 6 deletions
loopy/kernel.py
loopy/schedule.py
+62
-30
62 additions, 30 deletions
loopy/schedule.py
test/test_matmul.py
+72
-33
72 additions, 33 deletions
test/test_matmul.py
with
154 additions
and
77 deletions
MEMO
+
5
−
1
View file @
2513ee98
...
...
@@ -56,7 +56,8 @@ Things to consider
- Parallel dimension splitting/merging via tags
-> unnecessary?
- All user-supplied commands are assumed to be idempotent.
- Not using all hw loop dimensions causes an error, as
is the case for variant 3 in the rank_one test.
TODO
^^^^
...
...
@@ -83,6 +84,9 @@ TODO
- Better for loop bound generation
-> Try a triangular loop
- Nested slab decomposition (in conjunction with conditional hoisting) could
generate nested conditional code.
Dealt with
^^^^^^^^^^
...
...
This diff is collapsed.
Click to expand it.
loopy/__init__.py
+
3
−
7
View file @
2513ee98
...
...
@@ -38,14 +38,11 @@ from loopy.compiled import CompiledKernel, drive_timing_run
def
split_dimension
(
kernel
,
iname
,
inner_length
,
padded_length
=
None
,
outer_iname
=
None
,
inner_iname
=
None
,
outer_tag
=
None
,
inner_tag
=
None
,
outer_slab_increments
=
(
0
,
-
1
),
no_slabs
=
None
):
slabs
=
(
0
,
0
)
):
if
iname
not
in
kernel
.
all_inames
():
raise
ValueError
(
"
cannot split loop for unknown variable
'
%s
'"
%
iname
)
if
no_slabs
:
outer_slab_increments
=
(
0
,
0
)
if
padded_length
is
not
None
:
inner_tag
=
inner_tag
.
copy
(
forced_length
=
padded_length
)
...
...
@@ -115,7 +112,7 @@ def split_dimension(kernel, iname, inner_length, padded_length=None,
# }}}
iname_slab_increments
=
kernel
.
iname_slab_increments
.
copy
()
iname_slab_increments
[
outer_iname
]
=
outer_slab_increment
s
iname_slab_increments
[
outer_iname
]
=
slab
s
result
=
(
kernel
.
copy
(
domain
=
new_domain
,
iname_slab_increments
=
iname_slab_increments
,
...
...
@@ -321,8 +318,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
id
=
kernel
.
make_unique_instruction_id
(
based_on
=
cse_tag
),
assignee
=
assignee
,
expression
=
new_inner_expr
,
forced_iname_deps
=
forced_iname_deps
,
idempotent
=
True
)
forced_iname_deps
=
forced_iname_deps
)
cse_result_insns
.
append
(
new_insn
)
...
...
This diff is collapsed.
Click to expand it.
loopy/kernel.py
+
12
−
6
View file @
2513ee98
...
...
@@ -224,10 +224,8 @@ class Instruction(Record):
dependencies) without changing the meaning of the program.
"""
def
__init__
(
self
,
id
,
assignee
,
expression
,
idempotent
,
forced_iname_deps
=
[],
insn_deps
=
[]):
assert
isinstance
(
idempotent
,
bool
)
id
,
assignee
,
expression
,
forced_iname_deps
=
[],
insn_deps
=
[],
idempotent
=
None
):
Record
.
__init__
(
self
,
id
=
id
,
assignee
=
assignee
,
expression
=
expression
,
...
...
@@ -258,6 +256,15 @@ class Instruction(Record):
result
=
"
%s: %s <- %s
\n
[%s]
"
%
(
self
.
id
,
self
.
assignee
,
self
.
expression
,
"
,
"
.
join
(
sorted
(
self
.
all_inames
())))
if
self
.
idempotent
==
True
:
result
+=
"
(idempotent)
"
elif
self
.
idempotent
==
False
:
result
+=
"
(not idempotent)
"
elif
self
.
idempotent
is
None
:
result
+=
"
(idempotence unknown)
"
else
:
raise
RuntimeError
(
"
unexpected value for Instruction.idempotent
"
)
if
self
.
insn_deps
:
result
+=
"
\n
:
"
+
"
,
"
.
join
(
self
.
insn_deps
)
...
...
@@ -450,8 +457,7 @@ class LoopKernel(Record):
id
=
self
.
make_unique_instruction_id
(
insns
,
based_on
=
label
),
insn_deps
=
insn_deps
,
forced_iname_deps
=
forced_iname_deps
,
assignee
=
lhs
,
expression
=
rhs
,
idempotent
=
True
)
assignee
=
lhs
,
expression
=
rhs
)
if
isinstance
(
domain
,
str
):
ctx
=
isl
.
Context
()
...
...
This diff is collapsed.
Click to expand it.
loopy/schedule.py
+
62
−
30
View file @
2513ee98
...
...
@@ -58,8 +58,7 @@ def realize_reduction(kernel, inames=None, reduction_tag=None):
extra_used_ids
=
set
(
ni
.
id
for
ni
in
new_insns
)),
assignee
=
target_var
,
forced_iname_deps
=
list
(
insn
.
all_inames
()
-
set
(
expr
.
inames
)),
expression
=
expr
.
operation
.
neutral_element
,
idempotent
=
True
)
expression
=
expr
.
operation
.
neutral_element
)
new_insns
.
append
(
init_insn
)
...
...
@@ -69,8 +68,7 @@ def realize_reduction(kernel, inames=None, reduction_tag=None):
assignee
=
target_var
,
expression
=
expr
.
operation
(
target_var
,
sub_expr
),
insn_deps
=
[
init_insn
.
id
],
forced_iname_deps
=
list
(
insn
.
all_inames
()),
idempotent
=
False
)
forced_iname_deps
=
list
(
insn
.
all_inames
()))
new_insns
.
append
(
reduction_insn
)
...
...
@@ -210,10 +208,10 @@ def check_for_unused_hw_axes(kernel):
raise
RuntimeError
(
"
auto local tag encountered
"
)
if
group_axes
!=
group_axes_used
:
raise
RuntimeError
(
"
instruction
'
%s
'
does not use all
hw
group axes
"
raise
RuntimeError
(
"
instruction
'
%s
'
does not use all group
hw
axes
"
%
insn
.
id
)
if
local_axes
!=
local_axes_used
:
raise
RuntimeError
(
"
instruction
'
%s
'
does not use all
hw
local axes
"
raise
RuntimeError
(
"
instruction
'
%s
'
does not use all local
hw
axes
"
%
insn
.
id
)
...
...
@@ -305,53 +303,65 @@ def adjust_local_temp_var_storage(kernel):
# }}}
# {{{ automatic dependencies
# {{{ automatic dependencies
, find idempotent instructions
def
find_
write
rs
(
kernel
):
def
find_
accesso
rs
(
kernel
,
readers
):
"""
:return: a dict that maps variable names to ids of insns that
write to that variable.
"""
writer_insn_ids
=
{}
result
=
{}
admissible_
write_
vars
=
(
admissible_vars
=
(
set
(
arg
.
name
for
arg
in
kernel
.
args
)
|
set
(
kernel
.
temporary_variables
.
iterkeys
()))
for
insn
in
kernel
.
instructions
:
var_name
=
insn
.
get_assignee_var_name
()
if
var_name
not
in
admissible_write_vars
:
raise
RuntimeError
(
"
writing to
'
%s
'
is not allowed
"
%
var_name
)
if
readers
:
from
loopy.symbolic
import
DependencyMapper
var_names
=
DependencyMapper
()(
insn
.
expression
)
&
admissible_vars
else
:
var_name
=
insn
.
get_assignee_var_name
()
writer_insn_ids
.
setdefault
(
var_name
,
set
()).
add
(
insn
.
id
)
if
var_name
not
in
admissible_vars
:
raise
RuntimeError
(
"
writing to
'
%s
'
is not allowed
"
%
var_name
)
var_names
=
[
var_name
]
return
writer_insn_ids
for
var_name
in
var_names
:
result
.
setdefault
(
var_name
,
set
()).
add
(
insn
.
id
)
return
result
def
add_automatic_dependencies
(
kernel
):
writer_map
=
find_
write
rs
(
kernel
)
def
add_
idempotence_and_
automatic_dependencies
(
kernel
):
writer_map
=
find_
accesso
rs
(
kernel
,
readers
=
False
)
arg_names
=
set
(
arg
.
name
for
arg
in
kernel
.
args
)
var_names
=
arg_names
|
set
(
kernel
.
temporary_variables
.
iterkeys
())
from
loopy.symbolic
import
DependencyMapper
dep_map
=
DependencyMapper
(
composite_leaves
=
False
)
new_insns
=
[]
dm
=
DependencyMapper
(
composite_leaves
=
False
)
dep_map
=
{}
for
insn
in
kernel
.
instructions
:
read_vars
=
(
set
(
var
.
name
for
var
in
d
ep_map
(
insn
.
expression
))
dep_map
[
insn
.
id
]
=
(
set
(
var
.
name
for
var
in
d
m
(
insn
.
expression
))
&
var_names
)
new_insns
=
[]
for
insn
in
kernel
.
instructions
:
auto_deps
=
[]
for
var
in
read_vars
:
# {{{ add automatic dependencies
all_my_var_writers
=
set
()
for
var
in
dep_map
[
insn
.
id
]:
var_writers
=
writer_map
.
get
(
var
,
set
())
all_my_var_writers
|=
var_writers
if
not
var_writers
and
var
not
in
v
ar_names
:
if
not
var_writers
and
var
not
in
ar
g
_names
:
from
warnings
import
warn
warn
(
"'
%s
'
is read, but never written.
"
%
var
)
...
...
@@ -365,9 +375,26 @@ def add_automatic_dependencies(kernel):
if
len
(
var_writers
)
==
1
:
auto_deps
.
extend
(
var_writers
)
# }}}
# {{{ find dependency loops, flag idempotence
while
True
:
last_all_my_var_writers
=
all_my_var_writers
for
writer_insn_id
in
last_all_my_var_writers
:
for
var
in
dep_map
[
writer_insn_id
]:
all_my_var_writers
=
all_my_var_writers
|
writer_map
.
get
(
var
,
set
())
if
last_all_my_var_writers
==
all_my_var_writers
:
break
# }}}
new_insns
.
append
(
insn
.
copy
(
insn_deps
=
insn
.
insn_deps
+
auto_deps
))
insn_deps
=
insn
.
insn_deps
+
auto_deps
,
idempotent
=
insn
.
id
not
in
all_my_var_writers
))
return
kernel
.
copy
(
instructions
=
new_insns
)
...
...
@@ -514,7 +541,7 @@ def assign_automatic_axes(kernel, only_axis_0=True):
from
loopy
import
split_dimension
return
assign_automatic_axes
(
split_dimension
(
kernel
,
iname
,
inner_length
=
local_size
[
axis
],
outer_tag
=
UnrollTag
(),
inner_tag
=
new_tag
,
no_slabs
=
True
),
outer_tag
=
UnrollTag
(),
inner_tag
=
new_tag
),
only_axis_0
=
only_axis_0
)
new_iname_to_tag
=
kernel
.
iname_to_tag
.
copy
()
...
...
@@ -613,7 +640,7 @@ def generate_loop_schedules_internal(kernel, schedule=[]):
for
insn_id
in
unscheduled_insn_ids
:
insn
=
kernel
.
id_to_insn
[
insn_id
]
if
insn
.
idempotent
:
if
insn
.
idempotent
==
True
:
# If insn is idempotent, it may be placed inside a more deeply
# nested loop without harm.
...
...
@@ -621,7 +648,8 @@ def generate_loop_schedules_internal(kernel, schedule=[]):
insn
.
all_inames
()
-
parallel_inames
<=
active_inames
-
parallel_inames
)
else
:
elif
insn
.
idempotent
==
False
:
# If insn is not idempotent, we must insist that it is placed inside
# the exactly correct set of loops.
...
...
@@ -630,6 +658,10 @@ def generate_loop_schedules_internal(kernel, schedule=[]):
==
active_inames
-
parallel_inames
)
else
:
raise
RuntimeError
(
"
instruction
'
%s
'
has undetermined idempotence
"
%
insn
.
id
)
if
(
iname_deps_satisfied
and
set
(
insn
.
insn_deps
)
<=
scheduled_insn_ids
):
scheduled_insn_ids
.
add
(
insn
.
id
)
...
...
@@ -782,7 +814,7 @@ def insert_barriers(kernel, schedule, level=0):
# {{{ issue dependency-based barriers for this instruction
if
insn
.
i
d
in
owed_barriers
:
if
set
(
insn
.
i
nsn_deps
)
&
owed_barriers
:
issue_barrier
(
is_pre_barrier
=
False
)
# }}}
...
...
@@ -827,7 +859,7 @@ def generate_loop_schedules(kernel):
# }}}
kernel
=
assign_automatic_axes
(
kernel
)
kernel
=
add_automatic_dependencies
(
kernel
)
kernel
=
add_
idempotence_and_
automatic_dependencies
(
kernel
)
kernel
=
adjust_local_temp_var_storage
(
kernel
)
check_for_double_use_of_hw_axes
(
kernel
)
...
...
This diff is collapsed.
Click to expand it.
test/test_matmul.py
+
72
−
33
View file @
2513ee98
from
__future__
import
division
import
numpy
as
np
import
numpy.linalg
as
la
import
pyopencl
as
cl
...
...
@@ -214,16 +216,16 @@ def test_plain_matrix_mul_new_ui(ctx_factory):
name
=
"
matmul
"
,
assumptions
=
"
n >= 16
"
)
knl
=
lp
.
split_dimension
(
knl
,
"
i
"
,
16
,
outer_tag
=
"
g.0
"
,
inner_tag
=
"
l.1
"
,
no_slabs
=
True
)
outer_tag
=
"
g.0
"
,
inner_tag
=
"
l.1
"
)
knl
=
lp
.
split_dimension
(
knl
,
"
j
"
,
8
,
outer_tag
=
"
g.1
"
,
inner_tag
=
"
l.0
"
,
no_slabs
=
True
)
knl
=
lp
.
split_dimension
(
knl
,
"
k
"
,
32
,
no_slabs
=
True
)
outer_tag
=
"
g.1
"
,
inner_tag
=
"
l.0
"
)
knl
=
lp
.
split_dimension
(
knl
,
"
k
"
,
32
)
knl
=
lp
.
realize_cse
(
knl
,
"
lhsmat
"
,
dtype
,
[
"
k_inner
"
,
"
i_inner
"
])
knl
=
lp
.
realize_cse
(
knl
,
"
rhsmat
"
,
dtype
,
[
"
j_inner
"
,
"
k_inner
"
])
kernel_gen
=
lp
.
generate_loop_schedules
(
knl
)
kernel_gen
=
lp
.
check_kernels
(
kernel_gen
,
dict
(
n
=
n
),
kill_level_min
=
6
)
kernel_gen
=
lp
.
check_kernels
(
kernel_gen
,
dict
(
n
=
n
),
kill_level_min
=
5
)
a
=
make_well_conditioned_dev_matrix
(
queue
,
n
,
dtype
=
dtype
,
order
=
order
)
b
=
make_well_conditioned_dev_matrix
(
queue
,
n
,
dtype
=
dtype
,
order
=
order
)
...
...
@@ -251,8 +253,7 @@ def test_rank_one(ctx_factory):
queue
=
cl
.
CommandQueue
(
ctx
,
properties
=
cl
.
command_queue_properties
.
PROFILING_ENABLE
)
n
=
int
(
get_suitable_size
(
ctx
)
**
(
3
/
2
))
print
n
n
=
int
(
get_suitable_size
(
ctx
)
**
(
2.7
/
2
))
knl
=
lp
.
LoopKernel
(
ctx
.
devices
[
0
],
"
[n] -> {[i,j]: 0<=i,j<n}
"
,
...
...
@@ -267,33 +268,71 @@ def test_rank_one(ctx_factory):
],
name
=
"
rank_one
"
,
assumptions
=
"
n >= 16
"
)
#knl = lp.split_dimension(knl, "i", 16,
#outer_tag="g.0", inner_tag="l.1", no_slabs=True)
#knl = lp.split_dimension(knl, "j", 8,
#outer_tag="g.1", inner_tag="l.0", no_slabs=True)
#knl = lp.split_dimension(knl, "k", 32, no_slabs=True)
knl
=
lp
.
realize_cse
(
knl
,
"
a
"
,
dtype
)
#, ["i_inner"])
knl
=
lp
.
realize_cse
(
knl
,
"
b
"
,
dtype
)
#, ["j_inner"])
kernel_gen
=
lp
.
generate_loop_schedules
(
knl
)
kernel_gen
=
lp
.
check_kernels
(
kernel_gen
,
dict
(
n
=
n
),
kill_level_min
=
6
)
a
=
cl_random
.
rand
(
queue
,
n
,
dtype
=
dtype
)
b
=
cl_random
.
rand
(
queue
,
n
,
dtype
=
dtype
)
refsol
=
a
.
get
()[:,
np
.
newaxis
]
*
b
.
get
()
c
=
cl_array
.
empty
(
queue
,
refsol
.
shape
,
refsol
.
dtype
)
def
launcher
(
kernel
,
gsize
,
lsize
,
check
):
evt
=
kernel
(
queue
,
gsize
(
n
),
lsize
(
n
),
a
.
data
,
b
.
data
,
c
.
data
,
n
,
g_times_l
=
True
)
if
check
:
check_error
(
refsol
,
c
.
get
())
return
evt
lp
.
drive_timing_run
(
kernel_gen
,
queue
,
launcher
,
n
**
2
)
def
variant_1
(
knl
):
knl
=
lp
.
realize_cse
(
knl
,
"
a
"
,
dtype
)
knl
=
lp
.
realize_cse
(
knl
,
"
b
"
,
dtype
)
return
knl
def
variant_2
(
knl
):
knl
=
lp
.
split_dimension
(
knl
,
"
i
"
,
16
,
outer_tag
=
"
g.0
"
,
inner_tag
=
"
l.0
"
)
knl
=
lp
.
split_dimension
(
knl
,
"
j
"
,
16
,
outer_tag
=
"
g.1
"
,
inner_tag
=
"
l.1
"
)
knl
=
lp
.
realize_cse
(
knl
,
"
a
"
,
dtype
)
knl
=
lp
.
realize_cse
(
knl
,
"
b
"
,
dtype
)
return
knl
def
variant_3
(
knl
):
knl
=
lp
.
split_dimension
(
knl
,
"
i
"
,
16
,
outer_tag
=
"
g.0
"
,
inner_tag
=
"
l.0
"
)
knl
=
lp
.
split_dimension
(
knl
,
"
j
"
,
16
,
outer_tag
=
"
g.1
"
,
inner_tag
=
"
l.1
"
)
knl
=
lp
.
realize_cse
(
knl
,
"
a
"
,
dtype
,
[
"
i_inner
"
])
knl
=
lp
.
realize_cse
(
knl
,
"
b
"
,
dtype
,
[
"
j_inner
"
])
return
knl
def
variant_4
(
knl
):
knl
=
lp
.
split_dimension
(
knl
,
"
i
"
,
256
,
outer_tag
=
"
g.0
"
,
slabs
=
(
0
,
-
1
))
knl
=
lp
.
split_dimension
(
knl
,
"
j
"
,
256
,
outer_tag
=
"
g.1
"
,
slabs
=
(
0
,
-
1
))
knl
=
lp
.
realize_cse
(
knl
,
"
a
"
,
dtype
,
[
"
i_inner
"
])
knl
=
lp
.
realize_cse
(
knl
,
"
b
"
,
dtype
,
[
"
j_inner
"
])
knl
=
lp
.
split_dimension
(
knl
,
"
i_inner
"
,
16
,
inner_tag
=
"
l.0
"
)
knl
=
lp
.
split_dimension
(
knl
,
"
j_inner
"
,
16
,
inner_tag
=
"
l.1
"
)
knl
=
lp
.
split_dimension
(
knl
,
"
j_inner_0
"
,
16
,
outer_tag
=
"
l.1
"
,
inner_tag
=
"
l.0
"
)
knl
=
lp
.
split_dimension
(
knl
,
"
i_inner_0
"
,
16
,
outer_tag
=
"
l.1
"
,
inner_tag
=
"
l.0
"
)
return
knl
#for variant in [variant_1, variant_2, variant_3]:
for
variant
in
[
variant_4
]:
kernel_gen
=
lp
.
generate_loop_schedules
(
variant
(
knl
))
kernel_gen
=
lp
.
check_kernels
(
kernel_gen
,
dict
(
n
=
n
),
kill_level_min
=
5
)
a
=
cl_random
.
rand
(
queue
,
n
,
dtype
=
dtype
)
b
=
cl_random
.
rand
(
queue
,
n
,
dtype
=
dtype
)
refsol
=
a
.
get
()[:,
np
.
newaxis
]
*
b
.
get
()
c
=
cl_array
.
empty
(
queue
,
refsol
.
shape
,
refsol
.
dtype
)
def
launcher
(
kernel
,
gsize
,
lsize
,
check
):
evt
=
kernel
(
queue
,
gsize
(
n
),
lsize
(
n
),
a
.
data
,
b
.
data
,
c
.
data
,
n
,
g_times_l
=
True
)
if
check
:
check_error
(
refsol
,
c
.
get
())
return
evt
lp
.
drive_timing_run
(
kernel_gen
,
queue
,
launcher
,
n
**
2
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment