Commit e9fee6d1 authored by Andreas Klöckner

Various minor tweaks

parent 59474f3c
@@ -16,13 +16,17 @@ Things to consider
- Every loop in loopy is opened at most once.
Too restrictive?
- Why do CSEs necessarily have to duplicate the inames?
- Why do precomputes necessarily have to duplicate the inames?
-> because that would be necessary for a sequential prefetch
- Cannot do slab decomposition on inames that share a tag with
other inames
-> Is that reasonable?
- Entering a loop means:
- setting up conditionals related to it (slabs/bounds)
- allowing loops nested inside to depend on loop state
- Not using all hw loop dimensions causes an error, as
is the case for variant 3 in the rank_one test.
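Aside (not part of the commit): a hedged sketch of the "conditionals related to slabs/bounds" item above, reusing only calls that appear elsewhere in this diff (lp.split_iname, lp.generate_code) and assuming a kernel knl over 0 <= i < n already exists:

    knl = lp.split_iname(knl, "i", 16)
    code, info = lp.generate_code(knl)
    # If n is not known to be a multiple of 16, entering the i_inner loop
    # also emits a bounds conditional along the lines of
    #     if (i_inner + 16*i_outer < n) { ... }
    # and loops nested inside may then depend on the current loop state.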
@@ -53,7 +57,6 @@ To-do
- rename IndexTag -> InameTag
- Data implementation tags
- retag semantics once strides have been computed
- turn base_indices into offset
- vectorization
- automatic copies
@@ -63,7 +66,6 @@ To-do
- automatic copies from an array with one set of tags
to the same array with another set.
- Make tests run on GPUs
Fixes:
@@ -150,6 +150,11 @@ class CodeGenerationState(object):
self.c_code_mapper = c_code_mapper
def copy(self, implemented_domain=None, c_code_mapper=None):
return CodeGenerationState(
implemented_domain=implemented_domain or self.implemented_domain,
c_code_mapper=c_code_mapper or self.c_code_mapper)
def intersect(self, other):
new_impl, new_other = isl.align_two(self.implemented_domain, other)
return CodeGenerationState(
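Note: the hunk is cut off after "return CodeGenerationState(". A hedged guess at the completion of intersect(), assuming the constructor takes the same keyword arguments as copy() above and that islpy is imported as isl:

    def intersect(self, other):
        # Align both objects to a common space, then record the combined
        # implemented domain in a fresh state object.
        new_impl, new_other = isl.align_two(self.implemented_domain, other)
        return CodeGenerationState(
            implemented_domain=new_impl.intersect(new_other),
            c_code_mapper=self.c_code_mapper)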
@@ -217,6 +222,8 @@ class POD(PODBase):
# }}}
# {{{ implemented data info
class ImplementedDataInfo(Record):
"""
.. attribute:: name
@@ -269,6 +276,8 @@ class ImplementedDataInfo(Record):
stride_for_name_and_axis=stride_for_name_and_axis,
allows_offset=allows_offset)
# }}}
# {{{ main code generation entrypoint
@@ -54,7 +54,7 @@ def get_bounds_checks(domain, check_inames, implemented_domain,
else:
result = result.compute_divs()
result = isl.align_spaces(result, implemented_domain)
result, implemented_domain = isl.align_two(result, implemented_domain)
result = result.gist(implemented_domain)
if overapproximate:
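Aside (not from the commit): a self-contained islpy illustration of the align_two-then-gist pattern used here; gist drops constraints that the implemented domain already guarantees, leaving only the checks that still need to be emitted:

    import islpy as isl

    bounds = isl.Set("[n] -> { [i] : 0 <= i and i < n }")
    implemented = isl.Set("[n] -> { [i] : 0 <= i and i < 16 }")

    bounds, implemented = isl.align_two(bounds, implemented)
    print(bounds.gist(implemented))
    # 0 <= i is implied by the implemented domain and disappears; the
    # result is roughly "[n] -> { [i] : i < n }" (printed form may vary).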
@@ -199,6 +199,9 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
from loopy.isl_helpers import static_value_of_pw_aff
lower_bound = static_value_of_pw_aff(bounds.lower_bound_pw_aff,
constants_only=False)
# These bounds are 'implemented' by the hardware. Make sure
# that the downstream conditional generators realize that.
slab = make_slab(domain.get_space(), iname,
lower_bound, lower_bound+hw_axis_size)
codegen_state = codegen_state.intersect(slab)
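Aside (hedged illustration, not committed code): for a hardware axis of size 16 with lower bound 0 driving a hypothetical iname i_outer, the slab recorded here amounts to the set

    import islpy as isl
    slab = isl.Set("{ [i_outer] : 0 <= i_outer and i_outer < 16 }")
    # Intersecting this into codegen_state.implemented_domain lets
    # get_bounds_checks() later gist away conditionals that the hardware
    # axis already guarantees.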
@@ -748,7 +748,7 @@ class CompiledKernel:
return kernel
@memoize_method
def cl_kernel_info(self, arg_to_dtype_set, code_op=None):
def cl_kernel_info(self, arg_to_dtype_set=frozenset(), code_op=None):
kernel = self.get_kernel(arg_to_dtype_set)
from loopy.codegen import generate_code
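With the new frozenset() default, kernel info can be requested without pre-specializing argument dtypes. A usage sketch mirroring the updated test at the bottom of this diff:

    cknl = lp.CompiledKernel(ctx, knl)
    cknl.cl_kernel_info()   # equivalent to cl_kernel_info(frozenset())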
@@ -196,7 +196,6 @@ def loop_nest_map(kernel):
# ILP tags are special because they are parallel tags
# and therefore 'in principle' nest around everything.
# But they're realized by the scheduler as a loop
# (and the scheduler is the only custom
# at the innermost level, so we'll cut them some
# slack here.
continue
@@ -1127,8 +1127,11 @@ def test_vector_ilp_with_prefetch(ctx_factory):
knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])
code, info = lp.generate_code(knl)
cknl = lp.CompiledKernel(ctx, knl)
cknl.cl_kernel_info()
import re
code = cknl.get_code()
assert len(list(re.finditer("barrier", code))) == 1