Skip to content
Commits on Source (3)
......@@ -1059,7 +1059,6 @@ earlier:
acc_k = 0.0f;
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
a_fetch[lid(0)] = a[16 * gid(0) + lid(0)];
barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn_k_update depends on a_fetch_rule) */;
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
{
for (int k = 0; k <= 15; ++k)
......
......@@ -335,11 +335,15 @@ def get_return_from_kernel_mapping(kernel):
# {{{ check for write races in accesses
def _check_for_access_races(map_a, insn_a, map_b, insn_b, knl, callables_table):
def _check_for_access_races(map_a, insn_a, map_b, insn_b, knl, callables_table,
address_space):
"""
Returns *True* if the execution instances of *insn_a* and *insn_b*, accessing
the same variable via access maps *map_a* and *map_b*, result in an access race.
:arg address_space: An instance of :class:`loopy.kernel.data.AddressSpace`
of the variable whose accesses are being checked for a race.
.. note::
The accesses ``map_a``, ``map_b`` lead to write races iff there exists 2
......@@ -348,9 +352,12 @@ def _check_for_access_races(map_a, insn_a, map_b, insn_b, knl, callables_table):
import pymbolic.primitives as p
from loopy.symbolic import isl_set_from_expr, aff_from_expr, aff_to_expr
from loopy.kernel.data import (filter_iname_tags_by_type,
HardwareConcurrentTag)
HardwareConcurrentTag,
AddressSpace)
from loopy.kernel.tools import get_hw_axis_base_for_codegen
assert address_space in [AddressSpace.LOCAL, AddressSpace.GLOBAL]
gsize, lsize = knl.get_grid_size_upper_bounds(callables_table,
return_dict=True)
......@@ -470,25 +477,40 @@ def _check_for_access_races(map_a, insn_a, map_b, insn_b, knl, callables_table):
# {{{ Step 5: create the set any(l.i.A != l.i.B) OR any(g.i.A != g.i.B)
space = set_a.space
unequal_global_id_set = isl.Set.empty(set_a.get_space())
unequal_local_id_set = isl.Set.empty(set_a.get_space())
unequal_group_id_set = isl.Set.empty(set_a.get_space())
equal_group_id_set = isl.BasicSet.universe(set_a.get_space())
for i_l in lsize:
lid_a = p.Variable(f"l.{i_l}.A")
lid_b = p.Variable(f"l.{i_l}.B")
unequal_global_id_set |= (isl_set_from_expr(space,
p.Comparison(lid_a, "!=", lid_b))
)
unequal_local_id_set |= (isl_set_from_expr(space,
p.Comparison(lid_a, "!=", lid_b))
)
for i_g in gsize:
gid_a = p.Variable(f"g.{i_g}.A")
gid_b = p.Variable(f"g.{i_g}.B")
unequal_global_id_set |= (isl_set_from_expr(space,
p.Comparison(gid_a, "!=", gid_b))
)
unequal_group_id_set |= (isl_set_from_expr(space,
p.Comparison(gid_a, "!=", gid_b))
)
equal_group_id_set &= (isl_set_from_expr(space,
p.Comparison(gid_a, "==", gid_b))
)
# }}}
return not (set_a & set_b & unequal_global_id_set).is_empty()
if address_space == AddressSpace.GLOBAL:
return not (set_a
& set_b
& (unequal_local_id_set
| unequal_group_id_set)
).is_empty()
else:
return not (set_a
& set_b
& unequal_local_id_set
& equal_group_id_set).is_empty()
class AccessMapDescriptor(enum.Enum):
......@@ -582,7 +604,10 @@ class WriteRaceChecker:
return _check_for_access_races(insn1_amap, self.kernel.id_to_insn[insn1],
insn2_amap, self.kernel.id_to_insn[insn2],
self.kernel, self.callables_table)
self.kernel, self.callables_table,
(self.kernel
.get_var_descriptor(var_name)
.address_space))
# }}}
......
......@@ -3644,6 +3644,30 @@ def test_barrier_non_zero_hw_lbound():
assert barrier_between(knl, "w_a", "w_b")
def test_no_unnecessary_lbarrier(ctx_factory):
    """Regression test: linearization must not report a barrier between
    ``write_s_a`` and ``write_ao``.

    The kernel writes ``s_a[i_inner]`` and then reads back ``s_a[i_inner]``
    (same subscript) with ``i_inner`` tagged ``l.0`` and ``s_a`` placed in
    the local address space; the test asserts that ``barrier_between``
    finds nothing between the two instructions.
    """
    # NOTE(review): ctx_factory is not referenced anywhere in this body;
    # presumably it is kept for the test-fixture signature -- confirm.

    # This regression test would fail on loopy.git <= 268a7f4
    # (Issue reported by @thilinarmtb)

    # Two instructions: stage a[...] into the temporary s_a, then consume the
    # very same s_a entry to produce ao. "dep=write_s_a" orders them.
    t_unit = lp.make_kernel(
        "{[i_outer, i_inner]: 0 <= i_outer < n and 0 <= i_inner < 16}",
        """
<> s_a[i_inner] = ai[i_outer * 16 + i_inner] {id=write_s_a}
ao[i_outer * 16 + i_inner] = 2.0 * s_a[i_inner] {id=write_ao, dep=write_s_a}
""",
        assumptions="n>=0")

    # Only the input dtype is given explicitly.
    t_unit = lp.add_dtypes(t_unit, dict(ai=np.float32))

    # Map the inner iname to hardware axis "l.0" and the outer one to "g.0".
    t_unit = lp.tag_inames(t_unit, dict(i_inner="l.0", i_outer="g.0"))

    # Force the staging temporary into the "local" address space, which is
    # the case the race check under test has to handle.
    t_unit = lp.set_temporary_address_space(t_unit, "s_a", "local")
    t_unit = lp.prioritize_loops(t_unit, "i_outer,i_inner")

    # Preprocess and linearize so that barrier placement can be inspected.
    t_unit = lp.preprocess_kernel(t_unit)
    knl = lp.get_one_linearized_kernel(t_unit.default_entrypoint,
                                       t_unit.callables_table)

    # The write and the read use the same s_a subscript, so no barrier is
    # expected between them.
    assert not barrier_between(knl, "write_s_a", "write_ao")
if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])
......