From 7241bd636afe82566aa0e80b7c7b2dbb9e49312a Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 16 Feb 2022 17:21:28 -0600
Subject: [PATCH] [bugfix]: precompute over insns after a gbarrier

---
 loopy/transform/precompute.py | 14 ++++++++++++++
 test/test_transform.py        | 26 ++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index 7c20d7a01..201abd470 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -29,6 +29,8 @@ from loopy.diagnostic import LoopyError
 from pymbolic.mapper.substitutor import make_subst_func
 from loopy.translation_unit import TranslationUnit
 from loopy.kernel.function_interface import CallableKernel, ScalarCallable
+from loopy.kernel.tools import (kernel_has_global_barriers,
+                                find_most_recent_global_barrier)
 import numpy as np
 
 from pymbolic import var
@@ -217,6 +219,18 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper):
 
         self.replaced_something = True
 
+        # {{{ add gbarriers that the replaced insn depends-on to compute insn's deps
+
+        if kernel_has_global_barriers(expn_state.kernel):
+            # Look the barrier up only once. The lookup may come back as None,
+            # in which case no dependency is recorded for the compute insn.
+            most_recent_gbarrier = find_most_recent_global_barrier(
+                expn_state.kernel, expn_state.instruction.id)
+            if most_recent_gbarrier is not None:
+                self.compute_insn_depends_on.add(most_recent_gbarrier)
+
+        # }}}
+
         return new_outer_expr
 
     def map_kernel(self, kernel):
diff --git a/test/test_transform.py b/test/test_transform.py
index e42eeb498..2043b127e 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -1366,6 +1366,32 @@ def test_rename_inames_existing_ok(ctx_factory):
     lp.auto_test_vs_ref(knl, ctx, ref_knl)
 
 
+def test_precompute_with_gbarrier(ctx_factory):
+    """Prefetch inserted after a gbarrier must depend on that gbarrier."""
+    # Regression test for https://github.com/inducer/loopy/issues/543
+    cl_ctx = ctx_factory()
+    domains = ["{[i0, j0]: 0<=i0<100 and 0<=j0<10}",
+               "{[i1, j1]: 0<=i1<100 and 0<=j1<10}"]
+    instructions = """
+        out0[i0] = sum(j0, A[i0] * x[j0])
+        ... gbarrier {id=gbarrier}
+        out1[i1] = sum(j1, A[i1] * x[j1])
+        """
+
+    ref_t_unit = lp.add_dtypes(
+        lp.make_kernel(domains, instructions, seq_dependencies=True),
+        {"A": np.float64, "x": np.float64})
+
+    # Prefetch "x" only for the instruction writing out1, i.e. the one that
+    # is listed after the global barrier in the kernel above.
+    prefetched_t_unit = lp.add_prefetch(
+        ref_t_unit, "x", sweep_inames=["j1"], within="writes:out1",
+        prefetch_insn_id="x_fetch")
+    fetch_insn = prefetched_t_unit.default_entrypoint.id_to_insn["x_fetch"]
+    assert "gbarrier" in fetch_insn.depends_on
+    lp.auto_test_vs_ref(ref_t_unit, cl_ctx, prefetched_t_unit)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
-- 
GitLab