diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst
index 4a07b63330747aa69d7ed498e004d60b7c312a7b..d293e3ebe998a632bd547f94a67e675ff0592bfb 100644
--- a/doc/ref_transform.rst
+++ b/doc/ref_transform.rst
@@ -72,6 +72,8 @@ Manipulating Instructions
 
 .. autofunction:: tag_instructions
 
+.. autofunction:: add_nosync
+
 Registering Library Routines
 ----------------------------
 
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 942c7d56e01f9d037b0e2b601f88bc8b96dda151..5eaa12b8124f86cfaf08cf2e83c3382861d9e0f2 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1479,6 +1479,8 @@ Barriers
 :mod:`loopy` may infer the need for a barrier when it is not necessary. The
 ``no_sync_with`` instruction attribute can be used to resolve this.
 
+See also :func:`loopy.add_nosync`.
+
 TODO
 
 .. }}}
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 6cbb3362ef91b27c3b7b1cf6a591f7f9a20c2f7a..aa1d43172a4bd6472f5974c292c4256946fcf542 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -54,7 +54,11 @@ from loopy.kernel.tools import (
         get_dot_dependency_graph,
         show_dependency_graph,
         add_dtypes,
-        add_and_infer_dtypes)
+        add_and_infer_dtypes,
+        get_global_barrier_order,
+        find_most_recent_global_barrier,
+        get_subkernels,
+        get_subkernel_to_insn_id_map)
 from loopy.kernel.creation import make_kernel, UniqueName
 from loopy.library.reduction import register_reduction_parser
 
@@ -75,7 +79,8 @@ from loopy.transform.instruction import (
         set_instruction_priority, add_dependency,
         remove_instructions,
         replace_instruction_ids,
-        tag_instructions)
+        tag_instructions,
+        add_nosync)
 
 from loopy.transform.data import (
         add_prefetch, change_arg_to_image,
@@ -189,6 +194,7 @@ __all__ = [
         "remove_instructions",
         "replace_instruction_ids",
         "tag_instructions",
+        "add_nosync",
 
         "extract_subst", "expand_subst", "assignment_to_subst",
         "find_rules_matching", "find_one_rule_matching",
@@ -215,6 +221,10 @@ __all__ = [
         "show_dependency_graph",
         "add_dtypes",
         "add_and_infer_dtypes",
+        "get_global_barrier_order",
+        "find_most_recent_global_barrier",
+        "get_subkernels",
+        "get_subkernel_to_insn_id_map",
 
         "infer_unknown_types",
 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 793d31791a3295ef1d7c03132f43489ab828f089..324f7da1a21de0115ea060ff7ef55e52ab0913d4 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -786,6 +786,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             for var_name in insn.read_dependency_names() & admissible_vars:
                 result.setdefault(var_name, set()).add(insn.id)
 
+        return result
+
     @memoize_method
     def writer_map(self):
         """
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 2033425236836ecf000d6c341c46dcb8b087a29a..8bdc72d54a91c6e8b4f9ec0ca3053831627d3eae 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -34,6 +34,8 @@ import numpy as np
 import islpy as isl
 from islpy import dim_type
 from loopy.diagnostic import LoopyError, warn_with_kernel
+from pytools import memoize_on_first_arg
+
 
 import logging
 logger = logging.getLogger(__name__)
@@ -316,10 +318,16 @@ class SetOperationCacheManager:
         return result
 
     def dim_min(self, set, *args):
+        if set.plain_is_empty():
+            raise LoopyError("domain '%s' is empty" % set)
+
         from loopy.isl_helpers import dim_min_with_elimination
         return self.op(set, "dim_min", dim_min_with_elimination, args)
 
     def dim_max(self, set, *args):
+        if set.plain_is_empty():
+            raise LoopyError("domain '%s' is empty" % set)
+
         from loopy.isl_helpers import dim_max_with_elimination
         return self.op(set, "dim_max", dim_max_with_elimination, args)
 
@@ -1367,4 +1375,195 @@ def draw_dependencies_as_unicode_arrows(
 
 # }}}
 
+
+# {{{ global barrier order finding
+
+@memoize_on_first_arg
+def get_global_barrier_order(kernel):
+    """Return a :class:`tuple` listing the ids of the global barrier
+    instructions of *kernel* in the order imposed by their dependencies.
+
+    See also :class:`loopy.kernel.instruction.BarrierInstruction`.
+
+    :arg kernel: The kernel whose instructions are examined.
+    :return: A :class:`tuple` of instruction ids, earliest barrier first.
+    :raises LoopyError: if the dependencies contain a cycle, or if two
+        global barriers are not ordered by a chain of dependencies.
+    """
+    barriers = []
+    visiting = set()
+    visited = set()
+
+    unvisited = set(insn.id for insn in kernel.instructions)
+
+    def is_barrier(my_insn_id):
+        insn = kernel.id_to_insn[my_insn_id]
+        from loopy.kernel.instruction import BarrierInstruction
+        return isinstance(insn, BarrierInstruction) and insn.kind == "global"
+
+    # Iterative depth-first search along depends_on. An id stays in
+    # 'visiting' from its first visit until everything it depends on has
+    # been handled, so barriers get appended in dependency order.
+    while unvisited:
+        stack = [unvisited.pop()]
+
+        while stack:
+            top = stack[-1]
+
+            if top in visiting:
+                # Second encounter: all dependencies of 'top' are done.
+                visiting.remove(top)
+                if is_barrier(top):
+                    barriers.append(top)
+
+            if top in visited:
+                stack.pop()
+                continue
+
+            visited.add(top)
+            visiting.add(top)
+
+            for child in kernel.id_to_insn[top].depends_on:
+                # A dependency that is still being visited closes a cycle.
+                # Raise instead of assert: asserts vanish under 'python -O'.
+                if child in visiting:
+                    raise LoopyError("dependency cycle found involving "
+                                     "instruction '%s'" % child)
+                stack.append(child)
+
+    # Ensure this is the only possible order: for each adjacent pair (a, b),
+    # a must be reachable from b by a chain of dependencies.
+    visited.clear()
+
+    for prev_barrier, barrier in zip(barriers, barriers[1:]):
+        # Check if prev_barrier is reachable from barrier.
+        stack = [barrier]
+        visited.discard(prev_barrier)
+
+        while stack:
+            top = stack[-1]
+            if top in visited:
+                stack.pop()
+                continue
+
+            visited.add(top)
+            if top == prev_barrier:
+                break
+
+            for child in kernel.id_to_insn[top].depends_on:
+                stack.append(child)
+        else:
+            # Search exhausted and we did not find prev_barrier.
+            raise LoopyError("barriers '%s' and '%s' are not ordered"
+                             % (prev_barrier, barrier))
+
+    return tuple(barriers)
+
+# }}}
+
+
+# {{{ find most recent global barrier
+
+@memoize_on_first_arg
+def find_most_recent_global_barrier(kernel, insn_id):
+    """Return the id of the latest occurring global barrier which the
+    given instruction (indirectly or directly) depends on, or *None* if this
+    instruction does not depend on a global barrier.
+
+    The return value is guaranteed to be unique because global barriers are
+    totally ordered within the kernel.
+
+    :arg kernel: The kernel containing the instruction.
+    :arg insn_id: The id of the instruction whose dependencies are searched.
+    :raises LoopyError: if the global barriers of *kernel* are not ordered.
+    """
+
+    global_barrier_order = get_global_barrier_order(kernel)
+
+    if len(global_barrier_order) == 0:
+        return None
+
+    insn = kernel.id_to_insn[insn_id]
+
+    if len(insn.depends_on) == 0:
+        return None
+
+    def is_barrier(my_insn_id):
+        insn = kernel.id_to_insn[my_insn_id]
+        from loopy.kernel.instruction import BarrierInstruction
+        return isinstance(insn, BarrierInstruction) and insn.kind == "global"
+
+    global_barrier_to_ordinal = dict(
+            (b, i) for i, b in enumerate(global_barrier_order))
+
+    def get_barrier_ordinal(barrier_id):
+        return -1 if barrier_id is None else global_barrier_to_ordinal[barrier_id]
+
+    # A direct barrier dependency does not settle the answer: a later barrier
+    # may still be reachable through one of the other dependencies. Hence
+    # every dependency contributes a candidate, and the latest one wins.
+    candidates = [
+        dep if is_barrier(dep) else find_most_recent_global_barrier(kernel, dep)
+        for dep in insn.depends_on]
+    return max(candidates, key=get_barrier_ordinal)
+
+# }}}
+
+
+# {{{ subkernel tools
+
+@memoize_on_first_arg
+def get_subkernels(kernel):
+    """Collect the ``kernel_name`` of each :class:`loopy.schedule.CallKernel`
+    item in ``kernel.schedule`` and return them, in order, as a :class:`tuple`.
+
+    :raises LoopyError: if *kernel* is not in the ``SCHEDULED`` state.
+    """
+    from loopy.kernel import kernel_state
+    if kernel.state != kernel_state.SCHEDULED:
+        raise LoopyError("Kernel must be scheduled")
+
+    from loopy.schedule import CallKernel
+
+    call_items = (item for item in kernel.schedule
+            if isinstance(item, CallKernel))
+    return tuple(item.kernel_name for item in call_items)
+
+
+@memoize_on_first_arg
+def get_subkernel_to_insn_id_map(kernel):
+    """Map each subkernel name of a scheduled *kernel* to the ids of the
+    instructions scheduled inside that subkernel.
+
+    :return: A :class:`dict` from subkernel name to a :class:`frozenset` of
+        instruction ids.
+    :raises LoopyError: if *kernel* is not in the ``SCHEDULED`` state.
+    """
+    from loopy.kernel import kernel_state
+    if kernel.state != kernel_state.SCHEDULED:
+        raise LoopyError("Kernel must be scheduled")
+
+    from loopy.schedule import (
+            sched_item_to_insn_id, CallKernel, ReturnFromKernel)
+
+    # Name of the subkernel currently being walked through, or None while
+    # outside of any CallKernel/ReturnFromKernel pair.
+    current = None
+    insn_ids_by_subkernel = {}
+
+    for item in kernel.schedule:
+        if isinstance(item, CallKernel):
+            current = item.kernel_name
+            insn_ids_by_subkernel[current] = set()
+        if isinstance(item, ReturnFromKernel):
+            current = None
+        if current is not None:
+            insn_ids_by_subkernel[current].update(sched_item_to_insn_id(item))
+
+    return dict((name, frozenset(ids))
+            for name, ids in insn_ids_by_subkernel.items())
+
+# }}}
+
+
 # vim: foldmethod=marker
diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 7c9c9688604179dce2aa7dcd6954d76a0df32cc7..2be78f8e5c25a3b48c195f52715f9d6453100e3b 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -34,7 +34,6 @@ def find_instructions(kernel, insn_match):
     match = parse_match(insn_match)
     return [insn for insn in kernel.instructions if match(kernel, insn)]
 
-
 # }}}
 
 
@@ -130,6 +129,8 @@ def remove_instructions(kernel, insn_ids):
     Dependencies across (one, for now) deleted isntructions are propagated.
     Behavior is undefined for now for chains of dependencies within the
     set of deleted instructions.
+
+    This also updates *no_sync_with* for all instructions.
     """
 
     if not insn_ids:
@@ -155,7 +156,14 @@ def remove_instructions(kernel, insn_ids):
         for dep_id in depends_on & insn_ids:
             new_deps = new_deps | id_to_insn[dep_id].depends_on
 
-        new_insns.append(insn.copy(depends_on=frozenset(new_deps)))
+        # update no_sync_with
+
+        new_no_sync_with = frozenset((insn_id, scope)
+                for insn_id, scope in insn.no_sync_with
+                if insn_id not in insn_ids)
+
+        new_insns.append(
+                insn.copy(depends_on=new_deps, no_sync_with=new_no_sync_with))
 
     return kernel.copy(
             instructions=new_insns)
@@ -171,6 +179,7 @@ def replace_instruction_ids(kernel, replacements):
     for insn in kernel.instructions:
         changed = False
         new_depends_on = []
+        new_no_sync_with = []
 
         for dep in insn.depends_on:
             if dep in replacements:
@@ -179,8 +188,18 @@ def replace_instruction_ids(kernel, replacements):
             else:
                 new_depends_on.append(dep)
 
+        for insn_id, scope in insn.no_sync_with:
+            if insn_id in replacements:
+                new_no_sync_with.extend(
+                        (repl, scope) for repl in replacements[insn_id])
+                changed = True
+            else:
+                new_no_sync_with.append((insn_id, scope))
+
         new_insns.append(
-                insn.copy(depends_on=frozenset(new_depends_on))
+                insn.copy(
+                    depends_on=frozenset(new_depends_on),
+                    no_sync_with=frozenset(new_no_sync_with))
                 if changed else insn)
 
     return kernel.copy(instructions=new_insns)
@@ -207,4 +226,79 @@ def tag_instructions(kernel, new_tag, within=None):
 # }}}
 
 
+# {{{ add nosync
+
+def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False):
+    """Add *no_sync_with* entries naming the *source* instructions to the
+    *sink* instructions.
+
+    Unless *force* is given, an entry is only added for a (source, sink) pair
+    if the sink depends on the source according to
+    ``kernel.recursive_insn_dep_map()``, or if the two instructions are in
+    conflicting groups. The presence of a memory dependency is not checked.
+
+    :arg kernel: The kernel to transform.
+    :arg scope: A valid *no_sync_with* scope. See
+        :attr:`loopy.InstructionBase.no_sync_with` for allowable scopes.
+    :arg source: Either a single instruction id, or any instruction id
+        match understood by :func:`loopy.match.parse_match`.
+    :arg sink: Either a single instruction id, or any instruction id
+        match understood by :func:`loopy.match.parse_match`.
+    :arg bidirectional: A :class:`bool`. If *True*, the source instructions
+        also receive an entry naming the sink; otherwise only the sink
+        instructions are modified.
+    :arg force: A :class:`bool`. If *True*, add the entries even when there
+        is neither a dependency nor a group conflict between the pair.
+
+    :return: A copy of *kernel* with the updated instructions.
+    """
+
+    def resolve_ids(spec):
+        # A string naming an existing instruction is taken literally; anything
+        # else is handed to find_instructions as a match expression.
+        if isinstance(spec, str) and spec in kernel.id_to_insn:
+            return frozenset([spec])
+        return frozenset(insn.id for insn in find_instructions(kernel, spec))
+
+    source_ids = resolve_ids(source)
+    sink_ids = resolve_ids(sink)
+
+    def groups_conflict(first_id, second_id):
+        first = kernel.id_to_insn[first_id]
+        second = kernel.id_to_insn[second_id]
+        if first.groups & second.conflicts_with_groups:
+            return True
+        return bool(second.groups & first.conflicts_with_groups)
+
+    def is_eligible(source_id, sink_id):
+        # force overrides the dependency / group-conflict requirement.
+        if force:
+            return True
+        if source_id in kernel.recursive_insn_dep_map()[sink_id]:
+            return True
+        return groups_conflict(source_id, sink_id)
+
+    # Maps an instruction id to the (other id, scope) pairs it should gain.
+    pending = {}
+
+    for sink_id in sink_ids:
+        for source_id in source_ids:
+            if not is_eligible(source_id, sink_id):
+                continue
+
+            pending.setdefault(sink_id, set()).add((source_id, scope))
+            if bidirectional:
+                pending.setdefault(source_id, set()).add((sink_id, scope))
+
+    # Instructions without pending entries are carried over unchanged.
+    updated = [
+        insn.copy(no_sync_with=insn.no_sync_with | frozenset(pending[insn.id]))
+        if insn.id in pending else insn
+        for insn in kernel.instructions]
+
+    return kernel.copy(instructions=updated)
+
+# }}}
+
+
 # vim: foldmethod=marker
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 94cdb499c096337db0657267d5afd472eef80a9c..1218847a7c42bd420a993d86a7534f066c2ab20e 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2108,6 +2108,50 @@ def test_barrier_insertion_near_bottom_of_loop():
     assert_barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1])
 
 
+def test_global_barrier_order_finding():
+    knl = lp.make_kernel(
+            "{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
+            """
+            for i
+                for itrip
+                    ... gbarrier {id=top}
+                    <> z[i] = z[i+1] + z[i]  {id=wr_z,dep=top}
+                    <> v[i] = 11  {id=wr_v,dep=top}
+                    ... gbarrier {dep=wr_z:wr_v,id=yoink}
+                    z[i] = z[i] - z[i+1] + v[i] {id=iupd, dep=yoink}
+                end
+                ... nop {id=nop}
+                ... gbarrier {dep=iupd,id=postloop}
+                z[i] = z[i] - z[i+1] + v[i]  {id=zzzv,dep=postloop}
+            end
+            """)
+    # The three barriers are chained by dep= annotations: top, yoink, postloop.
+    assert lp.get_global_barrier_order(knl) == ("top", "yoink", "postloop")
+    # Pairs of (instruction id, expected most recent global barrier).
+    for insn, barrier in (
+            ("nop", None),
+            ("top", None),
+            ("wr_z", "top"),
+            ("wr_v", "top"),
+            ("yoink", "top"),
+            ("postloop", "yoink"),
+            ("zzzv", "postloop")):
+        assert lp.find_most_recent_global_barrier(knl, insn) == barrier
+
+
+def test_global_barrier_error_if_unordered():
+    # FIXME: Should be illegal to declare this
+    knl = lp.make_kernel("{[i]: 0 <= i < 10}",
+            """
+            ... gbarrier
+            ... gbarrier
+            """)
+    # The two barriers carry no explicit dependencies; ordering them must fail.
+    from loopy.diagnostic import LoopyError
+    with pytest.raises(LoopyError):
+        lp.get_global_barrier_order(knl)
+
+
 def test_struct_assignment(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
diff --git a/test/test_transform.py b/test/test_transform.py
index ac5a26f6a5683bf9e86055a2729ea5ee995dee1e..b5fcdf04c4781c5f370c911ceb7efcb4042f6b4e 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -402,6 +402,42 @@ def test_precompute_with_preexisting_inames_fail():
                 precompute_inames="ii,jj")
 
 
+def test_add_nosync():
+    orig_knl = lp.make_kernel("{[i]: 0<=i<10}",
+        """
+        <>tmp[i] = 10 {id=insn1}
+        <>tmp2[i] = 10 {id=insn2}
+
+        <>tmp3[2*i] = 0 {id=insn3}
+        <>tmp4 = 1 + tmp3[2*i] {id=insn4}
+
+        <>tmp5[i] = 0 {id=insn5,groups=g1}
+        tmp5[i] = 1 {id=insn6,conflicts=g1}
+        """)
+    # Give tmp3 and tmp5 the "local" temporary scope.
+    orig_knl = lp.set_temporary_scope(orig_knl, "tmp3", "local")
+    orig_knl = lp.set_temporary_scope(orig_knl, "tmp5", "local")
+
+    # No dependency present - don't add nosync
+    knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2")
+    assert frozenset() == knl.id_to_insn["insn2"].no_sync_with
+
+    # Dependency present
+    knl = lp.add_nosync(orig_knl, "local", "writes:tmp3", "reads:tmp3")
+    assert frozenset() == knl.id_to_insn["insn3"].no_sync_with
+    assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with
+
+    # Bidirectional
+    knl = lp.add_nosync(
+            orig_knl, "local", "writes:tmp3", "reads:tmp3", bidirectional=True)
+    assert frozenset([("insn4", "local")]) == knl.id_to_insn["insn3"].no_sync_with
+    assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with
+
+    # Groups
+    knl = lp.add_nosync(orig_knl, "local", "insn5", "insn6")
+    assert frozenset([("insn5", "local")]) == knl.id_to_insn["insn6"].no_sync_with
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])