diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 1b017f701f8161e93c4fdc1c14644dfe4b4fa74c..23057cb13048c029fbc3db5ebacf58696b039286 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -957,8 +957,8 @@ Consider the following example: if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { - a_temp[lid(0)] = a[16 * gid(0) + lid(0)]; acc_k = 0.0f; + a_temp[lid(0)] = a[16 * gid(0) + lid(0)]; } barrier(CLK_LOCAL_MEM_FENCE) /* for a_temp (insn_0_k_update depends on insn) */; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) diff --git a/loopy/check.py b/loopy/check.py index da49c1d116df1a9fbf92e8ef41822b6741405604..600f5670d7fea3bc201c231061f4442201aee5cb 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -449,45 +449,9 @@ def check_has_schedulable_iname_nesting(kernel): # {{{ check_variable_access_ordered -class IndirectDependencyEdgeFinder(object): - def __init__(self, kernel): - self.kernel = kernel - self.dep_edge_cache = {} - - def __call__(self, depender_id, dependee_id): - cache_key = (depender_id, dependee_id) - - try: - result = self.dep_edge_cache[cache_key] - except KeyError: - pass - else: - if result is None: - from loopy.diagnostic import DependencyCycleFound - raise DependencyCycleFound("when " - "checking for dependency edge between " - "depender '%s' and dependee '%s'" - % (depender_id, dependee_id)) - else: - return result - - depender = self.kernel.id_to_insn[depender_id] - - if dependee_id in depender.depends_on: - self.dep_edge_cache[cache_key] = True - return True - - self.dep_edge_cache[cache_key] = None - for dep in depender.depends_on: - if self(dep, dependee_id): - self.dep_edge_cache[cache_key] = True - return True - - self.dep_edge_cache[cache_key] = False - return False - - def declares_nosync_with(kernel, var_address_space, dep_a, dep_b): + dep_a = kernel.id_to_insn[dep_a] + dep_b = kernel.id_to_insn[dep_b] from loopy.kernel.data import AddressSpace if var_address_space == AddressSpace.GLOBAL: search_scopes = ["global", "any"] @@ -510,116 
+474,181 @@ def declares_nosync_with(kernel, var_address_space, dep_a, dep_b): return ab_nosync and ba_nosync +def _get_address_space(kernel, var): + from loopy.kernel.data import ValueArg, AddressSpace, ArrayArg + if var in kernel.temporary_variables: + address_space = kernel.temporary_variables[var].address_space + else: + arg = kernel.arg_dict[var] + if isinstance(arg, ArrayArg): + address_space = arg.address_space + elif isinstance(arg, ValueArg): + address_space = AddressSpace.PRIVATE + else: + # No need to consider ConstantArg and ImageArg (for now) + # because those won't be written. + raise ValueError("could not determine address_space of '%s'" % var) + return address_space + + +def _get_topological_order(kernel): + from pytools.graph import compute_sccs + from loopy.diagnostic import DependencyCycleFound + + dep_map = {insn.id: insn.depends_on for insn in kernel.instructions} + + sccs = compute_sccs(dep_map) + order = [] + + for scc in sccs: + if len(scc) != 1: + raise DependencyCycleFound(', '.join(scc)) + order.append(scc[0]) + + return order + + def _check_variable_access_ordered_inner(kernel): + from loopy.kernel.tools import find_aliasing_equivalence_classes + from loopy.symbolic import AccessRangeOverlapChecker + overlap_checker = AccessRangeOverlapChecker(kernel) + aliasing_equiv_classes = find_aliasing_equivalence_classes(kernel) + logger.debug("%s: check_variable_access_ordered: start" % kernel.name) - checked_variables = kernel.get_written_variables() & ( - set(kernel.temporary_variables) | set(arg for arg in kernel.arg_dict)) + # dep_reqs_to_vars: A mapping from (writer_id, dep_req_id) between whom + # dependency must be established to the variables which prompted the + # dependency requirement. 
+ dep_reqs_to_vars = {} wmap = kernel.writer_map() rmap = kernel.reader_map() - from loopy.kernel.data import ValueArg, AddressSpace, ArrayArg - from loopy.kernel.tools import find_aliasing_equivalence_classes + # {{{ populate 'dep_reqs_to_vars' - depfind = IndirectDependencyEdgeFinder(kernel) - aliasing_equiv_classes = find_aliasing_equivalence_classes(kernel) - - for name in checked_variables: - # This is a tad redundant in that this could probably be restructured - # to iterate only over equivalence classes and not individual variables. - # But then the access-range overlap check below would have to be smarter. - eq_class = aliasing_equiv_classes[name] + for var in kernel.get_written_variables(): + address_space = _get_address_space(kernel, var) + eq_class = aliasing_equiv_classes[var] readers = set.union( *[rmap.get(eq_name, set()) for eq_name in eq_class]) writers = set.union( *[wmap.get(eq_name, set()) for eq_name in eq_class]) - unaliased_readers = rmap.get(name, set()) - unaliased_writers = wmap.get(name, set()) - - if not writers: - continue - - if name in kernel.temporary_variables: - address_space = kernel.temporary_variables[name].address_space - else: - arg = kernel.arg_dict[name] - if isinstance(arg, ArrayArg): - address_space = arg.address_space - elif isinstance(arg, ValueArg): - address_space = AddressSpace.PRIVATE - else: - # No need to consider ConstantArg and ImageArg (for now) - # because those won't be written. - raise ValueError("could not determine address_space of '%s'" % name) - - # Check even for PRIVATE address space, to ensure intentional program order. 
- from loopy.symbolic import AccessRangeOverlapChecker - overlap_checker = AccessRangeOverlapChecker(kernel) + for writer in writers: + required_deps = (readers | writers) - set([writer]) + required_deps = set([req_dep for req_dep in required_deps if not + declares_nosync_with(kernel, address_space, writer, + req_dep)]) - for writer_id in writers: - for other_id in readers | writers: - if writer_id == other_id: - continue - - writer = kernel.id_to_insn[writer_id] - other = kernel.id_to_insn[other_id] + for req_dep in required_deps: + dep_reqs_to_vars.setdefault((writer, req_dep), set()).add(var) - has_dependency_relationship = ( - declares_nosync_with(kernel, address_space, other, writer) - or - depfind(writer_id, other_id) - or - depfind(other_id, writer_id) - ) + # }}} - if has_dependency_relationship: - continue + # depends_on: mapping from insn_ids to their dependencies + depends_on = dict((insn.id, set()) for insn in + kernel.instructions) + # rev_depends: mapping from insn_ids to their reverse deps. + rev_depends = dict((insn.id, set()) for insn in + kernel.instructions) - is_relationship_by_aliasing = not ( - writer_id in unaliased_writers - and (other_id in unaliased_writers - or other_id in unaliased_readers)) + # {{{ populate rev_depends, depends_on - # Do not enforce ordering for disjoint access ranges - if (not is_relationship_by_aliasing and not - overlap_checker.do_access_ranges_overlap_conservative( - writer_id, "w", other_id, "any", name)): - continue + for insn in kernel.instructions: + depends_on[insn.id].update(insn.depends_on) + for dep in insn.depends_on: + rev_depends[dep].add(insn.id) + # }}} + + topological_order = _get_topological_order(kernel) + + def discard_dep_reqs_in_order(dep_reqs_to_vars, edges, order): + """ + Subtracts dependency requirements of insn_ids by all direct/indirect + predecessors of a directed graph of insn_ids as nodes and *edges* as + the connectivity. 
+ + :arg order: An instance of :class:`list` of instruction ids in which the + *edges* graph is to be traversed. + """ + # memoized_predecessors: mapping from insn_id to its direct/indirect + # predecessors + memoized_predecessors = {} + + # reverse postorder traversal of dependency graph + for insn_id in order: + # accumulated_predecessors:insn_id's direct+indirect predecessors + accumulated_predecessors = memoized_predecessors.pop(insn_id, set()) + + for pred in accumulated_predecessors: + dep_reqs_to_vars.pop((insn_id, pred), None) + + for successor in edges[insn_id]: + memoized_predecessors.setdefault(successor, set()).update( + accumulated_predecessors | set([insn_id])) + + # forward dep. graph traversal in reverse topological sort order + discard_dep_reqs_in_order(dep_reqs_to_vars, depends_on, + topological_order[::-1]) + # reverse dep. graph traversal in topological sort order + discard_dep_reqs_in_order(dep_reqs_to_vars, rev_depends, topological_order) + + # {{{ handle dependency requirements that weren't satisfied + + for (writer_id, other_id), variables in six.iteritems(dep_reqs_to_vars): + writer = kernel.id_to_insn[writer_id] + other = kernel.id_to_insn[other_id] + + for var in variables: + eq_class = aliasing_equiv_classes[var] + unaliased_readers = rmap.get(var, set()) + unaliased_writers = wmap.get(var, set()) + + is_relationship_by_aliasing = not ( + writer_id in unaliased_writers + and (other_id in unaliased_writers + or other_id in unaliased_readers)) + + # Do not enforce ordering for disjoint access ranges + if (not is_relationship_by_aliasing and not + overlap_checker.do_access_ranges_overlap_conservative( + writer_id, "w", other_id, "any", var)): + continue - # Do not enforce ordering for aliasing-based relationships - # in different groups. 
- if (is_relationship_by_aliasing and ( - bool(writer.groups & other.conflicts_with_groups) - or - bool(other.groups & writer.conflicts_with_groups))): - continue + # Do not enforce ordering for aliasing-based relationships + # in different groups. + if (is_relationship_by_aliasing and ( + bool(writer.groups & other.conflicts_with_groups) + or + bool(other.groups & writer.conflicts_with_groups))): + continue - msg = ("No dependency relationship found between " - "'{writer_id}' which writes {var} and " - "'{other_id}' which also accesses {var}. " - "Either add a (possibly indirect) dependency " - "between the two, or add them to each others' nosync " - "set to indicate that no ordering is intended, or " - "turn off this check by setting the " - "'enforce_variable_access_ordered' option " - "(more issues of this type may exist--only reporting " - "the first one)" - .format( - writer_id=writer_id, - other_id=other_id, - var=( - "the variable '%s'" % name - if len(eq_class) == 1 - else ( - "the aliasing equivalence class '%s'" - % ", ".join(eq_class)) - ))) - - from loopy.diagnostic import VariableAccessNotOrdered - raise VariableAccessNotOrdered(msg) + msg = ("No dependency relationship found between " + "'{writer_id}' which writes {var} and " + "'{other_id}' which also accesses {var}. 
" + "Either add a (possibly indirect) dependency " + "between the two, or add them to each others' nosync " + "set to indicate that no ordering is intended, or " + "turn off this check by setting the " + "'enforce_variable_access_ordered' option " + "(more issues of this type may exist--only reporting " + "the first one)" + .format( + writer_id=writer_id, + other_id=other_id, + var=( + "the variable '%s'" % var + if len(eq_class) == 1 + else ( + "the aliasing equivalence class '%s'" + % ", ".join(eq_class)) + ))) + + from loopy.diagnostic import VariableAccessNotOrdered + raise VariableAccessNotOrdered(msg) + + # }}} logger.debug("%s: check_variable_access_ordered: done" % kernel.name) @@ -628,7 +657,7 @@ def check_variable_access_ordered(kernel): """Checks that between each write to a variable and all other accesses to the variable there is either: - * an (at least indirect) depdendency edge, or + * a direct/indirect depdendency edge, or * an explicit statement that no ordering is necessary (expressed through a bi-directional :attr:`loopy.Instruction.no_sync_with`) """ diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index b736191ec1dadb842e12453fbec3b68e831338f6..90b1b91f70b56f7da21705e573f35dfff79dbe65 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -55,57 +55,57 @@ def get_approximate_convex_bounds_checks(domain, check_inames, implemented_domai # {{{ on which inames may a conditional depend? 
-def get_usable_inames_for_conditional(kernel, sched_index): +def get_usable_inames_for_conditional(kernel, sched_indices): + from loopy.schedule import ( - find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within) + find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within, + get_subkernel_indices) from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase, VectorizeTag, IlpBaseTag) - - result = find_active_inames_at(kernel, sched_index) - crosses_barrier = has_barrier_within(kernel, sched_index) - + active_inames_list = find_active_inames_at(kernel, sched_indices) + crosses_barrier_list = has_barrier_within(kernel, sched_indices) # Find our containing subkernel. Grab inames for all insns from there. - within_subkernel = False - - for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index]): - from loopy.schedule import CallKernel, ReturnFromKernel - if isinstance(sched_item, CallKernel): - within_subkernel = True - subkernel_index = sched_item_index - elif isinstance(sched_item, ReturnFromKernel): - within_subkernel = False - - if not within_subkernel: - # Outside all subkernels - use only inames available to host. - return frozenset(result) - - insn_ids_for_subkernel = get_insn_ids_for_block_at( - kernel.schedule, subkernel_index) - - inames_for_subkernel = ( - iname - for insn in insn_ids_for_subkernel - for iname in kernel.insn_inames(insn)) - - for iname in inames_for_subkernel: - # Parallel inames are defined within a subkernel, BUT: - # - # - local indices may not be used in conditionals that cross barriers. - # - # - ILP indices and vector lane indices are not available in loop - # bounds, they only get defined at the innermost level of nesting. 
- - if ( - kernel.iname_tags_of_type(iname, ConcurrentTag) - and not kernel.iname_tags_of_type(iname, VectorizeTag) - and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase) - and crosses_barrier) - and not kernel.iname_tags_of_type(iname, IlpBaseTag) - ): - result.add(iname) - - return frozenset(result) + subkernel_index_list = get_subkernel_indices(kernel, sched_indices) + + inames_for_subkernel = {} + + for subknl_idx in set(idx for idx in subkernel_index_list if idx is not None): + insn_ids_for_subkernel = get_insn_ids_for_block_at( + kernel.schedule, subknl_idx) + + all_inames_in_the_subknl = set([ + iname + for insn in insn_ids_for_subkernel + for iname in kernel.insn_inames(insn)]) + + def is_eligible_in_conditional(iname): + # Parallel inames are defined within a subkernel, BUT: + # + # - ILP indices and vector lane indices are not available in loop + # bounds, they only get defined at the innermost level of nesting. + return ( + kernel.iname_tags_of_type(iname, ConcurrentTag) + and not kernel.iname_tags_of_type(iname, VectorizeTag) + and not kernel.iname_tags_of_type(iname, IlpBaseTag)) + + inames_for_subkernel[subknl_idx] = [iname for iname in + all_inames_in_the_subknl if is_eligible_in_conditional(iname)] + + result = [] + + for active_inames, crosses_barrier, subknl_idx in zip(active_inames_list, + crosses_barrier_list, subkernel_index_list): + if subknl_idx is not None: + for iname in inames_for_subkernel[subknl_idx]: + # local indices may not be used in conditionals that cross barriers + if (not (kernel.iname_tags_of_type(iname, LocalIndexTagBase) + and crosses_barrier)): + active_inames.add(iname) + + result.append(frozenset(active_inames)) + + return result # }}} diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 7319b16ac2fe9f39872558a3878161b89cab15d9..c90f4c6b1b92ffa4648804555b1716d5c190759e 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -279,15 +279,17 @@ def build_loop_nest(codegen_state, 
schedule_index): from loopy.schedule import find_used_inames_within from loopy.codegen.bounds import get_usable_inames_for_conditional + admissible_cond_inames = get_usable_inames_for_conditional(kernel, + my_sched_indices) + sched_index_info_entries = [ ScheduleIndexInfo( - schedule_indices=[i], - admissible_cond_inames=( - get_usable_inames_for_conditional(kernel, i)), - required_predicates=get_required_predicates(kernel, i), - used_inames_within=find_used_inames_within(kernel, i) + schedule_indices=[my_sched_idx], + admissible_cond_inames=admissible_cond_inames[i], + required_predicates=get_required_predicates(kernel, my_sched_idx), + used_inames_within=find_used_inames_within(kernel, my_sched_idx) ) - for i in my_sched_indices + for i, my_sched_idx in enumerate(my_sched_indices) ] sched_index_info_entries = group_by( diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index b3a87798840bb1624d350c79830f29142e54ab6c..c7489e759b5e4217837db48dd462d0bd23d2bcea 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -353,7 +353,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): from loopy.codegen.bounds import get_usable_inames_for_conditional # Note: this does not include loop_iname itself! 
- usable_inames = get_usable_inames_for_conditional(kernel, sched_index) + usable_inames, = get_usable_inames_for_conditional(kernel, (sched_index,)) domain = kernel.get_inames_domain(loop_iname) result = [] diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 2d926aad4faa511aa2919630c9b0e96b7f253ad9..ddd245261ab0f064e25060122d4d6af65e889c58 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -744,9 +744,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ iname wrangling + @memoize_method def iname_tags(self, iname): return self.iname_to_tags.get(iname, frozenset()) + @memoize_method def iname_tags_of_type(self, iname, tag_type_or_types, max_num=None, min_num=None): """Return a subset of *tags* that matches type *tag_type*. Raises exception diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 032cdc2760597f1fa6f701a8a88252312deac797..43a2b2657ba9f192828c72b0dd700271e594b0d7 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -106,6 +106,16 @@ class Barrier(ScheduleItem): # {{{ schedule utilities def gather_schedule_block(schedule, start_idx): + """ + Returns a :class:`tuple` of (list of schedule items, index just after the + block) for a block. + + :arg schedule: An instance of :class:`list` of + :class:`loopy.schedule.ScheduleItem`s. + + :arg start_idx: The index of a :class:`loopy.schedule.BeginBlockItem` of + the block whose schedule items are to be returned. + """ assert isinstance(schedule[start_idx], BeginBlockItem) level = 0 @@ -157,32 +167,107 @@ def get_insn_ids_for_block_at(schedule, start_idx): if isinstance(sub_sched_item, RunInstruction)) -def find_active_inames_at(kernel, sched_index): +def find_active_inames_at(kernel, sched_indices): + """ + Returns an instance of :class:`list` of :class:`set`s of inames occurring at + each schedule index in *sched_indices*. + + :arg sched_indices: A list of schedule indices of *kernel*. 
+ """ active_inames = [] + sched_idx_to_active_inames = {0: set()} + + sorted_sched_indices = sorted(sched_indices) from loopy.schedule import EnterLoop, LeaveLoop - for sched_item in kernel.schedule[:sched_index]: + + max_sched_idx = sorted_sched_indices[-1] + + if sorted_sched_indices and sorted_sched_indices[0] == 0: + sorted_sched_indices.pop(0) + + for sched_idx, sched_item in enumerate( + kernel.schedule[:max_sched_idx]): if isinstance(sched_item, EnterLoop): active_inames.append(sched_item.iname) if isinstance(sched_item, LeaveLoop): active_inames.pop() - return set(active_inames) + if sched_idx == (sorted_sched_indices[0]-1): + sched_idx_to_active_inames[sched_idx+1] = set(active_inames) + sorted_sched_indices.pop(0) + # eventually everythin should be popped + assert len(sorted_sched_indices) == 0 -def has_barrier_within(kernel, sched_index): - sched_item = kernel.schedule[sched_index] + return [sched_idx_to_active_inames[idx] for idx in sched_indices] - if isinstance(sched_item, BeginBlockItem): - loop_contents, _ = gather_schedule_block( - kernel.schedule, sched_index) - from pytools import any - return any(isinstance(subsched_item, Barrier) - for subsched_item in loop_contents) - elif isinstance(sched_item, Barrier): - return True - else: - return False + +def has_barrier_within(kernel, sched_indices): + """ + Returns a :class:`list` of :class:`bool`s, with an entry for each schedule + index in *sched_indices* denoting if either there is barrier at the + schedule index or the schedule index is a + :class:`loopy.schedule.BeginBlockItem` containing a barrier in the block. + + :arg sched_indices: A list of schedule indices of *kernel*. 
+ """ + sched_idx_to_has_barrier_within = {} + begin_block_sched_indices = [] + + for sched_idx in sched_indices: + sched_item = kernel.schedule[sched_idx] + + if isinstance(sched_item, Barrier): + sched_idx_to_has_barrier_within[sched_idx] = True + elif isinstance(sched_item, BeginBlockItem): + begin_block_sched_indices.append(sched_idx) + else: + sched_idx_to_has_barrier_within[sched_idx] = False + + begin_block_sched_indices.sort() + + for sched_idx in begin_block_sched_indices: + if sched_idx in sched_idx_to_has_barrier_within: + # this block has already been dealt in a previous block + continue + + block_contents, _ = gather_schedule_block( + kernel.schedule, sched_idx) + + level = 1 + # block_stack: list of [sched_idx, has_barrier_within]'s for every + # level of block + block_stack = [[sched_idx, False]] + + for i, sched_item in enumerate(block_contents[1:], start=sched_idx+1): + if level == 0: + break + + if isinstance(sched_item, BeginBlockItem): + level += 1 + + block_stack.append([i, False]) + elif isinstance(sched_item, EndBlockItem): + level -= 1 + + exit_block_sched_idx, exit_block_contains_barrier = block_stack.pop() + + if block_stack: + # inner block contains barrier => outer block contains barrier + block_stack[-1][1] |= exit_block_contains_barrier + + sched_idx_to_has_barrier_within[exit_block_sched_idx] = ( + exit_block_contains_barrier) + elif isinstance(sched_item, Barrier): + block_stack[-1][1] = True + else: + pass + + assert level == 0 + + return [sched_idx_to_has_barrier_within[sched_idx] for sched_idx in + sched_indices] def find_used_inames_within(kernel, sched_index): @@ -415,6 +500,45 @@ def sched_item_to_insn_id(sched_item): and sched_item.originating_insn_id is not None): yield sched_item.originating_insn_id + +def get_subkernel_indices(kernel, sched_indices): + """ + Returns an instance of :class:`list` of :class:`int`s, with an entry for + each schedule index in *sched_indices* denoting the index of + 
:class:`loopy.schedule.CallKernel` for the subkernel it is in, if no + subkernel contains a schedule index its entry is set to *None*. + + :arg sched_indices: A list of schedule indices of *kernel*. + """ + from loopy.schedule import CallKernel, ReturnFromKernel + + subkernel_index = None + sorted_sched_indices = sorted(sched_indices) + sched_idx_to_subkernel_idx = {0: None} + + max_sched_idx = sorted_sched_indices[-1] + + if sorted_sched_indices and sorted_sched_indices[0] == 0: + sorted_sched_indices.pop(0) + + for sched_idx, sched_item in enumerate( + kernel.schedule[:max_sched_idx]): + if isinstance(sched_item, CallKernel): + subkernel_index = sched_idx + elif isinstance(sched_item, ReturnFromKernel): + subkernel_index = None + + if sched_idx == (sorted_sched_indices[0]-1): + sched_idx_to_subkernel_idx[sched_idx+1] = subkernel_index + sorted_sched_indices.pop(0) + + # eventually everything should be popped + assert len(sorted_sched_indices) == 0 + + return [sched_idx_to_subkernel_idx[sched_idx] for sched_idx in + sched_indices] + + # }}} @@ -571,6 +695,45 @@ class SchedulerState(ImmutableRecord): .. attribute:: loop_priority + #FIXME: incorrect docs. See :func:`loop_nest_around_map`. .. attribute:: breakable_inames @@ -586,6 +711,10 @@ class SchedulerState(ImmutableRecord): .. rubric:: Time-varying scheduler state + .. attribute:: insn_ids_to_try + + #FIXME: docs? + .. attribute:: active_inames A tuple of active inames. @@ -641,6 +770,10 @@ class SchedulerState(ImmutableRecord): Used to produce warnings about deprecated 'boosting' behavior Should be removed along with boostability in 2017.x. + + .. 
attribute:: insns_in_topologically_sorted_order + + A list of loopy :class:`Instruction` objects in topologically sorted order """ @property @@ -651,6 +784,93 @@ class SchedulerState(ImmutableRecord): return None +def get_insns_in_topologically_sorted_order(kernel): + from pytools.graph import compute_topological_order + + rev_dep_map = {insn.id: set() for insn in kernel.instructions} + for insn in kernel.instructions: + for dep in insn.depends_on: + rev_dep_map[dep].add(insn.id) + + ids = compute_topological_order(rev_dep_map) + return [kernel.id_to_insn[insn_id] for insn_id in ids] + + +def schedule_as_many_run_insns_as_possible(sched_state): + """ + Returns an instance of :class:`loopy.schedule.SchedulerState`, by appending + all available instructions in the current loop nesting to the schedule. + """ + + next_preschedule_item = ( + sched_state.preschedule[0] + if len(sched_state.preschedule) > 0 + else None) + + if isinstance(next_preschedule_item, (CallKernel, ReturnFromKernel, Barrier)): + return sched_state + + if not sched_state.within_subkernel: + # cannot schedule RunInstructions when not in subkernel + return sched_state + + have_inames = frozenset(sched_state.active_inames) | sched_state.parallel_inames + + toposorted_insns = sched_state.insns_in_topologically_sorted_order + + # select the top instructions in toposorted_insns only which have active + # inames corresponding to those of sched_state + from loopy.kernel.instruction import MultiAssignmentBase + + updated_sched_state = sched_state.copy() + + newly_scheduled_insn_ids = [] + ignored_unscheduled_insn_ids = set() + + for insn in toposorted_insns: + if insn.id in sched_state.scheduled_insn_ids: + continue + if not insn.within_inames >= have_inames: + ignored_unscheduled_insn_ids.add(insn.id) + continue + if isinstance(insn, MultiAssignmentBase): + if (insn.within_inames - sched_state.parallel_inames) == frozenset( + sched_state.active_inames) and not (insn.depends_on & + 
ignored_unscheduled_insn_ids): + newly_scheduled_insn_ids.append(insn.id) + continue + break + + num_presched_insns_newly_scheduled = len(set(newly_scheduled_insn_ids) & + sched_state.prescheduled_insn_ids) + + assert all(isinstance(sched_item, RunInstruction) and sched_item.insn_id in + newly_scheduled_insn_ids for sched_item in + sched_state.preschedule[:num_presched_insns_newly_scheduled]) + sched_items = tuple(RunInstruction(insn_id=insn_id) for insn_id in + newly_scheduled_insn_ids) + + updated_schedule = updated_sched_state.schedule + sched_items + updated_scheduled_insn_ids = (updated_sched_state.scheduled_insn_ids + | frozenset(newly_scheduled_insn_ids)) + updated_unscheduled_insn_ids = ( + updated_sched_state.unscheduled_insn_ids + - frozenset(newly_scheduled_insn_ids)) + if newly_scheduled_insn_ids: + new_insn_ids_to_try = None + else: + new_insn_ids_to_try = sched_state.insn_ids_to_try + updated_sched_state = updated_sched_state.copy( + insn_ids_to_try=new_insn_ids_to_try, + schedule=updated_schedule, + scheduled_insn_ids=updated_scheduled_insn_ids, + unscheduled_insn_ids=updated_unscheduled_insn_ids, + preschedule=sched_state.preschedule[num_presched_insns_newly_scheduled:] + ) + + return updated_sched_state + + def generate_loop_schedules_internal( sched_state, allow_boost=False, debug=None): # allow_insn is set to False initially and after entering each loop @@ -664,6 +884,10 @@ def generate_loop_schedules_internal( else: rec_allow_boost = False + if not rec_allow_boost: + sched_state = ( + schedule_as_many_run_insns_as_possible(sched_state)) + active_inames_set = frozenset(sched_state.active_inames) next_preschedule_item = ( @@ -1041,19 +1265,20 @@ def generate_loop_schedules_internal( break if can_leave and not debug_mode: - - for sub_sched in generate_loop_schedules_internal( - sched_state.copy( + new_sched_state = sched_state.copy( schedule=( sched_state.schedule + (LeaveLoop(iname=last_entered_loop),)), 
active_inames=sched_state.active_inames[:-1], + insn_ids_to_try=insn_ids_to_try, preschedule=( sched_state.preschedule if last_entered_loop not in sched_state.prescheduled_inames - else sched_state.preschedule[1:]), - ), + else sched_state.preschedule[1:])) + + for sub_sched in generate_loop_schedules_internal( + new_sched_state, allow_boost=rec_allow_boost, debug=debug): yield sub_sched @@ -1254,8 +1479,7 @@ def generate_loop_schedules_internal( iname), reverse=True): - for sub_sched in generate_loop_schedules_internal( - sched_state.copy( + new_sched_state = sched_state.copy( schedule=( sched_state.schedule + (EnterLoop(iname=iname),)), @@ -1264,11 +1488,15 @@ def generate_loop_schedules_internal( entered_inames=( sched_state.entered_inames | frozenset((iname,))), + insn_ids_to_try=insn_ids_to_try, preschedule=( sched_state.preschedule if iname not in sched_state.prescheduled_inames else sched_state.preschedule[1:]), - ), + ) + + for sub_sched in generate_loop_schedules_internal( + new_sched_state, allow_boost=rec_allow_boost, debug=debug): found_viable_schedule = True @@ -1916,7 +2144,10 @@ def generate_loop_schedules_inner(kernel, debug_args={}): group_insn_counts=group_insn_counts(kernel), active_group_counts={}, - uses_of_boostability=[]) + uses_of_boostability=[], + insns_in_topologically_sorted_order=( + get_insns_in_topologically_sorted_order(kernel)), + ) schedule_gen_kwargs = {} if kernel.options.ignore_boostable_into: