diff --git a/loopy/check.py b/loopy/check.py
index d8c63a302da8df9ec68879a81feea684a67a15b0..f490f85512817ed13710732f20973eae562bcf49 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -145,7 +145,6 @@ def check_for_write_races(kernel):
                 raise RuntimeError("temp var '%s' hasn't decided on "
                         "whether it is local" % temp_var.name)
 
-
         else:
             raise RuntimeError("invalid assignee name in instruction '%s'"
                     % insn.id)
diff --git a/loopy/kernel.py b/loopy/kernel.py
index b3acea7ffe733d12a7bd71fd64fa9026e3e29c08..09b5c9b9cb7d7334d4eda8109143f0484d866c8d 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -232,7 +232,9 @@ class Instruction(Record):
         by adding dependencies on any writes to temporaries read by this instruction.
     :ivar boostable: Whether the instruction may safely be executed
         inside more loops than advertised without changing the meaning
-        of the program. Allowed values are *None* (for unknwon), *True*, and *False*.
+        of the program. Allowed values are *None* (for unknown), *True*, and *False*.
+    :ivar boostable_into: a set of inames into which the instruction
+        may need to be boosted, as a heuristic help for the scheduler.
 
     The following two instance variables are only used until :func:`loopy.kernel.make_kernel` is
     finished:
@@ -245,6 +247,7 @@ class Instruction(Record):
     def __init__(self,
             id, assignee, expression,
             forced_iname_deps=set(), insn_deps=set(), boostable=None,
+            boostable_into=None,
             temp_var_type=None, duplicate_inames_and_tags=[]):
 
         assert isinstance(forced_iname_deps, set)
@@ -254,6 +257,7 @@ class Instruction(Record):
                 id=id, assignee=assignee, expression=expression,
                 forced_iname_deps=forced_iname_deps,
                 insn_deps=insn_deps, boostable=boostable,
+                boostable_into=boostable_into,
                 temp_var_type=temp_var_type, duplicate_inames_and_tags=duplicate_inames_and_tags)
 
     @memoize_method
@@ -276,7 +280,10 @@ class Instruction(Record):
                 self.assignee, self.expression)
 
         if self.boostable == True:
-            result += " (boostable)"
+            if self.boostable_into:
+                result += " (boostable into '%s')" % ",".join(self.boostable_into)
+            else:
+                result += " (boostable)"
         elif self.boostable == False:
             result += " (not boostable)"
         elif self.boostable is None:
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index ca6b0d0c621e56a942b66c3dd8d804bf164d16f1..30baf5e07da5cad9d97bddc35c64c6a2066b7d8e 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -287,6 +287,51 @@ def add_boostability_and_automatic_dependencies(kernel):
 
 # }}}
 
+# {{{ limit boostability
+
+def limit_boostability(kernel):
+    """Restrict each boostable instruction to a set of candidate inames.
+
+    For every instruction a *boostable_into* set is attached: the inames
+    that share an instruction with one of its own inames, minus its own
+    inames and minus inames used by exactly one instruction. Non-boostable
+    instructions get an empty set. Returns ``kernel.copy(...)`` with the
+    updated instructions; raises :exc:`RuntimeError` if an instruction's
+    *boostable* flag is still *None*.
+    """
+
+    companions = {}  # iname -> every iname seen alongside it (incl. itself)
+    use_counts = {}  # iname -> number of instructions using it
+    for insn in kernel.instructions:
+        own_inames = kernel.insn_inames(insn)
+        for iname in own_inames:
+            companions.setdefault(iname, set()).update(own_inames)
+            use_counts[iname] = use_counts.get(iname, 0) + 1
+
+    used_once = set(
+            iname for iname in use_counts if use_counts[iname] == 1)
+
+    updated_insns = []
+    for insn in kernel.instructions:
+        if insn.boostable is None:
+            raise RuntimeError("insn '%s' has undetermined boostability" % insn.id)
+
+        candidates = set()
+        if insn.boostable:
+            own_inames = kernel.insn_inames(insn)
+            for iname in own_inames:
+                candidates |= companions[iname]
+
+            # An empty result does not clear the boostable flag: that flag
+            # is still used for boosting into unused hw axes.
+            candidates -= own_inames | used_once
+
+        updated_insns.append(insn.copy(boostable_into=candidates))
+
+    return kernel.copy(instructions=updated_insns)
+
+# }}}
+
 # {{{ guess good iname for local axis 0
 
 def get_axis_0_ranking(kernel, insn):
@@ -635,6 +680,7 @@ def preprocess_kernel(kernel):
 
     kernel = assign_automatic_axes(kernel)
     kernel = add_boostability_and_automatic_dependencies(kernel)
+    kernel = limit_boostability(kernel)
     kernel = adjust_local_temp_var_storage(kernel)
 
     return kernel
diff --git a/loopy/schedule.py b/loopy/schedule.py
index cef3f31bcad53c83c72ee1546114421d5b8e9c89..5523583dfacf48f4cae835ed4c54fb5f26c80138 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -171,7 +171,7 @@ def dump_schedule(schedule):
         else:
             assert False
 
-    return " ".join(entries), len(entries)
+    return " ".join(entries)
 
 class SchedulerDebugger:
     def __init__(self, debug_length):
@@ -180,16 +180,22 @@ class SchedulerDebugger:
         self.dead_end_counter = 0
         self.debug_length = debug_length
 
+        self.elapsed_store = 0
+        self.start()
+        self.wrote_status = False
+
         self.update()
 
     def update(self):
         if (self.success_counter + self.dead_end_counter) % 50 == 0:
-            sys.stdout.write("\rscheduling... %d successes, "
-                    "%d dead ends (longest %d)" % (
-                        self.success_counter,
-                        self.dead_end_counter,
-                        len(self.longest_rejected_schedule)))
-            sys.stdout.flush()
+            if self.debug_length or self.elapsed_time() > 1:
+                sys.stdout.write("\rscheduling... %d successes, "
+                        "%d dead ends (longest %d)" % (
+                            self.success_counter,
+                            self.dead_end_counter,
+                            len(self.longest_rejected_schedule)))
+                sys.stdout.flush()
+                self.wrote_status = True
 
     def log_success(self, schedule):
         self.success_counter += 1
@@ -202,14 +208,26 @@ class SchedulerDebugger:
         self.update()
 
     def done_scheduling(self):
-        sys.stdout.write("\rscheduler finished                                         \n")
-        sys.stdout.flush()
+        if self.wrote_status:
+            sys.stdout.write("\rscheduler finished"+40*" "+"\n")
+            sys.stdout.flush()
 
+    def elapsed_time(self):  # banked time plus the interval running since start()
+        import time
+        return self.elapsed_store + time.time() - self.start_time
+
+    def stop(self):  # bank the interval since start(); start_time itself is left untouched
+        import time
+        self.elapsed_store = self.elapsed_store + (time.time() - self.start_time)
+
+    def start(self):  # begin a new timed interval at the current time
+        import time
+        self.start_time = time.time()
 # }}}
 
 # {{{ scheduling algorithm
 
-def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], debug=None):
+def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_boost=False, debug=None):
     all_insn_ids = set(insn.id for insn in kernel.instructions)
 
     scheduled_insn_ids = set(sched_item.insn_id for sched_item in schedule
@@ -248,33 +266,32 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], debug=N
                 and len(schedule) >= debug.debug_length):
             debug_mode = True
 
+    #print dump_schedule(schedule), len(schedule)
     if debug_mode:
         print kernel
         print "--------------------------------------------"
+        print "CURRENT SCHEDULE:"
         print dump_schedule(schedule)
+        print "--------------------------------------------"
 
-    #if len(schedule) == 3:
+    #if len(schedule) == 2:
         #from pudb import set_trace; set_trace()
 
-    if debug_mode:
-        print "active:", ",".join(active_inames)
-        print "entered:", ",".join(entered_inames)
-
     # }}}
 
     made_progress = False
 
     # {{{ see if any insn can be scheduled now
 
-    unscheduled_insn_ids = list(all_insn_ids - scheduled_insn_ids)
-    insns_with_satisfied_deps = set()
+    # Also take note of insns that have a chance of being schedulable inside
+    # the current loop nest, in this set:
 
-    for insn_id in unscheduled_insn_ids:
+    reachable_insn_ids = set()
+
+    for insn_id in all_insn_ids - scheduled_insn_ids:
         insn = kernel.id_to_insn[insn_id]
 
         schedule_now = set(insn.insn_deps) <= scheduled_insn_ids
-        if schedule_now:
-            insns_with_satisfied_deps.add(insn_id)
 
         if not schedule_now:
             if debug_mode:
@@ -282,53 +299,60 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], debug=N
                         insn.id, ",".join(set(insn.insn_deps) - scheduled_insn_ids))
             continue
 
-        if insn.boostable == True:
-            # If insn is boostable, it may be placed inside a more deeply
-            # nested loop without harm.
+        # If insn is boostable, it may be placed inside a more deeply
+        # nested loop without harm.
 
-            # But if it can be scheduled on the way *out* of the currently
-            # active loops, now is not the right moment.
+        # But if it can be scheduled on the way *out* of the currently
+        # active loops, now is not the right moment.
 
-            schedulable_at_loop_levels = []
+        schedulable_at_loop_levels = []
 
-            for active_loop_count in xrange(len(active_inames), -1, -1):
-                outer_active_inames = set(active_inames[:active_loop_count])
-                if (
-                        kernel.insn_inames(insn) - parallel_inames
-                        <=
-                        outer_active_inames - parallel_inames):
+        if allow_boost:
+            try_loop_counts = xrange(len(active_inames), -1, -1)
+        else:
+            try_loop_counts = [len(active_inames)]
 
-                    schedulable_at_loop_levels.append(active_loop_count)
+        for active_loop_count in try_loop_counts:
+            outer_active_inames = set(active_inames[:active_loop_count])
 
-            if schedulable_at_loop_levels != [len(active_inames)]:
-                schedule_now = False
-                if debug_mode:
-                    if schedulable_at_loop_levels:
-                        print ("instruction '%s' will be scheduled when more "
-                                "loops have been exited" % insn.id)
-                    else:
-                        print ("instruction '%s' is missing inames '%s'"
-                                % (insn.id, ",".join(
-                                    (kernel.insn_inames(insn) - parallel_inames)
-                                    -
-                                    (outer_active_inames - parallel_inames))))
+            want = kernel.insn_inames(insn) - parallel_inames
+            have = outer_active_inames - parallel_inames  # boostable_into is discounted only under allow_boost, just below
 
-        elif insn.boostable == False:
-            # If insn is not boostable, we must insist that it is placed inside
-            # the exactly correct set of loops.
+            if allow_boost:
+                have -= insn.boostable_into
 
-            schedule_now = schedule_now and (
-                    kernel.insn_inames(insn) - parallel_inames
-                    ==
-                    active_inames_set - parallel_inames)
+            if want == have:
+                schedulable_at_loop_levels.append(active_loop_count)
 
+        if schedulable_at_loop_levels != [len(active_inames)]:
+            schedule_now = False
             if debug_mode:
-                print ("instruction '%s' is not boostable and doesn't "
-                        "match the active inames" % insn.id)
+                if schedulable_at_loop_levels:
+                    print ("instruction '%s' will be scheduled when more "
+                            "loops have been exited" % insn.id)
+                else:
+                    want = (kernel.insn_inames(insn) - parallel_inames)
+                    have = (active_inames_set - parallel_inames)
+                    if want-have:
+                        print ("instruction '%s' is missing inames '%s'"
+                                % (insn.id, ",".join(want-have)))
+                    if have-want:
+                        print ("instruction '%s' won't work under inames '%s'"
+                                % (insn.id, ",".join(have-want)))
+
+        # {{{ determine reachability
+
+        want = kernel.insn_inames(insn) - parallel_inames
+        have = active_inames_set - parallel_inames
 
+        if (not schedule_now and have <= want):
+            reachable_insn_ids.add(insn_id)
         else:
-            raise RuntimeError("instruction '%s' has undetermined boostability"
-                    % insn.id)
+            if debug_mode:
+                print ("    '%s' also not reachable because it won't work under '%s'"
+                        % (insn.id, ",".join(have-want)))
+
+        # }}}
 
         if schedule_now:
             scheduled_insn_ids.add(insn.id)
@@ -346,6 +370,9 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], debug=N
         for insn_id in unscheduled_insn_ids:
             insn = kernel.id_to_insn[insn_id]
             if last_entered_loop in kernel.insn_inames(insn):
+                if debug_mode:
+                    print("cannot leave '%s' because '%s' still depends on it"
+                            % (last_entered_loop, insn.id))
                 can_leave = False
                 break
 
@@ -364,6 +391,15 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], debug=N
             - parallel_inames
             )
 
+    if debug_mode:
+        print "--------------------------------------------"
+        print "available :", ",".join(available_loops)
+        print "active:", ",".join(active_inames)
+        print "entered:", ",".join(entered_inames)
+        print "--------------------------------------------"
+        print "reachable insns:", ",".join(reachable_insn_ids)
+        print "--------------------------------------------"
+
     # Don't be eager about scheduling new loops--if progress has been made,
     # revert to top of scheduler and see if more progress can be made another
     # way. (hence 'and not made_progress')
@@ -372,27 +408,17 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], debug=N
         useful_loops = []
 
         for iname in available_loops:
-            # {{{ determine if that gets us closer to being able to scheduling an insn
+            # {{{ determine if that gets us closer to being able to schedule an insn
 
             useful = False
 
-            hypothetical_active_loops = active_inames_set | set([iname])
-            for insn_id in unscheduled_insn_ids:
-                if insn_id not in insns_with_satisfied_deps:
-                    continue
-
+            hypothetically_active_loops = active_inames_set | set([iname])
+            for insn_id in reachable_insn_ids:
                 insn = kernel.id_to_insn[insn_id]
-                if insn.boostable:
-                    # if insn is boostable, just increasing the number of used
-                    # inames is enough--not necessarily all must be truly 'useful'.
-
-                    if iname in kernel.insn_inames(insn):
-                        useful = True
-                        break
-                else:
-                    if hypothetical_active_loops <= kernel.insn_inames(insn):
-                        useful = True
-                        break
+                if (hypothetically_active_loops 
+                        <= (kernel.insn_inames(insn) | insn.boostable_into)):
+                    useful = True
+                    break
 
             if not useful:
                 if debug_mode:
@@ -425,7 +451,6 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], debug=N
 
         if debug_mode:
             print "useful inames: %s" % ",".join(useful_loops)
-            raw_input("Enter:")
 
         for tier in priority_tiers:
             found_viable_schedule = False
@@ -443,23 +468,32 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], debug=N
 
     # }}}
 
-
     if debug_mode:
         raw_input("Enter:")
 
     if not active_inames and not available_loops and not unscheduled_insn_ids:
         # if done, yield result
+        if debug is not None:  # debug defaults to None; guard as the dead-end branch does
+            debug.log_success(schedule)
         yield schedule
+
     elif made_progress:
         # if not done, but made some progress--try from the top
+        for sub_sched in generate_loop_schedules_internal(
+                kernel, loop_priority, schedule,
+                debug=debug):
+            yield sub_sched
+    else:
+        if not allow_boost:
+            # try again with boosting allowed
             for sub_sched in generate_loop_schedules_internal(
-                    kernel, loop_priority, schedule,
-                    debug=debug):
+                    kernel, loop_priority, schedule=schedule,
+                    allow_boost=True, debug=debug):
                 yield sub_sched
-    else:
-        # dead end
-        if debug is not None:
-            debug.log_dead_end(schedule)
+        else:
+            # dead end
+            if debug is not None:
+                debug.log_dead_end(schedule)
 
 # }}}
 
@@ -577,7 +611,7 @@ def insert_barriers(kernel, schedule, level=0):
 
 # {{{ main scheduling entrypoint
 
-def generate_loop_schedules(kernel, loop_priority=[], debug=False):
+def generate_loop_schedules(kernel, loop_priority=[], debug=None):
     from loopy.preprocess import preprocess_kernel
     kernel = preprocess_kernel(kernel)
 
@@ -586,13 +620,7 @@ def generate_loop_schedules(kernel, loop_priority=[], debug=False):
 
     schedule_count = 0
 
-    if debug:
-        if debug == True:
-            debug = SchedulerDebugger(None)
-        else:
-            debug = SchedulerDebugger(debug)
-    else:
-        debug = None
+    debug = SchedulerDebugger(debug)
 
     for gen_sched in generate_loop_schedules_internal(kernel, loop_priority,
             debug=debug):
@@ -605,10 +633,14 @@ def generate_loop_schedules(kernel, loop_priority=[], debug=False):
                     "This often means that local memory was "
                     "written, but never read." % ",".join(owed_barriers), LoopyAdvisory)
 
+        debug.stop()
         yield kernel.copy(schedule=gen_sched)
+        debug.start()
 
         schedule_count += 1
 
+    debug.done_scheduling()
+
     if not schedule_count:
         raise RuntimeError("no valid schedules found")