diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 0b9608493d3239a44c6eeec0c85e23500c4aa427..afd8ed762e4ee5252f73105080403542732f2104 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -142,27 +142,38 @@ def add_comment(cmt, code):
 # {{{ code generation state
 
 class CodeGenerationState(object):
-    def __init__(self, implemented_domain, c_code_mapper):
-        """
-        :param implemented_domain: The entire implemented domain,
-            i.e. all constraints that have been enforced so far.
-        :param c_code_mapper: A C code mapper that does not take per-ILP
-            assignments into account.
-        """
-        self.implemented_domain = implemented_domain
+    """
+    .. attribute:: implemented_domain
+
+        The entire implemented domain (as an :class:`islpy.Set`),
+        i.e. all constraints that have been enforced so far.
+
+    .. attribute:: implemented_predicates
+
+        A :class:`frozenset` of predicates for which checks have been
+        implemented.
 
+    .. attribute:: c_code_mapper
+
+        A :class:`loopy.codegen.expression.CCodeMapper` that does not take
+        per-ILP assignments into account.
+    """
+    def __init__(self, implemented_domain, implemented_predicates, c_code_mapper):
+        self.implemented_domain = implemented_domain
+        self.implemented_predicates = implemented_predicates
         self.c_code_mapper = c_code_mapper
 
-    def copy(self, implemented_domain=None, c_code_mapper=None):
+    def copy(self, implemented_domain=None, implemented_predicates=frozenset(),
+            c_code_mapper=None):
         return CodeGenerationState(
                 implemented_domain=implemented_domain or self.implemented_domain,
+                implemented_predicates=
+                implemented_predicates or self.implemented_predicates,
                 c_code_mapper=c_code_mapper or self.c_code_mapper)
 
     def intersect(self, other):
         new_impl, new_other = isl.align_two(self.implemented_domain, other)
-        return CodeGenerationState(
-                new_impl & new_other,
-                self.c_code_mapper)
+        return self.copy(implemented_domain=new_impl & new_other)
 
     def fix(self, iname, aff):
         new_impl_domain = self.implemented_domain
@@ -185,9 +196,9 @@ class CodeGenerationState(object):
         expr = pw_aff_to_expr(aff)
 
         new_impl_domain = new_impl_domain.add_constraint(cns)
-        return CodeGenerationState(
-                new_impl_domain,
-                self.c_code_mapper.copy_and_assign(iname, expr))
+        return self.copy(
+                implemented_domain=new_impl_domain,
+                c_code_mapper=self.c_code_mapper.copy_and_assign(iname, expr))
 
 # }}}
 
@@ -362,7 +373,9 @@ def generate_code(kernel):
 
     initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
     codegen_state = CodeGenerationState(
-            initial_implemented_domain, c_code_mapper=ccm)
+            implemented_domain=initial_implemented_domain,
+            implemented_predicates=frozenset(),
+            c_code_mapper=ccm)
 
     from loopy.codegen.loop import set_up_hw_parallel_loops
     gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state)
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 013ec5d099b892799ad1d7b83dcf91daf4db9a16..4636f07087257bd9b17904a2922485b8c662ce0d 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -25,8 +25,10 @@ THE SOFTWARE.
 """
 
 
-from loopy.codegen import CodeGenerationState, gen_code_block
+from loopy.codegen import gen_code_block
 import islpy as isl
+from loopy.schedule import (EnterLoop, LeaveLoop, RunInstruction, Barrier,
+        gather_schedule_subloop, generate_sub_sched_items)
 
 
 def get_admissible_conditional_inames_for(kernel, sched_index):
@@ -50,8 +52,6 @@ def get_admissible_conditional_inames_for(kernel, sched_index):
 
 
 def generate_code_for_sched_index(kernel, sched_index, codegen_state):
-    from loopy.schedule import (EnterLoop, RunInstruction, Barrier)
-
     sched_item = kernel.schedule[sched_index]
 
     if isinstance(sched_item, EnterLoop):
@@ -128,15 +128,29 @@ def remove_inames_for_shared_hw_axes(kernel, cond_inames):
     return frozenset(cond_inames - multi_use_inames)
 
 
+def get_required_predicates(kernel, sched_index):
+    """Return the predicates common to all schedule items under *sched_index*.
+
+    Barriers contribute no predicates. If there are no items, return an empty
+    :class:`frozenset` (not *None*) so that callers can use set operations.
+    """
+    result = None
+    for _, sched_item in generate_sub_sched_items(kernel.schedule, sched_index):
+        if isinstance(sched_item, Barrier):
+            my_preds = frozenset()
+        elif isinstance(sched_item, RunInstruction):
+            my_preds = kernel.id_to_insn[sched_item.insn_id].predicates
+        else:
+            raise RuntimeError("unexpected schedule item type: %s"
+                    % type(sched_item))
+        result = my_preds if result is None else result & my_preds
+    return frozenset() if result is None else result
+
+
 def build_loop_nest(kernel, sched_index, codegen_state):
     # Most of the complexity of this function goes towards finding groups of
     # instructions that can be nested inside a shared conditional.
 
-    assert isinstance(codegen_state, CodeGenerationState)
-
-    from loopy.schedule import (EnterLoop, LeaveLoop, RunInstruction, Barrier,
-            gather_schedule_subloop)
-
     # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices
 
     # i.e. go up to the next LeaveLoop, and skip over inner loops.
@@ -167,9 +181,23 @@ def build_loop_nest(kernel, sched_index, codegen_state):
 
     # {{{ pass 2: find admissible conditional inames for each sibling schedule item
 
-    admissible_cond_inames = [
-            get_admissible_conditional_inames_for(kernel, i)
-            for i in my_sched_indices]
+    from pytools import Record
+
+    class ScheduleIndexInfo(Record):
+        """
+        .. attribute:: schedule_index
+        .. attribute:: admissible_cond_inames
+        .. attribute:: required_predicates
+        """
+
+    sched_index_info_entries = [
+            ScheduleIndexInfo(
+                schedule_index=i,
+                admissible_cond_inames=
+                get_admissible_conditional_inames_for(kernel, i),
+                required_predicates=get_required_predicates(kernel, i)
+                )
+        for i in my_sched_indices]
 
     # }}}
 
@@ -198,15 +226,37 @@ def build_loop_nest(kernel, sched_index, codegen_state):
                     # so we can safely overapproximate here.
                     overapproximate=True)
 
-    def build_insn_group(sched_indices_and_cond_inames, codegen_state,
+    def build_insn_group(sched_index_info_entries, codegen_state,
             done_group_lengths=set()):
-        # done_group_lengths serves to prevent infinite recursion by imposing a
-        # bigger and bigger minimum size on the group of shared inames found.
-
-        if not sched_indices_and_cond_inames:
+        """
+        :arg done_group_lengths: A set of group lengths (integers) that grows from
+            empty to include 1 and upwards with every recursive call.
+            It serves to prevent infinite recursion by preventing recursive
+            calls from doing anything about groups that are too small.
+        """
+
+        # The rough plan here is that build_insn_group starts out with the
+        # entirety of the current schedule item's downward siblings (i.e. all
+        # the ones up to the next LeaveLoop). It will then iterate upward to
+        # find the largest usable conditional hoist group.
+        #
+        # It will then call itself recursively, telling its recursive instances
+        # to ignore the hoist group it just found by adding that group length
+        # to done_group_lengths. (It'll also chop the set of schedule indices
+        # considered down so that a callee cannot find a *longer* hoist group.)
+        #
+        # Upon return the hoist is wrapped around the returned code and
+        # build_insn_group calls itself for the remainder of schedule indices
+        # that were not in the hoist group.
+
+        if not sched_index_info_entries:
             return []
 
-        sched_index, cond_inames = sched_indices_and_cond_inames[0]
+        si_entry = sched_index_info_entries[0]
+        sched_index = si_entry.schedule_index
+        current_iname_set = si_entry.admissible_cond_inames
+        current_pred_set = (si_entry.required_predicates
+                - codegen_state.implemented_predicates)
 
         # {{{ grow schedule item group
 
@@ -216,28 +266,32 @@ def build_loop_nest(kernel, sched_index, codegen_state):
         bounds_check_cache = BoundsCheckCache(
                 kernel, codegen_state.implemented_domain)
 
-        current_iname_set = cond_inames
-
         found_hoists = []
 
         candidate_group_length = 1
-        while candidate_group_length <= len(sched_indices_and_cond_inames):
+        while candidate_group_length <= len(sched_index_info_entries):
             if candidate_group_length in done_group_lengths:
                 candidate_group_length += 1
                 continue
 
-            other_sched_index, other_cond_inames = \
-                    sched_indices_and_cond_inames[candidate_group_length-1]
-            current_iname_set = current_iname_set & other_cond_inames
+            current_iname_set = (
+                    current_iname_set
+                    & sched_index_info_entries[candidate_group_length-1]
+                        .admissible_cond_inames)
+            current_pred_set = (
+                    current_pred_set
+                    & sched_index_info_entries[candidate_group_length-1]
+                        .required_predicates)
 
             # {{{ see which inames are actually used in group
 
             # And only generate conditionals for those.
             from loopy.schedule import find_used_inames_within
             used_inames = set()
-            for subsched_index, _ in \
-                    sched_indices_and_cond_inames[0:candidate_group_length]:
-                used_inames |= find_used_inames_within(kernel, subsched_index)
+            for sched_index_info_entry in \
+                    sched_index_info_entries[0:candidate_group_length]:
+                used_inames |= find_used_inames_within(kernel,
+                        sched_index_info_entry.schedule_index)
 
             # }}}
 
@@ -246,17 +300,21 @@ def build_loop_nest(kernel, sched_index, codegen_state):
 
             bounds_checks = bounds_check_cache(only_unshared_inames)
 
-            if bounds_checks or bounds_checks is None or candidate_group_length == 1:
+            if (bounds_checks  # found a bounds check
+                    or bounds_checks is None  # found impossible bounds check
+                    or current_pred_set
+                    or candidate_group_length == 1):
                 # length-1 must always be an option to reach the recursion base
                 # case below
-                found_hoists.append((candidate_group_length, bounds_checks))
+                found_hoists.append((candidate_group_length,
+                    bounds_checks, current_pred_set))
 
             candidate_group_length += 1
 
         # }}}
 
         # pick largest such group
-        group_length, bounds_checks = max(found_hoists)
+        group_length, bounds_checks, pred_checks = max(found_hoists)
 
         check_set = None
         for cns in bounds_checks:
@@ -276,6 +334,11 @@ def build_loop_nest(kernel, sched_index, codegen_state):
             is_empty = check_set.is_empty()
             new_codegen_state = codegen_state.intersect(check_set)
 
+        if pred_checks:
+            new_codegen_state = new_codegen_state.copy(
+                    implemented_predicates=new_codegen_state.implemented_predicates
+                    | pred_checks)
+
         if is_empty:
             result = []
         else:
@@ -288,27 +351,27 @@ def build_loop_nest(kernel, sched_index, codegen_state):
             else:
                 # recurse with a bigger done_group_lengths
                 result = build_insn_group(
-                        sched_indices_and_cond_inames[0:group_length],
+                        sched_index_info_entries[0:group_length],
                         new_codegen_state,
                         done_group_lengths=done_group_lengths | set([group_length]))
 
-            if bounds_checks:
+            if bounds_checks or pred_checks:
                 from loopy.codegen import wrap_in_if
                 from loopy.codegen.bounds import constraint_to_code
-                result = [
-                        wrap_in_if(
-                            [constraint_to_code(codegen_state.c_code_mapper, cns)
-                                for cns in bounds_checks],
-                            gen_code_block(result))]
+
+                conditionals = [
+                        constraint_to_code(codegen_state.c_code_mapper, cns)
+                        for cns in bounds_checks] + list(pred_checks)
+
+                result = [wrap_in_if(conditionals, gen_code_block(result))]
 
         return result + build_insn_group(
-                sched_indices_and_cond_inames[group_length:], codegen_state)
+                sched_index_info_entries[group_length:], codegen_state)
 
     # }}}
 
     return gen_code_block(
-            build_insn_group(zip(
-                my_sched_indices, admissible_cond_inames), codegen_state))
+            build_insn_group(sched_index_info_entries, codegen_state))
 
 
 
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index e209f8ad07a3e0132cdfe76702eac6c22719d550..019e77262f4a21ff21e9b40146cd819a13740738 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -28,22 +28,26 @@ import islpy as isl
 from loopy.codegen import GeneratedInstruction
 
 
-def wrap_in_bounds_checks(ccm, domain, check_inames, implemented_domain, stmt):
+def wrap_in_conditionals(codegen_state, domain, check_inames, required_preds, stmt):
     from loopy.codegen.bounds import get_bounds_checks, constraint_to_code
     bounds_checks = get_bounds_checks(
             domain, check_inames,
-            implemented_domain, overapproximate=False)
+            codegen_state.implemented_domain, overapproximate=False)
 
     bounds_check_set = isl.Set.universe(domain.get_space()) \
             .add_constraints(bounds_checks)
     bounds_check_set, new_implemented_domain = isl.align_two(
-            bounds_check_set, implemented_domain)
+            bounds_check_set, codegen_state.implemented_domain)
     new_implemented_domain = new_implemented_domain & bounds_check_set
 
     if bounds_check_set.is_empty():
         return None, None
 
-    condition_codelets = [constraint_to_code(ccm, cns) for cns in bounds_checks]
+    condition_codelets = [constraint_to_code(codegen_state.c_code_mapper, cns)
+            for cns in bounds_checks]
+
+    condition_codelets.extend(
+            required_preds - codegen_state.implemented_predicates)
 
     if condition_codelets:
         from cgen import If
@@ -63,10 +67,10 @@ def generate_instruction_code(kernel, insn, codegen_state):
         raise RuntimeError("unexpected instruction type")
 
     insn_inames = kernel.insn_inames(insn)
-    insn_code, impl_domain = wrap_in_bounds_checks(
-            codegen_state.c_code_mapper,
+    insn_code, impl_domain = wrap_in_conditionals(
+            codegen_state,
             kernel.get_inames_domain(insn_inames), insn_inames,
-            codegen_state.implemented_domain,
+            insn.predicates,
             result)
 
     if insn_code is None:
@@ -95,7 +99,6 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
                 needed_dtype=target_dtype))
 
     if kernel.flags.trace_assignments or kernel.flags.trace_assignment_values:
-        from loopy.codegen import gen_code_block
         from cgen import Statement as S
 
         gs, ls = kernel.get_grid_sizes()
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 4b81951d0d65998e2e443cf0f38a6eb2a2daccdd..a3a35b082cdb4df1f2ac7023dfd2ca5e141cc506 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -181,6 +181,7 @@ def parse_insn(insn):
         insn_id = None
         priority = 0
         forced_iname_deps = frozenset()
+        predicates = frozenset()
 
         if groups["options"] is not None:
             for option in groups["options"].split(","):
@@ -206,6 +207,8 @@ def parse_insn(insn):
                     insn_deps = set(opt_value.split(":"))
                 elif opt_key == "inames":
                     forced_iname_deps = frozenset(opt_value.split(":"))
+                elif opt_key == "if":
+                    predicates = frozenset(opt_value.split(":"))
                 else:
                     raise ValueError("unrecognized instruction option '%s'"
                             % opt_key)
@@ -230,7 +233,8 @@ def parse_insn(insn):
                     forced_iname_deps=forced_iname_deps,
                     assignee=lhs, expression=rhs,
                     temp_var_type=temp_var_type,
-                    priority=priority)
+                    priority=priority,
+                    predicates=predicates)
 
     elif subst_match is not None:
         from pymbolic.primitives import Variable, Call
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index e9c125fd17c2c2938f5b3f16917732cfcada97ae..72e774c5ec62c8c59dbcd2322e248f948ab7f29d 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -349,6 +349,11 @@ class InstructionBase(Record):
         *must* be executed before this one. Note that loop scheduling augments this
         by adding dependencies on any writes to temporaries read by this instruction.
 
+    .. attribute:: predicates
+
+        A :class:`frozenset` of variable names whose truth values (as defined
+        by C) determine whether this instruction should be run.
+
     .. attribute:: forced_iname_deps
 
         A :class:`frozenset` of inames that are added to the list of iname
@@ -372,11 +377,11 @@ class InstructionBase(Record):
         Also allowed to be *None*.
     """
 
-    fields = set("id insn_deps forced_iname_deps "
+    fields = set("id insn_deps predicates forced_iname_deps "
             "priority boostable boostable_into".split())
 
     def __init__(self, id, insn_deps, forced_iname_deps, priority,
-            boostable, boostable_into):
+            boostable, boostable_into, predicates):
 
         assert isinstance(forced_iname_deps, frozenset)
         assert isinstance(insn_deps, set)
@@ -387,7 +392,8 @@ class InstructionBase(Record):
                 forced_iname_deps=forced_iname_deps,
                 priority=priority,
                 boostable=boostable,
-                boostable_into=boostable_into)
+                boostable_into=boostable_into,
+                predicates=predicates)
 
     # {{{ abstract interface
 
@@ -496,14 +502,15 @@ class ExpressionInstruction(InstructionBase):
             assignee, expression,
             id=None, forced_iname_deps=frozenset(), insn_deps=set(), boostable=None,
             boostable_into=None,
-            temp_var_type=None, priority=0):
+            temp_var_type=None, priority=0, predicates=frozenset()):
 
         InstructionBase.__init__(self,
                 id=id,
                 forced_iname_deps=forced_iname_deps,
                 insn_deps=insn_deps, boostable=boostable,
                 boostable_into=boostable_into,
-                priority=priority)
+                priority=priority,
+                predicates=predicates)
 
         from loopy.symbolic import parse
         if isinstance(assignee, str):
@@ -523,6 +530,9 @@ class ExpressionInstruction(InstructionBase):
         result = get_dependencies(self.expression)
         for _, subscript in self.assignees_and_indices():
             result = result | get_dependencies(subscript)
+
+        result = result | self.predicates
+
         return result
 
     @memoize_method
@@ -631,7 +641,7 @@ class CInstruction(InstructionBase):
             iname_exprs, code,
             read_variables=frozenset(), assignees=frozenset(),
             id=None, insn_deps=set(), forced_iname_deps=frozenset(), priority=0,
-            boostable=None, boostable_into=None):
+            boostable=None, boostable_into=None, predicates=frozenset()):
         """
         :arg iname_exprs: Like :attr:`iname_exprs`, but instead of tuples,
             simple strings pepresenting inames are also allowed. A single
@@ -647,7 +657,7 @@ class CInstruction(InstructionBase):
                 forced_iname_deps=forced_iname_deps,
                 insn_deps=insn_deps, boostable=boostable,
                 boostable_into=boostable_into,
-                priority=priority)
+                priority=priority, predicates=predicates)
 
         # {{{ normalize iname_exprs
 
@@ -697,7 +707,7 @@ class CInstruction(InstructionBase):
         for _, subscript in self.assignees_and_indices():
             result.update(get_dependencies(subscript))
 
-        return frozenset(result)
+        return frozenset(result) | self.predicates
 
     def reduction_inames(self):
         return set()
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 8826ed0396db9fa88a8fe517f34a1043cd6f92a7..507cc0e34e8afc980387bb90dc8f6c7f3e118281 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -108,8 +108,11 @@ def find_all_insn_inames(kernel):
         assert isinstance(write_deps, frozenset), type(insn)
         assert isinstance(iname_deps, frozenset), type(insn)
 
-        logger.debug("%s: find_all_insn_inames: %s (init): %s" % (
-            kernel.name, insn.id, ", ".join(sorted(iname_deps))))
+        logger.debug("%s: find_all_insn_inames: %s (init): %s - "
+                "read deps: %s - write deps: %s" % (
+                    kernel.name, insn.id, ", ".join(sorted(iname_deps)),
+                    ", ".join(sorted(read_deps)), ", ".join(sorted(write_deps)),
+                    ))
 
         insn_id_to_inames[insn.id] = iname_deps
         insn_assignee_inames[insn.id] = write_deps & kernel.all_inames()
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 93565cde1fe176136e8faec31b808ce4fcac4d69..56bba2d4144b4694a6ad29af1da5bf00958083c5 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -76,6 +76,30 @@ def gather_schedule_subloop(schedule, start_idx):
     assert False
 
 
+def generate_sub_sched_items(schedule, start_idx):
+    """Yield ``(index, item)`` for the schedule items at/under *start_idx*.
+
+    For an :class:`EnterLoop`, yield every non-loop-marker item nested in
+    that loop; for any other item, yield just that item, exactly once.
+    """
+    if not isinstance(schedule[start_idx], EnterLoop):
+        yield start_idx, schedule[start_idx]
+        return
+
+    level = 0
+    for i in range(start_idx, len(schedule)):
+        sched_item = schedule[i]
+        if isinstance(sched_item, EnterLoop):
+            level += 1
+        elif isinstance(sched_item, LeaveLoop):
+            level -= 1
+            if level == 0:
+                return
+        else:
+            yield i, sched_item
+    assert False
+
+
 def get_barrier_needing_dependency(kernel, target, source, unordered=False):
     from loopy.kernel.data import InstructionBase
     if not isinstance(source, InstructionBase):
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 2729622914ec87ca39e810662db8384302e1c935..7e48e2727985135e342ce987d22859c8dead7267 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1469,6 +1469,34 @@ def test_vector_types(ctx_factory, vec_len):
             fills_entire_output=False)
 
 
+def test_conditional(ctx_factory):
+    """Build and run a kernel whose instructions carry ``if=`` predicates."""
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            ctx.devices[0],
+            "{ [i,j]: 0<=i,j<n }",
+            """
+                <> my_a = a[i,j] {id=read_a}
+                <> a_less_than_zero = my_a < 0 {dep=read_a,inames=i:j}
+                my_a = 2*my_a {id=twice_a,dep=read_a,if=a_less_than_zero}
+                my_a = my_a+1 {id=aplus,dep=twice_a,if=a_less_than_zero}
+                out[i,j] = 2*my_a {dep=aplus}
+                """,
+            [
+                lp.GlobalArg("a", np.float32, shape=lp.auto),
+                lp.GlobalArg("out", np.float32, shape=lp.auto),
+                "..."
+                ])
+
+    ref_knl = knl
+
+    lp.auto_test_vs_ref(ref_knl, ctx, knl,
+            parameters=dict(
+                n=200
+                ))
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])