diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecb45edbf0af51bdcbb1efcbbee8e18e0f3b500f
--- /dev/null
+++ b/examples/python/global_barrier_removal.py
@@ -0,0 +1,28 @@
+import numpy as np
+import loopy as lp
+import pyopencl as cl
+import pyopencl.array
+
+knl = lp.make_kernel(  # two statements: c is filled from a, then out from c
+        "{ [i,k]: 0<=i<n and 0<=k<3 }",
+        """c[k,i] = a[k, i + 1]
+           out[k,i] = c[k,i]""",
+        ["..."])
+
+# transform: split "i" by 128 (outer part tagged g.0, inner part tagged l.0)
+knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+from loopy.kernel.tools import add_dtypes
+knl = add_dtypes(knl, {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
+
+# schedule: preprocess the kernel, then obtain one schedule for it and show it
+from loopy.preprocess import preprocess_kernel
+knl = preprocess_kernel(knl)
+
+from loopy.schedule import get_one_scheduled_kernel
+knl = get_one_scheduled_kernel(knl)
+print(knl)
+
+# map schedule onto host or device, then print the kernel again
+from loopy.codegen.device_mapping import map_schedule_onto_host_or_device
+knl = map_schedule_onto_host_or_device(knl)
+print(knl)
diff --git a/loopy/codegen/device_mapping.py b/loopy/codegen/device_mapping.py
new file mode 100644
index 0000000000000000000000000000000000000000..d40b0c4d255eda0159c77fbf56889b0a55e5f347
--- /dev/null
+++ b/loopy/codegen/device_mapping.py
@@ -0,0 +1,138 @@
+from pytools import Record
+
+
+class HostForLoop(Record):  # NOTE(review): presumably a host-side loop node
+    pass  # placeholder: no fields yet; not referenced elsewhere in this module
+
+
+class HostConditional(Record):  # NOTE(review): presumably a host-side branch
+    pass  # placeholder: no fields yet; not referenced elsewhere in this module
+
+
+class HostBlock(Record):  # NOTE(review): presumably a host statement sequence
+    pass  # placeholder: no fields yet; not referenced elsewhere in this module
+
+
+class HostInvokeKernel(Record):  # NOTE(review): presumably a kernel launch
+    pass  # placeholder: no fields yet; not referenced elsewhere in this module
+
+
+def map_schedule_onto_host_or_device(kernel):
+    """Split the schedule of *kernel* into host-side and device-side parts.
+
+    Every ``"global"`` barrier is dropped from the schedule, and each run of
+    items it separates is wrapped in a ``CallKernel``/``ReturnFromKernel``
+    pair carrying a freshly generated kernel name. A loop whose body had to
+    be split this way keeps its ``EnterLoop``/``LeaveLoop`` items outside of
+    any kernel call. If nothing had to be split, the entire schedule is
+    wrapped in a single kernel call named ``kernel.name``.
+
+    :arg kernel: a kernel whose ``schedule`` attribute has been filled in
+    :returns: a copy of *kernel* carrying the rewritten schedule
+    :raises ValueError: if the schedule contains an unsupported item type
+    """
+    from pytools import UniqueNameGenerator
+    kernel_name_gen = UniqueNameGenerator(forced_prefix=kernel.name)
+
+    from loopy.schedule import (
+        RunInstruction, EnterLoop, LeaveLoop, Barrier,
+        CallKernel, ReturnFromKernel)
+
+    # TODO: Assert that the kernel has been scheduled, etc.
+    schedule = kernel.schedule
+
+    # Map from the index of each EnterLoop to the index of its LeaveLoop
+    loop_bounds = {}
+    active_loops = []
+    for idx, sched_item in enumerate(schedule):
+        if isinstance(sched_item, EnterLoop):
+            active_loops.append(idx)
+        elif isinstance(sched_item, LeaveLoop):
+            loop_bounds[active_loops.pop()] = idx
+    del active_loops
+
+    def wrap_in_kernel_call(chunk, kernel_name):
+        """Return *chunk* enclosed in a kernel call named *kernel_name*."""
+        return (
+            # TODO: Infer kernel arguments
+            [CallKernel(kernel_name=kernel_name)] +
+            # TODO: Load state into here
+            chunk +
+            # TODO: Save state right here
+            [ReturnFromKernel(kernel_name=kernel_name)])
+
+    # {{{ Inner mapper function
+
+    def inner_mapper(start_idx, end_idx, new_schedule):
+        """Map ``schedule[start_idx:end_idx + 1]`` into *new_schedule*.
+
+        :returns: whether a global barrier forced this range to be split
+        """
+        # XXX: Doesn't do dependency analysis yet....
+        schedule_required_splitting = False
+
+        i = start_idx
+        current_chunk = []
+        while i <= end_idx:
+            sched_item = schedule[i]
+
+            if isinstance(sched_item, RunInstruction):
+                current_chunk.append(sched_item)
+                i += 1
+
+            elif isinstance(sched_item, EnterLoop):
+                loop_end = loop_bounds[i]
+                inner_schedule = []
+                loop_required_splitting = inner_mapper(
+                    i + 1, loop_end - 1, inner_schedule)
+
+                loop_items = [sched_item] + inner_schedule + [schedule[loop_end]]
+                i = loop_end + 1
+
+                if loop_required_splitting:
+                    schedule_required_splitting = True
+                    # Items gathered before the loop become their own kernel.
+                    if current_chunk:
+                        # TODO: Do a better job of naming the kernel...
+                        new_schedule.extend(wrap_in_kernel_call(
+                            current_chunk, kernel_name_gen()))
+                    new_schedule.extend(loop_items)
+                    current_chunk = []
+                else:
+                    current_chunk.extend(loop_items)
+
+            elif isinstance(sched_item, Barrier):
+                if sched_item.kind == "global":
+                    # Wrap the current chunk into a kernel call. The barrier
+                    # itself is not carried over into the new schedule.
+                    schedule_required_splitting = True
+                    if current_chunk:
+                        # TODO: Do a better job of naming the kernel
+                        new_schedule.extend(wrap_in_kernel_call(
+                            current_chunk, kernel_name_gen()))
+                    current_chunk = []
+                else:
+                    current_chunk.append(sched_item)
+                i += 1
+            else:
+                raise ValueError(
+                    "unexpected schedule item of type '%s' at index %d"
+                    % (type(sched_item).__name__, i))
+
+        if current_chunk and schedule_required_splitting:
+            # Wrap remainder of schedule into a kernel call.
+            new_schedule.extend(
+                wrap_in_kernel_call(current_chunk, kernel_name_gen()))
+        else:
+            new_schedule.extend(current_chunk)
+
+        return schedule_required_splitting
+
+    # }}}
+
+    new_schedule = []
+    split_kernel = inner_mapper(0, len(schedule) - 1, new_schedule)
+    if not split_kernel:
+        # Wrap everything into a kernel call.
+        new_schedule = wrap_in_kernel_call(new_schedule, kernel.name)
+    return kernel.copy(schedule=new_schedule)
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 8094995d7972c2d203c0d3148419aaba252fbc77..7ac8778378869f7c9cf6cfbe8d4be145e9fad520 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -62,6 +62,14 @@ class RunInstruction(ScheduleItem):
     hash_fields = __slots__ = ["insn_id"]
 
 
+class CallKernel(ScheduleItem):  # schedule marker opening a kernel's items
+    hash_fields = __slots__ = ["kernel_name"]  # sole field: the kernel's name
+
+
+class ReturnFromKernel(ScheduleItem):  # closes the matching CallKernel
+    hash_fields = __slots__ = ["kernel_name"]  # same name as that CallKernel
+
+
 class Barrier(ScheduleItem):
     """
     .. attribute:: comment
@@ -363,6 +371,12 @@ def dump_schedule(kernel, schedule):
         elif isinstance(sched_item, LeaveLoop):
             indent = indent[:-4]
             lines.append(indent + "ENDLOOP %s" % sched_item.iname)
+        elif isinstance(sched_item, CallKernel):
+            lines.append(indent + "CALL KERNEL %s" % sched_item.kernel_name)
+            indent += "    "
+        elif isinstance(sched_item, ReturnFromKernel):
+            indent = indent[:-4]
+            lines.append(indent + "RETURN FROM KERNEL %s" % sched_item.kernel_name)
         elif isinstance(sched_item, RunInstruction):
             insn = kernel.id_to_insn[sched_item.insn_id]
             if isinstance(insn, MultiAssignmentBase):
@@ -1451,12 +1465,14 @@ def generate_loop_schedules(kernel, debug_args={}):
                     gen_sched = insert_barriers(kernel, gen_sched,
                             reverse=False, kind="global")
 
+                    """
                     for sched_item in gen_sched:
                         if (
                                 isinstance(sched_item, Barrier)
                                 and sched_item.kind == "global"):
                             raise LoopyError("kernel requires a global barrier %s"
                                     % sched_item.comment)
+                    """
 
                     logger.info("%s: barrier insertion: local" % kernel.name)