diff --git a/doc/index.rst b/doc/index.rst
index 1687f4c8ee912c1e53879861dc548ca1333770b5..4443e82c7281b3742368b6678ab38a6487dd8218 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -12,15 +12,17 @@ model. Here's a very simple example of how to double the entries of a vector
 using loopy:
 
 .. literalinclude:: ../examples/hello-loopy.py
+   :end-before: ENDEXAMPLE
 
-The following kernel is generated, compiled, and executed behind your back (and
-also printed at the end):
+This example is included in the :mod:`loopy` distribution as
+:download:`examples/hello-loopy.py <../examples/hello-loopy.py>`.
+
+When you run this script, the following kernel is generated, compiled, and executed:
 
 .. literalinclude:: ../examples/hello-loopy.cl
     :language: c
 
-This file is included in the :mod:`loopy` distribution as
-:file:`examples/hello-loopy.py`.
+(See the full example file linked above for how to print the generated code.)
 
 .. toctree::
     :maxdepth: 2
diff --git a/examples/hello-loopy.py b/examples/hello-loopy.py
index a835005ab53fe9c15323eff5b4624d3b5b023125..2c8ff4c3bbdfe331596eec1c0dcb49f6bd8b6e46 100644
--- a/examples/hello-loopy.py
+++ b/examples/hello-loopy.py
@@ -3,31 +3,28 @@ import loopy as lp
 import pyopencl as cl
 import pyopencl.array
 
-# -----------------------------------------------------------------------------
 # setup
-# -----------------------------------------------------------------------------
+# -----
 ctx = cl.create_some_context()
 queue = cl.CommandQueue(ctx)
 
 n = 15 * 10**6
 a = cl.array.arange(queue, n, dtype=np.float32)
 
-# -----------------------------------------------------------------------------
-# generation (loopy bits start here)
-# -----------------------------------------------------------------------------
+# create
+# ------
 knl = lp.make_kernel(ctx.devices[0],
         "{ [i]: 0<=i<n }",
         "out[i] = 2*a[i]")
 
-# -----------------------------------------------------------------------------
-# transformation
-# -----------------------------------------------------------------------------
+# transform
+# ---------
 knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
 
-# -----------------------------------------------------------------------------
-# execution
-# -----------------------------------------------------------------------------
-cknl = lp.CompiledKernel(ctx, knl)
-evt, (out,) = cknl(queue, a=a, n=n)
+# execute
+# -------
+evt, (out,) = knl(queue, a=a, n=n)
+# ENDEXAMPLE
 
+cknl = lp.CompiledKernel(ctx, knl)  # explicit wrapper, used below to print the code
 print cknl.get_highlighted_code({"a": np.float32})
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 2cd1bce37eccfb02468896aeb8deed397f31974a..f84feca880cac553ee9e6ad6471cd964de0bf48c 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -844,6 +844,19 @@ class LoopKernel(Record):
 
     # }}}
 
+    # {{{ direct execution
+
+    @memoize_method  # NOTE(review): presumably caches per (self, ctx) -- confirm
+    def get_compiled_kernel(self, ctx):  # returns CompiledKernel(ctx, self)
+        from loopy.compiled import CompiledKernel
+        return CompiledKernel(ctx, self)
+
+    def __call__(self, queue, **kwargs):  # execute via get_compiled_kernel()
+        return self.get_compiled_kernel(queue.context)(  # context taken from queue
+                queue, **kwargs)  # keyword arguments are forwarded unchanged
+
+    # }}}
+
 # }}}
 
 # vim: foldmethod=marker