diff --git a/examples/demo_meta_codepy.py b/examples/demo_meta_codepy.py
new file mode 100644
index 0000000000000000000000000000000000000000..b61e13348b867e5330fa79847dbc9e29eff3d508
--- /dev/null
+++ b/examples/demo_meta_codepy.py
@@ -0,0 +1,67 @@
+"""Generate an OpenCL vector-add kernel with codepy's cgen and run it.
+
+The kernel source is assembled from a syntax tree rather than a string.
+Each work-item adds ``block_size`` elements, spaced ``local_size`` apart,
+inside the contiguous chunk of the arrays owned by its work-group.
+"""
+
+import pyopencl as cl
+import numpy
+import numpy.linalg as la
+
+block_size = 16         # elements handled by each work-item
+local_size = 32         # work-items per work-group
+macroblock_count = 33   # number of work-groups
+dtype = numpy.float32
+total_size = block_size*local_size*macroblock_count
+
+ctx = cl.Context()
+queue = cl.CommandQueue(ctx)
+
+a = numpy.random.randn(total_size).astype(dtype)
+b = numpy.random.randn(total_size).astype(dtype)
+
+mf = cl.mem_flags
+a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
+b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
+c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
+
+from codepy.cgen import FunctionBody, \
+        FunctionDeclaration, Typedef, POD, Value, \
+        Pointer, Module, Block, Initializer, Assign, Const
+from codepy.cgen.opencl import CLKernel, CLGlobal, \
+        CLRequiredWorkGroupSize
+
+mod = Module([
+    FunctionBody(
+        CLKernel(CLRequiredWorkGroupSize((local_size,),
+            FunctionDeclaration(
+            Value("void", "add"),
+            arg_decls=[CLGlobal(Pointer(Const(POD(dtype, name))))
+                for name in ["tgt", "op1", "op2"]]))),
+        Block([
+            # Start of this work-item's elements: each work-group owns a
+            # contiguous run of block_size*local_size elements.
+            Initializer(
+                POD(numpy.int32, "idx"),
+                "get_local_id(0) + %d * get_group_id(0)"
+                % (block_size*local_size))
+            ]+[
+            Assign(
+                "tgt[idx+%d]" % (o*local_size),
+                "op1[idx+%d] + op2[idx+%d]" % (
+                    o*local_size,
+                    o*local_size))
+            for o in range(block_size)]))])
+
+# Program needs the context and the generated source as a string.
+knl = cl.Program(ctx, str(mod)).build().add
+
+# One work-group of local_size work-items per macroblock.
+knl(queue, (local_size*macroblock_count,), (local_size,),
+        c_buf, a_buf, b_buf)
+
+c = numpy.empty_like(a)
+cl.enqueue_copy(queue, c, c_buf).wait()
+
+assert la.norm(c-(a+b)) == 0
diff --git a/examples/demo_meta_template.py b/examples/demo_meta_template.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bc21c180debf1830026d23c69085aa900275a6d
--- /dev/null
+++ b/examples/demo_meta_template.py
@@ -0,0 +1,64 @@
+"""Render an OpenCL vector-add kernel from a Jinja2 template and run it.
+
+Each work-item adds ``block_size`` elements, spaced ``local_size`` apart,
+inside the contiguous chunk of the arrays owned by its work-group.
+"""
+
+import pyopencl as cl
+import numpy
+import numpy.linalg as la
+
+block_size = 16         # elements handled by each work-item
+local_size = 32         # work-items per work-group
+macroblock_count = 33   # number of work-groups
+dtype = numpy.float32
+total_size = block_size*local_size*macroblock_count
+
+ctx = cl.Context()
+queue = cl.CommandQueue(ctx)
+
+a = numpy.random.randn(total_size).astype(dtype)
+b = numpy.random.randn(total_size).astype(dtype)
+
+mf = cl.mem_flags
+a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
+b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
+c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
+
+from jinja2 import Template
+
+# In the kernel, idx is the first element of this work-item; each
+# work-group owns a contiguous run of block_size*local_size elements.
+tpl = Template("""
+    __kernel void add(
+            __global {{ type_name }} *tgt,
+            __global const {{ type_name }} *op1,
+            __global const {{ type_name }} *op2)
+    {
+      int idx = get_local_id(0)
+        + {{ block_size * local_size }} * get_group_id(0);
+
+      {% for i in range(block_size) %}
+          {% set offset = i*local_size %}
+          tgt[idx + {{ offset }}] =
+            op1[idx + {{ offset }}]
+            + op2[idx + {{ offset }}];
+      {% endfor %}
+    }""")
+
+# type_name must name the OpenCL type matching dtype (float32 -> float).
+rendered_tpl = tpl.render(
+    type_name="float", block_size=block_size,
+    local_size=local_size)
+
+# Program needs the context in addition to the kernel source.
+knl = cl.Program(ctx, rendered_tpl).build().add
+
+# One work-group of local_size work-items per macroblock.
+knl(queue, (local_size*macroblock_count,), (local_size,),
+        c_buf, a_buf, b_buf)
+
+c = numpy.empty_like(a)
+cl.enqueue_copy(queue, c, c_buf).wait()
+
+assert la.norm(c-(a+b)) == 0