diff --git a/doc/algorithm.rst b/doc/algorithm.rst
index 6eccc2f86683d39bd0bf719b414e3befdef987da..3a2cc1ef85997a36df45815575e7cadb292d51ac 100644
--- a/doc/algorithm.rst
+++ b/doc/algorithm.rst
@@ -25,35 +25,11 @@ evaluate multi-stage expressions on one or several operands in a single pass.
 
-Here's a usage example::
+Here's a usage example:
 
-    import pyopencl as cl
-    import pyopencl.array as cl_array
-    import numpy
-
-    ctx = cl.create_some_context()
-    queue = cl.CommandQueue(ctx)
-
-    n = 10
-    a_gpu = cl_array.to_device(
-            ctx, queue, numpy.random.randn(n).astype(numpy.float32))
-    b_gpu = cl_array.to_device(
-            ctx, queue, numpy.random.randn(n).astype(numpy.float32))
-
-    from pyopencl.elementwise import ElementwiseKernel
-    lin_comb = ElementwiseKernel(ctx,
-            "float a, float *x, "
-            "float b, float *y, "
-            "float *z",
-            "z[i] = a*x[i] + b*y[i]",
-            "linear_combination")
-
-    c_gpu = cl_array.empty_like(a_gpu)
-    lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
-
-    import numpy.linalg as la
-    assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
-
-(You can find this example as :file:`examples/demo_elementwise.py` in the PyOpenCL
-distribution.)
+.. literalinclude:: ../examples/demo_elementwise.py
+
+(You can find this example as
+:download:`examples/demo_elementwise.py <../examples/demo_elementwise.py>`
+in the PyOpenCL distribution.)
 
 .. _custom-reductions:
 
diff --git a/examples/demo.py b/examples/demo.py
index ba948d6716b84c338f3a28b64d0b3e6c9425a1bc..1b694a88062aa101ca80c72df5676ede9c474f1c 100644
--- a/examples/demo.py
+++ b/examples/demo.py
@@ -1,30 +1,38 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy as np
 import pyopencl as cl
-import numpy
-import numpy.linalg as la
 
-a = numpy.random.rand(50000).astype(numpy.float32)
-b = numpy.random.rand(50000).astype(numpy.float32)
+a_np = np.random.rand(50000).astype(np.float32)
+b_np = np.random.rand(50000).astype(np.float32)
 
 ctx = cl.create_some_context()
 queue = cl.CommandQueue(ctx)
 
 mf = cl.mem_flags
-a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
-b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
-dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
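+# Allocate read-only device buffers initialized with the host arrays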
+a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
+b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
 
 prg = cl.Program(ctx, """
-    __kernel void sum(__global const float *a,
-    __global const float *b, __global float *c)
-    {
-      int gid = get_global_id(0);
-      c[gid] = a[gid] + b[gid];
-    }
-    """).build()
+__kernel void sum(
+    __global const float *a_g, __global const float *b_g, __global float *res_g)
+{
+  int gid = get_global_id(0);
+  res_g[gid] = a_g[gid] + b_g[gid];
+}
+""").build()
 
-prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)
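+# Allocate the output buffer, then launch one work-item per element;
+# passing None for the work-group size lets the implementation choose it.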
+res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
+prg.sum(queue, a_np.shape, None, a_g, b_g, res_g)
 
-a_plus_b = numpy.empty_like(a)
-cl.enqueue_copy(queue, a_plus_b, dest_buf)
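+# Copy the result back to the host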
+res_np = np.empty_like(a_np)
+cl.enqueue_copy(queue, res_np, res_g)
 
-print(la.norm(a_plus_b - (a+b)), la.norm(a_plus_b))
+# Check on CPU with NumPy:
+print(res_np - (a_np + b_np))
+print(np.linalg.norm(res_np - (a_np + b_np)))
diff --git a/examples/demo_elementwise.py b/examples/demo_elementwise.py
index a64616baba08f21550c88263e1a813ec2a23b6c0..21646c4f42a8cce495c02aef7beae5d4a2ceaffe 100644
--- a/examples/demo_elementwise.py
+++ b/examples/demo_elementwise.py
@@ -1,26 +1,38 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy as np
 import pyopencl as cl
-import pyopencl.array as cl_array
-import numpy
+import pyopencl.array
+from pyopencl.elementwise import ElementwiseKernel
+
+n = 10
+a_np = np.random.randn(n).astype(np.float32)
+b_np = np.random.randn(n).astype(np.float32)
 
 ctx = cl.create_some_context()
 queue = cl.CommandQueue(ctx)
 
-n = 10
-a_gpu = cl_array.to_device(
-        queue, numpy.random.randn(n).astype(numpy.float32))
-b_gpu = cl_array.to_device(
-        queue, numpy.random.randn(n).astype(numpy.float32))
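+# Transfer the host arrays to the device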
+a_g = cl.array.to_device(queue, a_np)
+b_g = cl.array.to_device(queue, b_np)
 
-from pyopencl.elementwise import ElementwiseKernel
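+# An element-wise kernel is built from an argument list, a per-element
+# expression (indexed by i), and a kernel name.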
 lin_comb = ElementwiseKernel(ctx,
-        "float a, float *x, "
-        "float b, float *y, "
-        "float *z",
-        "z[i] = a*x[i] + b*y[i]",
-        "linear_combination")
+    "float k1, float *a_g, float k2, float *b_g, float *res_g",
+    "res_g[i] = k1 * a_g[i] + k2 * b_g[i]",
+    "lin_comb"
+)
+
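+# Allocate an output array and evaluate res = 2*a + 3*b in a single kernel call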
+res_g = cl.array.empty_like(a_g)
+lin_comb(2, a_g, 3, b_g, res_g)
 
-c_gpu = cl_array.empty_like(a_gpu)
-lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
+# Check on GPU with PyOpenCL Array:
+print((res_g - (2 * a_g + 3 * b_g)).get())
 
-import numpy.linalg as la
-assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
+# Check on CPU with NumPy:
+res_np = res_g.get()
+print(res_np - (2 * a_np + 3 * b_np))
+print(np.linalg.norm(res_np - (2 * a_np + 3 * b_np)))