diff --git a/doc/algorithm.rst b/doc/algorithm.rst
index 6eccc2f86683d39bd0bf719b414e3befdef987da..3a2cc1ef85997a36df45815575e7cadb292d51ac 100644
--- a/doc/algorithm.rst
+++ b/doc/algorithm.rst
@@ -25,35 +25,11 @@ evaluate multi-stage expressions on one or several operands in a single pass.
 
 Here's a usage example::
 
-    import pyopencl as cl
-    import pyopencl.array as cl_array
-    import numpy
-
-    ctx = cl.create_some_context()
-    queue = cl.CommandQueue(ctx)
-
-    n = 10
-    a_gpu = cl_array.to_device(
-            ctx, queue, numpy.random.randn(n).astype(numpy.float32))
-    b_gpu = cl_array.to_device(
-            ctx, queue, numpy.random.randn(n).astype(numpy.float32))
-
-    from pyopencl.elementwise import ElementwiseKernel
-    lin_comb = ElementwiseKernel(ctx,
-            "float a, float *x, "
-            "float b, float *y, "
-            "float *z",
-            "z[i] = a*x[i] + b*y[i]",
-            "linear_combination")
-
-    c_gpu = cl_array.empty_like(a_gpu)
-    lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
-
-    import numpy.linalg as la
-    assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
-
-(You can find this example as :file:`examples/demo_elementwise.py` in the PyOpenCL
-distribution.)
+.. literalinclude:: ../examples/demo_elementwise.py
+
+(You can find this example as
+:download:`examples/demo_elementwise.py <../examples/demo_elementwise.py>`
+in the PyOpenCL distribution.)
 
 .. _custom-reductions:
 
diff --git a/examples/demo_elementwise.py b/examples/demo_elementwise.py
index a64616baba08f21550c88263e1a813ec2a23b6c0..21646c4f42a8cce495c02aef7beae5d4a2ceaffe 100644
--- a/examples/demo_elementwise.py
+++ b/examples/demo_elementwise.py
@@ -1,26 +1,34 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy as np
 import pyopencl as cl
-import pyopencl.array as cl_array
-import numpy
+import pyopencl.array
+from pyopencl.elementwise import ElementwiseKernel
+
+n = 10
+a_np = np.random.randn(n).astype(np.float32)
+b_np = np.random.randn(n).astype(np.float32)
 
 ctx = cl.create_some_context()
 queue = cl.CommandQueue(ctx)
 
-n = 10
-a_gpu = cl_array.to_device(
-        queue, numpy.random.randn(n).astype(numpy.float32))
-b_gpu = cl_array.to_device(
-        queue, numpy.random.randn(n).astype(numpy.float32))
+a_g = cl.array.to_device(queue, a_np)
+b_g = cl.array.to_device(queue, b_np)
 
-from pyopencl.elementwise import ElementwiseKernel
 lin_comb = ElementwiseKernel(ctx,
-        "float a, float *x, "
-        "float b, float *y, "
-        "float *z",
-        "z[i] = a*x[i] + b*y[i]",
-        "linear_combination")
+    "float k1, float *a_g, float k2, float *b_g, float *res_g",
+    "res_g[i] = k1 * a_g[i] + k2 * b_g[i]",
+    "lin_comb"
+)
+
+res_g = cl.array.empty_like(a_g)
+lin_comb(2, a_g, 3, b_g, res_g)
 
-c_gpu = cl_array.empty_like(a_gpu)
-lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
+# Check on GPU with PyOpenCL Array:
+print((res_g - (2 * a_g + 3 * b_g)).get())
 
-import numpy.linalg as la
-assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
+# Check on CPU with NumPy:
+res_np = res_g.get()
+print(res_np - (2 * a_np + 3 * b_np))
+print(np.linalg.norm(res_np - (2 * a_np + 3 * b_np)))