diff --git a/doc/source/misc.rst b/doc/source/misc.rst
index 2cf70bcac949a51e58e90e90d8a89af912fe412c..3a3d3fd5735e440d1c3314b2ea7e1c4c9f29db34 100644
--- a/doc/source/misc.rst
+++ b/doc/source/misc.rst
@@ -8,6 +8,7 @@ Acknowledgments
 ===============
 
 * James Snyder provided a patch to make PyOpenCL work on OS X 10.6.
+* Roger Pau Monné supplied the example :file:`examples/benchmark-all.py`.
 
 User-visible Changes
 ====================
diff --git a/examples/benchmark-all.py b/examples/benchmark-all.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fcb7f3981a3b52f27bcd230416f0f6b79ea74a8
--- /dev/null
+++ b/examples/benchmark-all.py
@@ -0,0 +1,87 @@
+# example provided by Roger Pau Monn'e
+
+import pyopencl as cl
+import numpy
+import numpy.linalg as la
+import datetime
+from time import time
+
+a = numpy.random.rand(1000).astype(numpy.float32)  # first input vector: 1000 random float32 values
+b = numpy.random.rand(1000).astype(numpy.float32)  # second input vector, same length and dtype
+c_result = numpy.empty_like(a)  # host reference result; compared with each device's output below
+
+# Host baseline: time the element-wise computation done in plain Python, without OpenCL.
+time1 = time()
+for i in range(1000):
+        for j in range(1000):  # repeat the same update 1000 times per element; j is not used
+                c_result[i] = a[i] + b[i]
+                c_result[i] = c_result[i] * (a[i] + b[i])
+                c_result[i] = c_result[i] * (a[i] / 2.0)
+time2 = time()
+print "Execution time of test without OpenCL: ", time2 - time1, "s"
+
+
+for platform in cl.get_platforms():
+    for device in platform.get_devices():
+        # Find the cl.device_type attribute equal to this device's type; the last match wins.
+        dev_type = "unknown"
+        for dev_type_candidate in dir(cl.device_type):
+            if getattr(cl.device_type, dev_type_candidate) == device.type:
+                dev_type = dev_type_candidate
+
+        print "==============================================================="
+        print "Platform name:", platform.name
+        print "Platform profile:", platform.profile
+        print "Platform vendor:", platform.vendor
+        print "Platform version:", platform.version
+        print "---------------------------------------------------------------"
+        print "Device name:", device.name
+        print "Device type: ", dev_type
+        print "Device memory: ", device.global_mem_size//1024//1024, 'MB'
+        print "Device max clock speed:", device.max_clock_frequency, 'MHz'
+        print "Device compute units:", device.max_compute_units
+
+        # Simple speed test: a fresh context and a profiling-enabled queue for this device.
+        ctx = cl.Context([device])
+        queue = cl.CommandQueue(ctx,
+                properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+        mf = cl.mem_flags
+        a_buf = cl.create_host_buffer(
+                ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, a)
+        b_buf = cl.create_host_buffer(
+                ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, b)
+        dest_buf = cl.create_buffer(ctx, mf.WRITE_ONLY, b.nbytes)
+
+        # The kernel mirrors the host loop: 1000 repetitions per element. The float
+        # literal (2.0f) keeps it from depending on double-precision support.
+        prg = cl.create_program_with_source(ctx, """
+            __kernel void sum(__global const float *a,
+            __global const float *b, __global float *c)
+            {
+                        int loop;
+                        int gid = get_global_id(0);
+                        for(loop=0; loop<1000;loop++)
+                        {
+                                c[gid] = a[gid] + b[gid];
+                                c[gid] = c[gid] * (a[gid] + b[gid]);
+                                c[gid] = c[gid] * (a[gid] / 2.0f);
+                        }
+                }
+                """).build()
+        c = numpy.empty_like(a)
+
+        # Time from the marker's end to the kernel's end; 1e-9 scales the result to seconds.
+        before = cl.enqueue_marker(queue)
+        after = prg.sum(queue, a.shape, a_buf, b_buf, dest_buf)
+
+        cl.enqueue_read_buffer(queue, dest_buf, c).wait()
+        elapsed = 1e-9*(after.profile.end - before.profile.end)
+
+        print "Execution time of test: %g s" % elapsed
+
+        # Compare with a tolerance: device float math need not be bit-identical to the host's.
+        if numpy.allclose(c, c_result, rtol=1e-5):
+            print "Results OK"
+        else:
+            print "Results doesn't match!!"