diff --git a/README.rst b/README.rst
index 33e52774ebf6512f7d179bc3475857addb5e93aa..2f457c62a17cbac443fb4f15452529a31f4fde57 100644
--- a/README.rst
+++ b/README.rst
@@ -40,5 +40,5 @@ Places on the web related to PyOpenCL:
       :target: http://pypi.python.org/pypi/pyopencl
 * `C. Gohlke's Windows binaries <http://www.lfd.uci.edu/~gohlke/pythonlibs/#pyopencl>`_ (download Windows binaries)
 * `Github <http://github.com/pyopencl/pyopencl>`_ (get latest source code, file bugs)
-* `Documentation <http://documen.tician.de>`_ (read how things work)
+* `Documentation <http://documen.tician.de/pyopencl>`_ (read how things work)
 * `Wiki <http://wiki.tiker.net/PyOpenCL>`_ (read installation tips, get examples, read FAQ)
diff --git a/doc/algorithm.rst b/doc/algorithm.rst
index 6eccc2f86683d39bd0bf719b414e3befdef987da..3a2cc1ef85997a36df45815575e7cadb292d51ac 100644
--- a/doc/algorithm.rst
+++ b/doc/algorithm.rst
@@ -25,35 +25,11 @@ evaluate multi-stage expressions on one or several operands in a single pass.
 
 Here's a usage example::
 
-    import pyopencl as cl
-    import pyopencl.array as cl_array
-    import numpy
-
-    ctx = cl.create_some_context()
-    queue = cl.CommandQueue(ctx)
-
-    n = 10
-    a_gpu = cl_array.to_device(
-            ctx, queue, numpy.random.randn(n).astype(numpy.float32))
-    b_gpu = cl_array.to_device(
-            ctx, queue, numpy.random.randn(n).astype(numpy.float32))
-
-    from pyopencl.elementwise import ElementwiseKernel
-    lin_comb = ElementwiseKernel(ctx,
-            "float a, float *x, "
-            "float b, float *y, "
-            "float *z",
-            "z[i] = a*x[i] + b*y[i]",
-            "linear_combination")
-
-    c_gpu = cl_array.empty_like(a_gpu)
-    lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
-
-    import numpy.linalg as la
-    assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
-
-(You can find this example as :file:`examples/demo_elementwise.py` in the PyOpenCL
-distribution.)
+.. literalinclude:: ../examples/demo_elementwise.py
+
+(You can find this example as
+:download:`examples/demo_elementwise.py <../examples/demo_elementwise.py>`
+in the PyOpenCL distribution.)
 
 .. _custom-reductions:
 
diff --git a/doc/misc.rst b/doc/misc.rst
index f1fc8cc83cd29a508271066187526b59ca5aa7cb..3fda67b1e9afa7811be889e072b3ec17665e6e4f 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -31,7 +31,7 @@ PyOpenCL comes with IPython integration, which lets you seamlessly integrate
 PyOpenCL kernels into your IPython notebooks. Simply load the PyOpenCL 
 IPython extension using::
 
-    %load_ext pyopencl.ipython
+    %load_ext pyopencl.ipython_ext
 
 and then use the ``%%cl_kernel`` 'cell-magic' command. See `this notebook
 <http://nbviewer.ipython.org/urls/raw.githubusercontent.com/pyopencl/pyopencl/master/examples/ipython-demo.ipynb>`_
diff --git a/examples/demo.py b/examples/demo.py
index ba948d6716b84c338f3a28b64d0b3e6c9425a1bc..1b694a88062aa101ca80c72df5676ede9c474f1c 100644
--- a/examples/demo.py
+++ b/examples/demo.py
@@ -1,30 +1,32 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy as np
 import pyopencl as cl
-import numpy
-import numpy.linalg as la
 
-a = numpy.random.rand(50000).astype(numpy.float32)
-b = numpy.random.rand(50000).astype(numpy.float32)
+a_np = np.random.rand(50000).astype(np.float32)
+b_np = np.random.rand(50000).astype(np.float32)
 
 ctx = cl.create_some_context()
 queue = cl.CommandQueue(ctx)
 
 mf = cl.mem_flags
-a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
-b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
-dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
+a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
+b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
 
 prg = cl.Program(ctx, """
-    __kernel void sum(__global const float *a,
-    __global const float *b, __global float *c)
-    {
-      int gid = get_global_id(0);
-      c[gid] = a[gid] + b[gid];
-    }
-    """).build()
+__kernel void sum(__global const float *a_g, __global const float *b_g, __global float *res_g) {
+  int gid = get_global_id(0);
+  res_g[gid] = a_g[gid] + b_g[gid];
+}
+""").build()
 
-prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)
+res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
+prg.sum(queue, a_np.shape, None, a_g, b_g, res_g)
 
-a_plus_b = numpy.empty_like(a)
-cl.enqueue_copy(queue, a_plus_b, dest_buf)
+res_np = np.empty_like(a_np)
+cl.enqueue_copy(queue, res_np, res_g)
 
-print(la.norm(a_plus_b - (a+b)), la.norm(a_plus_b))
+# Check on CPU with NumPy:
+print(res_np - (a_np + b_np))
+print(np.linalg.norm(res_np - (a_np + b_np)))
diff --git a/examples/demo_elementwise.py b/examples/demo_elementwise.py
index a64616baba08f21550c88263e1a813ec2a23b6c0..21646c4f42a8cce495c02aef7beae5d4a2ceaffe 100644
--- a/examples/demo_elementwise.py
+++ b/examples/demo_elementwise.py
@@ -1,26 +1,34 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy as np
 import pyopencl as cl
-import pyopencl.array as cl_array
-import numpy
+import pyopencl.array
+from pyopencl.elementwise import ElementwiseKernel
+
+n = 10
+a_np = np.random.randn(n).astype(np.float32)
+b_np = np.random.randn(n).astype(np.float32)
 
 ctx = cl.create_some_context()
 queue = cl.CommandQueue(ctx)
 
-n = 10
-a_gpu = cl_array.to_device(
-        queue, numpy.random.randn(n).astype(numpy.float32))
-b_gpu = cl_array.to_device(
-        queue, numpy.random.randn(n).astype(numpy.float32))
+a_g = cl.array.to_device(queue, a_np)
+b_g = cl.array.to_device(queue, b_np)
 
-from pyopencl.elementwise import ElementwiseKernel
 lin_comb = ElementwiseKernel(ctx,
-        "float a, float *x, "
-        "float b, float *y, "
-        "float *z",
-        "z[i] = a*x[i] + b*y[i]",
-        "linear_combination")
+    "float k1, float *a_g, float k2, float *b_g, float *res_g",
+    "res_g[i] = k1 * a_g[i] + k2 * b_g[i]",
+    "lin_comb"
+)
+
+res_g = cl.array.empty_like(a_g)
+lin_comb(2, a_g, 3, b_g, res_g)
 
-c_gpu = cl_array.empty_like(a_gpu)
-lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
+# Check on GPU with PyOpenCL Array:
+print((res_g - (2 * a_g + 3 * b_g)).get())
 
-import numpy.linalg as la
-assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
+# Check on CPU with NumPy:
+res_np = res_g.get()
+print(res_np - (2 * a_np + 3 * b_np))
+print(np.linalg.norm(res_np - (2 * a_np + 3 * b_np)))
diff --git a/examples/ipython-demo.ipynb b/examples/ipython-demo.ipynb
index 1635dcc31c9b3a229328f4fede2f913f6d9a1c1e..b0e8159c4d1f3579e79842dd96f0918350063cf6 100644
--- a/examples/ipython-demo.ipynb
+++ b/examples/ipython-demo.ipynb
@@ -1,7 +1,7 @@
 {
  "metadata": {
   "name": "",
-  "signature": "sha256:85c637b863a4bbbd3fb91eca8682d36d9874a53a6db35b18f1c53bb53b3c6bdc"
+  "signature": "sha256:81f3deed7cdc26b0fc756b3ee1eb6e8f9b1be96304ddfc6ff484d223c2b8a942"
  },
  "nbformat": 3,
  "nbformat_minor": 0,
@@ -19,8 +19,17 @@
      ],
      "language": "python",
      "metadata": {},
-     "outputs": [],
-     "prompt_number": 2
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stderr",
+       "text": [
+        "/usr/lib/python2.7/pkgutil.py:186: ImportWarning: Not importing directory '/usr/lib/python2.7/dist-packages/enthought': missing __init__.py\n",
+        "  file, filename, etc = imp.find_module(subname, path)\n"
+       ]
+      }
+     ],
+     "prompt_number": 1
     },
     {
      "cell_type": "markdown",
@@ -33,7 +42,7 @@
      "cell_type": "code",
      "collapsed": false,
      "input": [
-      "%load_ext pyopencl.ipython"
+      "%load_ext pyopencl.ipython_ext"
      ],
      "language": "python",
      "metadata": {},
@@ -62,8 +71,8 @@
        "stream": "stdout",
        "text": [
         "Choose platform:\n",
-        "[0] <pyopencl.Platform 'AMD Accelerated Parallel Processing' at 0x7f244be8e500>\n",
-        "[1] <pyopencl.Platform 'Intel(R) OpenCL' at 0x3adcef0>\n"
+        "[0] <pyopencl.Platform 'AMD Accelerated Parallel Processing' at 0x7fc14f1b0080>\n",
+        "[1] <pyopencl.Platform 'Intel(R) OpenCL' at 0x32aed00>\n"
        ]
       },
       {
@@ -162,7 +171,7 @@
        "output_type": "pyout",
        "prompt_number": 8,
        "text": [
-        "<pyopencl._cl.Event at 0x39dac20>"
+        "<pyopencl._cl.Event at 0x7fc14f3fdf30>"
        ]
       }
      ],
diff --git a/pyopencl/ipython.py b/pyopencl/ipython_ext.py
similarity index 100%
rename from pyopencl/ipython.py
rename to pyopencl/ipython_ext.py
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index b85708d8c15809f0a0a86a74d0be82b3997ab5f2..cd784df84da786b7d8a4982c41df299061ea7e47 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -404,18 +404,20 @@ def get_arg_offset_adjuster_code(arg_types):
 def get_gl_sharing_context_properties():
     ctx_props = cl.context_properties
 
-    from OpenGL import platform as gl_platform, GLX, WGL
+    from OpenGL import platform as gl_platform
 
     props = []
 
     import sys
     if sys.platform in ["linux", "linux2"]:
+        from OpenGL import GLX
         props.append(
             (ctx_props.GL_CONTEXT_KHR, gl_platform.GetCurrentContext()))
         props.append(
                 (ctx_props.GLX_DISPLAY_KHR,
                     GLX.glXGetCurrentDisplay()))
     elif sys.platform == "win32":
+        from OpenGL import WGL
         props.append(
             (ctx_props.GL_CONTEXT_KHR, gl_platform.GetCurrentContext()))
         props.append(
diff --git a/pyopencl/version.py b/pyopencl/version.py
index cce4332fa693dc66686f9a61f4d85bb1a2692d2e..2d917d29fc041929fe4c94362fd212f13cafc685 100644
--- a/pyopencl/version.py
+++ b/pyopencl/version.py
@@ -1,4 +1,3 @@
-VERSION = (2013, 3)
+VERSION = (2014, 1)
 VERSION_STATUS = ""
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
-
diff --git a/test/test_clmath.py b/test/test_clmath.py
index 6ebbe46bcb1f49ed70e1797c003013fe43d79147..586e9e075275aadb11c3067b3127f8c955afb625 100644
--- a/test/test_clmath.py
+++ b/test/test_clmath.py
@@ -132,7 +132,7 @@ def test_atan2(ctx_factory):
     queue = cl.CommandQueue(context)
 
     for s in sizes:
-        a = (cl_array.arange(queue, s, dtype=np.float32) - s / 2) / 100
+        a = (cl_array.arange(queue, s, dtype=np.float32) - np.float32(s / 2)) / 100
         a2 = (s / 2 - 1 - cl_array.arange(queue, s, dtype=np.float32)) / 100
         b = clmath.atan2(a, a2)
 
@@ -149,7 +149,7 @@ def test_atan2pi(ctx_factory):
     queue = cl.CommandQueue(context)
 
     for s in sizes:
-        a = (cl_array.arange(queue, s, dtype=np.float32) - s / 2) / 100
+        a = (cl_array.arange(queue, s, dtype=np.float32) - np.float32(s / 2)) / 100
         a2 = (s / 2 - 1 - cl_array.arange(queue, s, dtype=np.float32)) / 100
         b = clmath.atan2pi(a, a2)