diff --git a/doc/source/array.rst b/doc/source/array.rst
index 2d18141b48479e126739ba0cc5cc1935bbe9c3fa..6c3272f0454a8847fc338c464cdfffa6fe387ead 100644
--- a/doc/source/array.rst
+++ b/doc/source/array.rst
@@ -108,247 +108,75 @@ Under the hood, the complex types are simply `float2` and `double2`.
 The :class:`Array` Class
-.. class:: Array(cqa, shape, dtype, order="C", *, allocator=None, base=None, data=None)
-    A :class:`numpy.ndarray` work-alike that stores its data and performs its
-    computations on the compute device.  *shape* and *dtype* work exactly as in
-    :mod:`numpy`.  Arithmetic methods in :class:`Array` support the
-    broadcasting of scalars. (e.g. `array+5`)
-    *cqa* must be a :class:`pyopencl.CommandQueue`. *cqa*
-    specifies the queue in which the array carries out its
-    computations by default. *cqa* will at some point be renamed *queue*,
-    so it should be considered 'positional-only'.
-    *allocator* may be `None` or a callable that, upon being called with an
-    argument of the number of bytes to be allocated, returns an
-    :class:`pyopencl.Buffer` object. (A :class:`pyopencl.tools.MemoryPool`
-    instance is one useful example of an object to pass here.)
-    .. versionchanged:: 2011.1
-        Renamed *context* to *cqa*, made it general-purpose.
-        All arguments beyond *order* should be considered keyword-only.
-    .. attribute :: data
-        The :class:`pyopencl.MemoryObject` instance created for the memory that backs
-        this :class:`Array`.
-    .. attribute :: shape
-        The tuple of lengths of each dimension in the array.
-    .. attribute :: dtype
-        The :class:`numpy.dtype` of the items in the GPU array.
-    .. attribute :: size
-        The number of meaningful entries in the array. Can also be computed by
-        multiplying up the numbers in :attr:`shape`.
-    .. attribute :: nbytes
-        The size of the entire array in bytes. Computed as :attr:`size` times
-        ``dtype.itemsize``.
-    .. attribute :: strides
-        Tuple of bytes to step in each dimension when traversing an array.
-    .. attribute :: flags
-        Return an object with attributes `c_contiguous`, `f_contiguous` and `forc`,
-        which may be used to query contiguity properties in analogy to
-        :attr:`numpy.ndarray.flags`.
-    .. method :: __len__()
-        Returns the size of the leading dimension of *self*.
-    .. method :: reshape(shape)
-        Returns an array containing the same data with a new shape.
-    .. method :: ravel()
-        Returns flattened array containing the same data.
-    .. method :: view(dtype=None)
-        Returns view of array with the same data. If *dtype* is different from
-        current dtype, the actual bytes of memory will be reinterpreted.
-    .. method :: set(ary, queue=None, async=False)
-        Transfer the contents the :class:`numpy.ndarray` object *ary*
-        onto the device.
-        *ary* must have the same dtype and size (not necessarily shape) as *self*.
-    .. method :: get(queue=None, ary=None, async=False)
-        Transfer the contents of *self* into *ary* or a newly allocated
-        :mod:`numpy.ndarray`. If *ary* is given, it must have the right
-        size (not necessarily shape) and dtype.
-    .. method :: copy(queue=None)
-        .. versionadded:: 2013.1
-    .. method :: __str__()
-    .. method :: __repr__()
-    .. method :: mul_add(self, selffac, other, otherfac, queue=None):
-        Return `selffac*self + otherfac*other`.
-    .. method :: __add__(other)
-    .. method :: __sub__(other)
-    .. method :: __iadd__(other)
-    .. method :: __isub__(other)
-    .. method :: __neg__(other)
-    .. method :: __mul__(other)
-    .. method :: __div__(other)
-    .. method :: __rdiv__(other)
-    .. method :: __pow__(other)
-    .. method :: __abs__()
-        Return a :class:`Array` containing the absolute value of each
-        element of *self*.
+.. autoclass:: Array
+    .. automethod :: __len__
+    .. automethod :: reshape
+    .. automethod :: ravel
+    .. automethod :: view
+    .. automethod :: set
+    .. automethod :: get
+    .. automethod :: copy
+    .. automethod :: __str__
+    .. automethod :: __repr__
+    .. automethod :: mul_add
+    .. automethod :: __add__
+    .. automethod :: __sub__
+    .. automethod :: __iadd__
+    .. automethod :: __isub__
+    .. automethod :: __neg__
+    .. automethod :: __mul__
+    .. automethod :: __div__
+    .. automethod :: __rdiv__
+    .. automethod :: __pow__
+    .. automethod :: __abs__
     .. UNDOC reverse()
-    .. method :: fill(scalar, queue=None)
-        Fill the array with *scalar*.
+    .. automethod :: fill
-    .. method :: astype(dtype, queue=None)
+    .. automethod :: astype
-        Return *self*, cast to *dtype*.
-    .. attribute :: real
-        .. versionadded:: 2012.1
-    .. attribute :: imag
-        .. versionadded:: 2012.1
-    .. method :: conj()
-        .. versionadded:: 2012.1
+    .. autoattribute :: real
+    .. autoattribute :: imag
+    .. automethod :: conj
 Constructing :class:`Array` Instances
-.. function:: to_device(queue, ary, allocator=None, async=False)
-    Return a :class:`Array` that is an exact copy of the :class:`numpy.ndarray`
-    instance *ary*.
-    See :class:`Array` for the meaning of *allocator*.
-    .. versionchanged:: 2011.1
-        *context* argument was deprecated.
+.. autofunction:: to_device
 .. function:: empty(queue, shape, dtype, order="C", allocator=None, data=None)
     A synonym for the :class:`Array` constructor.
-.. function:: zeros(queue, shape, dtype, order="C", allocator=None)
-    Same as :func:`empty`, but the :class:`Array` is zero-initialized before
-    being returned.
-    .. versionchanged:: 2011.1
-        *context* argument was deprecated.
-.. function:: empty_like(other_ary)
-    Make a new, uninitialized :class:`Array` having the same properties
-    as *other_ary*.
-.. function:: zeros_like(other_ary)
-    Make a new, zero-initialized :class:`Array` having the same properties
-    as *other_ary*.
-.. function:: arange(queue, start, stop, step, dtype=None, allocator=None)
-    Create a :class:`Array` filled with numbers spaced `step` apart,
-    starting from `start` and ending at `stop`.
-    For floating point arguments, the length of the result is
-    `ceil((stop - start)/step)`.  This rule may result in the last
-    element of the result being greater than `stop`.
-    *dtype*, if not specified, is taken as the largest common type
-    of *start*, *stop* and *step*.
-    .. versionchanged:: 2011.1
-        *context* argument was deprecated.
-    .. versionchanged:: 2011.2
-        *allocator* keyword argument was added.
-.. function:: take(a, indices, out=None, queue=None)
-    Return the :class:`Array` ``[a[indices[0]], ..., a[indices[n]]]``.
-    For the moment, *a* must be a type that can be bound to a texture.
+.. autofunction:: zeros
+.. autofunction:: empty_like
+.. autofunction:: zeros_like
+.. autofunction:: arange
+.. autofunction:: take
-.. function:: if_positive(criterion, then_, else_, out=None, queue=None)
-    Return an array like *then_*, which, for the element at index *i*,
-    contains *then_[i]* if *criterion[i]>0*, else *else_[i]*.
-.. function:: maximum(a, b, out=None, queue=None)
-    Return the elementwise maximum of *a* and *b*.
-.. function:: minimum(a, b, out=None, queue=None)
-    Return the elementwise minimum of *a* and *b*.
+.. autofunction:: if_positive
+.. autofunction:: maximum
+.. autofunction:: minimum
 .. _reductions:
-.. function:: sum(a, dtype=None, queue=None)
-    .. versionadded: 2011.1
-.. function:: dot(a, b, dtype=None, queue=None)
-    .. versionadded: 2011.1
-.. function:: subset_dot(subset, a, b, dtype=None, queue=None)
-    .. versionadded: 2011.1
-.. function:: max(a, queue=None)
-    .. versionadded: 2011.1
-.. function:: min(a, queue=None)
-    .. versionadded: 2011.1
-.. function:: subset_max(subset, a, queue=None)
-    .. versionadded: 2011.1
-.. function:: subset_min(subset, a, queue=None)
-    .. versionadded: 2011.1
+.. autofunction:: sum
+.. autofunction:: dot
+.. autofunction:: subset_dot
+.. autofunction:: max
+.. autofunction:: min
+.. autofunction:: subset_max
+.. autofunction:: subset_min
 See also :ref:`custom-reductions`.
@@ -464,93 +292,18 @@ functions available in the OpenCL standard. (See table 6.8 in the spec.)
 Generating Arrays of Random Numbers
-.. module:: pyopencl.clrandom
-.. class:: RanluxGenerator(self, queue, num_work_items=None,  luxury=2, seed=None, max_work_items=None)
-    :param queue: :class:`pyopencl.CommandQueue`, only used for initialization
-    :param luxury: the "luxury value" of the generator, and should be 0-4, where 0 is fastest
-        and 4 produces the best numbers. It can also be >=24, in which case it directly
-        sets the p-value of RANLUXCL.
-    :param num_work_items: is the number of generators to initialize, usually corresponding
-        to the number of work-items in the NDRange RANLUXCL will be used with.
-        May be `None`, in which case a default value is used.
-    :param max_work_items: should reflect the maximum number of work-items that will be used
-        on any parallel instance of RANLUXCL. So for instance if we are launching 5120
-        work-items on GPU1 and 10240 work-items on GPU2, GPU1's RANLUXCLTab would be
-        generated by calling ranluxcl_intialization with numWorkitems = 5120 while
-        GPU2's RANLUXCLTab would use numWorkitems = 10240. However maxWorkitems must
-        be at least 10240 for both GPU1 and GPU2, and it must be set to the same value
-        for both. (may be `None`)
-    .. versionadded:: 2011.2
-    .. versionchanged:: 2013.1
-        Added default value for `num_work_items`.
-    .. attribute:: state
-        A :class:`pyopencl.array.Array` containing the state of the generator.
-    .. attribute:: nskip
-        nskip is an integer which can (optionally) be defined in the kernel code
-        as RANLUXCL_NSKIP. If this is done the generator will be faster for luxury setting
-        0 and 1, or when the p-value is manually set to a multiple of 24.
-    .. method:: fill_uniform(ary, a=0, b=1, queue=None)
-        Fill *ary* with uniformly distributed random numbers in the interval
-        *(a, b)*, endpoints excluded.
-    .. method:: uniform(queue, shape, dtype, order="C", allocator=None, base=None, data=None, a=0, b=1)
-        Make a new empty array, apply :meth:`fill_uniform` to it.
-    .. method:: fill_normal(ary, mu=0, sigma=1, queue=None):
-        Fill *ary* with normally distributed numbers with mean *mu* and
-        standard deviation *sigma*.
-    .. method:: normal(queue, shape, dtype, order="C", allocator=None, base=None, data=None, mu=0, sigma=1)
-        Make a new empty array, apply :meth:`fill_normal` to it.
-    .. method:: synchronize()
-        The generator gets inefficient when different work items invoke
-        the generator a differing number of times. This function
-        ensures efficiency.
-.. function:: rand(queue, shape, dtype, a=0, b=1)
-    Return an array of `shape` filled with random values of `dtype`
-    in the range [a,b).
-.. function:: fill_rand(result, queue=None, a=0, b=1)
-    Fill *result* with random values of `dtype` in the range [0,1).
-PyOpenCL now includes and uses the `RANLUXCL random number generator
-<https://bitbucket.org/ivarun/ranluxcl/>`_ by Ivar Ursin Nikolaisen.  In
-addition to being usable through the convenience functions above, it is
-available in any piece of code compiled through PyOpenCL by::
-    #include <pyopencl-ranluxcl.cl>
-See the `source <https://github.com/inducer/pyopencl/blob/master/src/cl/pyopencl-ranluxcl.cl>`_
-for some documentation if you're planning on using RANLUXCL directly.
+.. automodule:: pyopencl.clrandom
-The RANLUX generator is described in the following two articles. If you use the
-generator for scientific purposes, please consider citing them:
+    .. autoclass:: RanluxGenerator
-* Martin Lüscher, A portable high-quality random number generator for lattice
-  field theory simulations, `Computer Physics Communications 79 (1994) 100-110
-  <http://dx.doi.org/10.1016/0010-4655(94)90232-1>`_
+        .. automethod:: fill_uniform
+        .. automethod:: uniform
+        .. automethod:: fill_normal
+        .. automethod:: normal
+        .. automethod:: synchronize
-* F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom
-  number generator of Lüscher, `Computer Physics Communications 79 (1994) 111-114
-  <http://dx.doi.org/10.1016/0010-4655(94)90233-X>`_
+    .. autofunction:: rand
+    .. autofunction:: fill_rand
 Fast Fourier Transforms
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 1554d69588b75de3d8e8fbe6786c5eac1e57560b..ccfb977bd06ec91f8817395c80fd6de3adeab203 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -31,6 +31,8 @@ extensions = [
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['.templates']
+exclude_patterns = ['subst.rst']
 # The suffix of source filenames.
 source_suffix = '.rst'
diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py
index 9c33792a4636b481b38920e8c46e59205ce54c5a..e38b4fda424b7b91d8227dccc06230a3ff2a5f06 100644
--- a/pyopencl/algorithm.py
+++ b/pyopencl/algorithm.py
@@ -1136,7 +1136,7 @@ class KeyValueSorter(object):
                 values, keys, queue=queue, wait_for=wait_for)
         starts = cl.array.empty(queue, (nkeys+1), starts_dtype, allocator=allocator)
-        evt = starts.fill_and_return_event(len(values_sorted_by_key), wait_for=[evt])
+        evt = starts.fill_evt(len(values_sorted_by_key), wait_for=[evt])
         evt = knl_info.start_finder(starts, keys_sorted_by_key,
diff --git a/pyopencl/array.py b/pyopencl/array.py
index 7d6999054a21ebe4520a826fa1647783c76056c2..66f4508e59d4084df502c996edf1d99dd17d5c71 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -205,11 +205,58 @@ class DefaultAllocator(cl.tools.DeferredAllocator):
 # {{{ array class
 class Array(object):
-    """A :mod:`pyopencl` Array is used to do array-based calculation on
-    a compute device.
+    """A :class:`numpy.ndarray` work-alike that stores its data and performs its
+    computations on the compute device.  *shape* and *dtype* work exactly as in
+    :mod:`numpy`.  Arithmetic methods in :class:`Array` support the
+    broadcasting of scalars. (e.g. `array+5`)
-    This is mostly supposed to be a :mod:`numpy`-workalike. Operators
-    work on an element-by-element basis, just like :class:`numpy.ndarray`.
+    *cqa* must be a :class:`pyopencl.CommandQueue`. *cqa*
+    specifies the queue in which the array carries out its
+    computations by default. *cqa* will at some point be renamed *queue*,
+    so it should be considered 'positional-only'.
+    *allocator* may be `None` or a callable that, upon being called with an
+    argument of the number of bytes to be allocated, returns an
+    :class:`pyopencl.Buffer` object. (A :class:`pyopencl.tools.MemoryPool`
+    instance is one useful example of an object to pass here.)
+    .. versionchanged:: 2011.1
+        Renamed *context* to *cqa*, made it general-purpose.
+        All arguments beyond *order* should be considered keyword-only.
+    .. attribute :: data
+        The :class:`pyopencl.MemoryObject` instance created for the memory that backs
+        this :class:`Array`.
+    .. attribute :: shape
+        The tuple of lengths of each dimension in the array.
+    .. attribute :: dtype
+        The :class:`numpy.dtype` of the items in the GPU array.
+    .. attribute :: size
+        The number of meaningful entries in the array. Can also be computed by
+        multiplying up the numbers in :attr:`shape`.
+    .. attribute :: nbytes
+        The size of the entire array in bytes. Computed as :attr:`size` times
+        ``dtype.itemsize``.
+    .. attribute :: strides
+        Tuple of bytes to step in each dimension when traversing an array.
+    .. attribute :: flags
+        Return an object with attributes `c_contiguous`, `f_contiguous` and `forc`,
+        which may be used to query contiguity properties in analogy to
+        :attr:`numpy.ndarray.flags`.
     def __init__(self, cqa, shape, dtype, order="C", allocator=None,
@@ -375,6 +422,12 @@ class Array(object):
     def set(self, ary, queue=None, async=False):
+        """Transfer the contents of the :class:`numpy.ndarray` object *ary*
+        onto the device.
+        *ary* must have the same dtype and size (not necessarily shape) as *self*.
+        """
         assert ary.size == self.size
         assert ary.dtype == self.dtype
@@ -394,6 +447,11 @@ class Array(object):
                     is_blocking=not async)
     def get(self, queue=None, ary=None, async=False):
+        """Transfer the contents of *self* into *ary* or a newly allocated
+        :class:`numpy.ndarray`. If *ary* is given, it must have the right
+        size (not necessarily shape) and dtype.
+        """
         if ary is None:
             ary = np.empty(self.shape, self.dtype)
@@ -413,6 +471,8 @@ class Array(object):
         return ary
     def copy(self, queue=None):
+        """.. versionadded:: 2013.1"""
         queue = queue or self.queue
         result = self._new_like_me()
         cl.enqueue_copy(queue, result.data, self.data, byte_count=self.nbytes)
@@ -427,7 +487,8 @@ class Array(object):
     def __hash__(self):
         raise TypeError("pyopencl arrays are not hashable.")
-    # kernel invocation wrappers ----------------------------------------------
+    # {{{ kernel invocation wrappers
     def _axpbyz(out, afac, a, bfac, b, queue=None):
@@ -568,7 +629,10 @@ class Array(object):
             return self.__class__(self.context, self.shape, dtype,
-    # operators ---------------------------------------------------------------
+    # }}}
+    # {{{ operators
     def mul_add(self, selffac, other, otherfac, queue=None):
         """Return `selffac * self + otherfac*other`.
@@ -678,9 +742,7 @@ class Array(object):
         return self
     def __div__(self, other):
-        """Divides an array by an array or a scalar::
-           x = self / n
+        """Divides an array by an array or a scalar, i.e. ``self / other``.
         if isinstance(other, Array):
             result = self._new_like_me(_get_common_dtype(self, other, self.queue))
@@ -700,9 +762,7 @@ class Array(object):
     __truediv__ = __div__
     def __rdiv__(self,other):
-        """Divides an array by a scalar or an array::
-           x = n / self
+        """Divides a scalar or an array by this array, i.e. ``other / self``.
         if isinstance(other, Array):
@@ -718,17 +778,23 @@ class Array(object):
     __rtruediv__ = __rdiv__
-    def fill(self, value, queue=None, wait_for=None, return_event=False):
-        """Fills the array with the specified value."""
+    def fill(self, value, queue=None, wait_for=None):
+        """Fill the array with *value*.
+        :returns: *self*.
+        """
         self._fill(self, value, queue=queue, wait_for=wait_for)
         return self
-    def fill_and_return_event(self, value, queue=None, wait_for=None, return_event=False):
-        """Fills the array with the specified value."""
+    def fill_evt(self, value, queue=None, wait_for=None):
+        """Fills the array with the specified value.
+        :returns: A :class:`pyopencl.Event`.
+        """
         return self._fill(self, value, queue=queue, wait_for=wait_for)
     def __len__(self):
-        """Return the size of the leading dimension of self."""
+        """Returns the size of the leading dimension of *self*."""
         if len(self.shape):
             return self.shape[0]
@@ -736,7 +802,7 @@ class Array(object):
     def __abs__(self):
         """Return a `Array` of the absolute values of the elements
-        of `self`.
+        of *self*.
         result = self._new_like_me(self.dtype.type(0).real.dtype)
@@ -766,6 +832,8 @@ class Array(object):
         self._rpow_scalar(result, common_dtype.type(other), self)
         return result
+    # }}}
     def reverse(self, queue=None):
         """Return this array in reversed order. The array is treated
         as one-dimensional.
@@ -776,6 +844,7 @@ class Array(object):
         return result
     def astype(self, dtype, queue=None):
+        """Return *self*, cast to *dtype*."""
         if dtype == self.dtype:
             return self
@@ -783,7 +852,8 @@ class Array(object):
         self._copy(result, self, queue=queue)
         return result
-    # rich comparisons (or rather, lack thereof) ------------------------------
+    # {{{ rich comparisons (or rather, lack thereof)
     def __eq__(self, other):
         raise NotImplementedError
@@ -802,9 +872,10 @@ class Array(object):
     def __gt__(self, other):
         raise NotImplementedError
+    # }}}
     # {{{ complex-valued business
-    @property
     def real(self):
         if self.dtype.kind == "c":
             result = self._new_like_me(self.dtype.type(0).real.dtype)
@@ -812,8 +883,8 @@ class Array(object):
             return result
             return self
+    real = property(real, doc=".. versionadded:: 2012.1")
-    @property
     def imag(self):
         if self.dtype.kind == "c":
             result = self._new_like_me(self.dtype.type(0).real.dtype)
@@ -821,8 +892,10 @@ class Array(object):
             return result
             return zeros_like(self)
+    imag = property(imag, doc=".. versionadded:: 2012.1")
     def conj(self):
+        """.. versionadded:: 2012.1"""
         if self.dtype.kind == "c":
             result = self._new_like_me()
             self._conj(result, self)
@@ -835,6 +908,7 @@ class Array(object):
     # {{{ views
     def reshape(self, *shape):
+        """Returns an array containing the same data with a new shape."""
         # TODO: add more error-checking, perhaps
         if isinstance(shape[0], tuple) or isinstance(shape[0], list):
             shape = tuple(shape[0])
@@ -845,9 +919,14 @@ class Array(object):
         return self._new_with_changes(data=self.data, shape=shape)
     def ravel(self):
+        """Returns flattened array containing the same data."""
         return self.reshape(self.size)
     def view(self, dtype=None):
+        """Returns view of array with the same data. If *dtype* is different from
+        current dtype, the actual bytes of memory will be reinterpreted.
+        """
         if dtype is None:
             dtype = self.dtype
@@ -887,7 +966,14 @@ def as_strided(ary, shape=None, strides=None):
 # {{{ creation helpers
 def to_device(queue, ary, allocator=None, async=False):
-    """Converts a numpy array to a :class:`Array`."""
+    """Return a :class:`Array` that is an exact copy of the :class:`numpy.ndarray`
+    instance *ary*.
+    See :class:`Array` for the meaning of *allocator*.
+    .. versionchanged:: 2011.1
+        *context* argument was deprecated.
+    """
     if ary.dtype == object:
         raise RuntimeError("to_device does not work on object arrays.")
@@ -903,7 +989,12 @@ def to_device(queue, ary, allocator=None, async=False):
 empty = Array
 def zeros(queue, shape, dtype, order="C", allocator=None):
-    """Returns an array of the given shape and dtype filled with 0's."""
+    """Same as :func:`empty`, but the :class:`Array` is zero-initialized before
+    being returned.
+    .. versionchanged:: 2011.1
+        *context* argument was deprecated.
+    """
     result = Array(queue, shape, dtype,
             order=order, allocator=allocator)
@@ -912,9 +1003,17 @@ def zeros(queue, shape, dtype, order="C", allocator=None):
     return result
 def empty_like(ary):
+    """Make a new, uninitialized :class:`Array` having the same properties
+    as *ary*.
+    """
     return ary._new_with_changes(data=None)
 def zeros_like(ary):
+    """Make a new, zero-initialized :class:`Array` having the same properties
+    as *ary*.
+    """
     result = empty_like(ary)
     zero = np.zeros((), ary.dtype)
@@ -927,12 +1026,21 @@ def _arange_knl(result, start, step):
             result.context, result.dtype)
 def arange(queue, *args, **kwargs):
-    """Create an array filled with numbers spaced `step` apart,
+    """Create a :class:`Array` filled with numbers spaced `step` apart,
     starting from `start` and ending at `stop`.
     For floating point arguments, the length of the result is
     `ceil((stop - start)/step)`.  This rule may result in the last
-    element of the result being greater than stop.
+    element of the result being greater than `stop`.
+    *dtype*, if not specified, is taken as the largest common type
+    of *start*, *stop* and *step*.
+    .. versionchanged:: 2011.1
+        *context* argument was deprecated.
+    .. versionchanged:: 2011.2
+        *allocator* keyword argument was added.
     # argument processing -----------------------------------------------------
@@ -1019,6 +1127,10 @@ def _take(result, ary, indices):
 def take(a, indices, out=None, queue=None):
+    """Return the :class:`Array` ``[a[indices[0]], ..., a[indices[n]]]``.
+    For the moment, *a* must be a type that can be bound to a texture.
+    """
     queue = queue or a.queue
     if out is None:
         out = Array(queue, indices.shape, a.dtype, allocator=a.allocator)
@@ -1224,33 +1336,30 @@ def _if_positive(result, criterion, then_, else_):
 def if_positive(criterion, then_, else_, out=None, queue=None):
+    """Return an array like *then_*, which, for the element at index *i*,
+    contains *then_[i]* if *criterion[i]>0*, else *else_[i]*.
+    """
     if not (criterion.shape == then_.shape == else_.shape):
         raise ValueError("shapes do not match")
     if not (then_.dtype == else_.dtype):
         raise ValueError("dtypes do not match")
-    knl = elementwise.get_if_positive_kernel(
-            criterion.context,
-            criterion.dtype, then_.dtype)
     if out is None:
         out = empty_like(then_)
     _if_positive(out, criterion, then_, else_)
     return out
 def maximum(a, b, out=None, queue=None):
+    """Return the elementwise maximum of *a* and *b*."""
     # silly, but functional
     return if_positive(a.mul_add(1, b, -1, queue=queue), a, b,
             queue=queue, out=out)
 def minimum(a, b, out=None, queue=None):
+    """Return the elementwise minimum of *a* and *b*."""
     # silly, but functional
     return if_positive(a.mul_add(1, b, -1, queue=queue), b, a,
             queue=queue, out=out)
@@ -1262,16 +1371,25 @@ _builtin_min = min
 _builtin_max = max
 def sum(a, dtype=None, queue=None):
+    """
+    .. versionadded:: 2011.1
+    """
     from pyopencl.reduction import get_sum_kernel
     krnl = get_sum_kernel(a.context, dtype, a.dtype)
     return krnl(a, queue=queue)
 def dot(a, b, dtype=None, queue=None):
+    """
+    .. versionadded:: 2011.1
+    """
     from pyopencl.reduction import get_dot_kernel
     krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype)
     return krnl(a, b, queue=queue)
 def subset_dot(subset, a, b, dtype=None, queue=None):
+    """
+    .. versionadded:: 2011.1
+    """
     from pyopencl.reduction import get_subset_dot_kernel
     krnl = get_subset_dot_kernel(a.context, dtype, subset.dtype, a.dtype, b.dtype)
     return krnl(subset, a, b, queue=queue)
@@ -1285,7 +1403,14 @@ def _make_minmax_kernel(what):
     return f
 min = _make_minmax_kernel("min")
+min.__doc__ = """
+    .. versionadded:: 2011.1
+    """
 max = _make_minmax_kernel("max")
+max.__doc__ = """
+    .. versionadded:: 2011.1
+    """
 def _make_subset_minmax_kernel(what):
     def f(subset, a, queue=None):
@@ -1296,7 +1421,9 @@ def _make_subset_minmax_kernel(what):
     return f
 subset_min = _make_subset_minmax_kernel("min")
+subset_min.__doc__ = """.. versionadded:: 2011.1"""
 subset_max = _make_subset_minmax_kernel("max")
+subset_max.__doc__ = """.. versionadded:: 2011.1"""
 # }}}
diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py
index 5db53e671b4219e5cc7f89cc44e0de2a74a5a8e2..43baca7ae941276fec622d9dac57380a7e1ccce5 100644
--- a/pyopencl/clrandom.py
+++ b/pyopencl/clrandom.py
@@ -1,3 +1,4 @@
+# encoding: utf8
 from __future__ import division
 __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
+# {{{ documentation
+__doc__ = """
+PyOpenCL now includes and uses the `RANLUXCL random number generator
+<https://bitbucket.org/ivarun/ranluxcl/>`_ by Ivar Ursin Nikolaisen.  In
+addition to being usable through the convenience functions in this module, it is
+available in any piece of code compiled through PyOpenCL by::
+    #include <pyopencl-ranluxcl.cl>
+See the `source <https://github.com/inducer/pyopencl/blob/master/src/cl/pyopencl-ranluxcl.cl>`_
+for some documentation if you're planning on using RANLUXCL directly.
+The RANLUX generator is described in the following two articles. If you use the
+generator for scientific purposes, please consider citing them:
+* Martin Lüscher, A portable high-quality random number generator for lattice
+  field theory simulations, `Computer Physics Communications 79 (1994) 100-110
+  <http://dx.doi.org/10.1016/0010-4655(94)90232-1>`_
+* F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom
+  number generator of Lüscher, `Computer Physics Communications 79 (1994) 111-114
+  <http://dx.doi.org/10.1016/0010-4655(94)90233-X>`_
+# }}}
 import pyopencl as cl
 import pyopencl.array as cl_array
 from pyopencl.tools import first_arg_dependent_memoize
@@ -32,10 +60,47 @@ import numpy as np
 class RanluxGenerator(object):
+    """
+    .. versionadded:: 2011.2
+    .. attribute:: state
+        A :class:`pyopencl.array.Array` containing the state of the generator.
+    .. attribute:: nskip
+        nskip is an integer which can (optionally) be defined in the kernel code
+        as RANLUXCL_NSKIP. If this is done the generator will be faster for luxury setting
+        0 and 1, or when the p-value is manually set to a multiple of 24.
+    """
     def __init__(self, queue, num_work_items=None,
             luxury=None, seed=None, no_warmup=False,
             use_legacy_init=False, max_work_items=None):
+        """
+        :param queue: :class:`pyopencl.CommandQueue`, only used for initialization
+        :param luxury: the "luxury value" of the generator, and should be 0-4, where 0 is fastest
+            and 4 produces the best numbers. It can also be >=24, in which case it directly
+            sets the p-value of RANLUXCL.
+        :param num_work_items: is the number of generators to initialize, usually corresponding
+            to the number of work-items in the NDRange RANLUXCL will be used with.
+            May be `None`, in which case a default value is used.
+        :param max_work_items: should reflect the maximum number of work-items that will be used
+            on any parallel instance of RANLUXCL. So for instance if we are launching 5120
+            work-items on GPU1 and 10240 work-items on GPU2, GPU1's RANLUXCLTab would be
+            generated by calling ranluxcl_initialization with numWorkitems = 5120 while
+            GPU2's RANLUXCLTab would use numWorkitems = 10240. However maxWorkitems must
+            be at least 10240 for both GPU1 and GPU2, and it must be set to the same value
+            for both. (may be `None`)
+        .. versionchanged:: 2013.1
+            Added default value for `num_work_items`.
+        """
         if luxury is None:
             luxury = 4
@@ -215,6 +280,10 @@ class RanluxGenerator(object):
         return knl, size_multiplier
     def fill_uniform(self, ary, a=0, b=1, queue=None):
+        """Fill *ary* with uniformly distributed random numbers in the interval
+        *(a, b)*, endpoints excluded.
+        """
         if queue is None:
             queue = ary.queue
@@ -225,6 +294,8 @@ class RanluxGenerator(object):
                 b-a, a)
     def uniform(self, *args, **kwargs):
+        """Make a new empty array, apply :meth:`fill_uniform` to it.
+        """
         a = kwargs.pop("a", 0)
         b = kwargs.pop("b", 1)
@@ -234,6 +305,10 @@ class RanluxGenerator(object):
         return result
     def fill_normal(self, ary, mu=0, sigma=1, queue=None):
+        """Fill *ary* with normally distributed numbers with mean *mu* and
+        standard deviation *sigma*.
+        """
         if queue is None:
             queue = ary.queue
@@ -243,6 +318,8 @@ class RanluxGenerator(object):
                 self.state.data, ary.data, ary.size*size_multiplier, sigma, mu)
     def normal(self, *args, **kwargs):
+        """Make a new empty array, apply :meth:`fill_normal` to it.
+        """
         mu = kwargs.pop("mu", 0)
         sigma = kwargs.pop("sigma", 1)
@@ -273,6 +350,11 @@ class RanluxGenerator(object):
         return prg.sync
     def synchronize(self, queue):
+        """The generator gets inefficient when different work items invoke the
+        generator a differing number of times. This function ensures
+        efficiency.
+        """
         self.get_sync_kernel()(queue, (self.num_work_items,), self.wg_size, self.state.data)
@@ -290,6 +372,8 @@ def _get_generator(queue, luxury=None):
 def fill_rand(result, queue=None, luxury=4, a=0, b=1):
+    """Fill *result* with random values of `dtype` in the range [0,1).
+    """
     if queue is None:
         queue = result.queue
     gen = _get_generator(queue, luxury=luxury)
@@ -299,6 +383,10 @@ def fill_rand(result, queue=None, luxury=4, a=0, b=1):
 def rand(queue, shape, dtype, luxury=None, a=0, b=1):
+    """Return an array of `shape` filled with random values of `dtype`
+    in the range [a,b).
+    """
     from pyopencl.array import Array
     gen = _get_generator(queue, luxury)
     result = Array(queue, shape, dtype)
@@ -307,4 +395,4 @@ def rand(queue, shape, dtype, luxury=None, a=0, b=1):
-# vim: filetype=pyopencl
+# vim: filetype=pyopencl:foldmethod=marker