From 0ac073fedcf4d33a2d8afaf32005d4240438008a Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 27 Jun 2010 00:30:46 -0400
Subject: [PATCH] Port array functionality from PyCUDA.

---
 examples/demo_elementwise.py |  26 ++
 pyopencl/array.py            | 875 +++++++++++++++++++++++++++++++++++
 pyopencl/clrandom.py         | 259 +++++++++++
 pyopencl/elementwise.py      | 466 +++++++++++++++++++
 pyopencl/tools.py            | 173 +++++++
 setup.py                     |   3 +-
 test/test_array.py           | 593 ++++++++++++++++++++++++
 7 files changed, 2394 insertions(+), 1 deletion(-)
 create mode 100644 examples/demo_elementwise.py
 create mode 100644 pyopencl/array.py
 create mode 100644 pyopencl/clrandom.py
 create mode 100644 pyopencl/elementwise.py
 create mode 100644 pyopencl/tools.py
 create mode 100644 test/test_array.py

diff --git a/examples/demo_elementwise.py b/examples/demo_elementwise.py
new file mode 100644
index 00000000..cf89121a
--- /dev/null
+++ b/examples/demo_elementwise.py
@@ -0,0 +1,26 @@
+import pyopencl as cl
+import pyopencl.array as cl_array
+import numpy
+
+ctx = cl.create_some_context()
+queue = cl.CommandQueue(ctx)
+
+n = 10
+a_gpu = cl_array.to_device(
+        ctx, queue, numpy.random.randn(n).astype(numpy.float32))
+b_gpu = cl_array.to_device(
+        ctx, queue, numpy.random.randn(n).astype(numpy.float32))
+
+from pyopencl.elementwise import ElementwiseKernel
+lin_comb = ElementwiseKernel(ctx,
+        "float a, float *x, "
+        "float b, float *y, "
+        "float *z",
+        "z[i] = a*x[i] + b*y[i]",
+        "linear_combination")
+
+c_gpu = cl_array.empty_like(a_gpu)
+lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
+
+import numpy.linalg as la
+assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
diff --git a/pyopencl/array.py b/pyopencl/array.py
new file mode 100644
index 00000000..474a6bc7
--- /dev/null
+++ b/pyopencl/array.py
@@ -0,0 +1,875 @@
+"""CL device arrays."""
+
+from __future__ import division
+
+__copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+
+
+
+import numpy
+import pyopencl.elementwise as elementwise
+import pyopencl as cl
+
+
+
+def splay(ctx, n):
+    min_work_items = 32
+    max_work_items = 128
+    max_groups = max(
+            4 * dev.max_compute_units * 8
+            # 4 to overfill the device
+            # 8 is an Nvidia constant--that's how many
+            # groups fit onto one compute device
+            for dev in ctx.devices)
+
+    if n < min_work_items:
+        group_count = 1
+        work_items_per_group = min_work_items
+    elif n < (max_groups * min_work_items):
+        group_count = (n + min_work_items - 1) // min_work_items
+        work_items_per_group = min_work_items
+    elif n < (max_groups * max_work_items):
+        group_count = max_groups
+        grp = (n + min_work_items - 1) // min_work_items
+        work_items_per_group = ((grp + max_groups -1) // max_groups) * min_work_items
+    else:
+        group_count = max_groups
+        work_items_per_group = max_work_items
+
+    #print "n:%d gc:%d wipg:%d" % (n, group_count, work_items_per_group)
+    return (group_count*work_items_per_group, 1, 1), \
+            (work_items_per_group, 1, 1)
+
+
+
+
+def _get_common_dtype(obj1, obj2):
+    return (obj1.dtype.type(0) + obj2.dtype.type(0)).dtype
+
+
+
+
+class DefaultAllocator:
+    def __init__(self, context, flags=cl.mem_flags.READ_WRITE):
+        self.context = context
+        self.flags = flags
+
+    def __call__(self, size):
+        return cl.Buffer(self.context, self.flags, size)
+
+
+
+
+class Array(object):
+    """A :mod:`pyopencl` Array is used to do array-based calculation on
+    a compute device.
+
+    This is mostly supposed to be a :mod:`numpy`-workalike. Operators
+    work on an element-by-element basis, just like :class:`numpy.ndarray`.
+    """
+
+    def __init__(self, context, shape, dtype, allocator=None,
+            base=None, data=None, queue=None):
+        if allocator is None:
+            allocator = DefaultAllocator(context)
+
+        try:
+            s = 1
+            for dim in shape:
+                s *= dim
+        except TypeError:
+            if not isinstance(shape, int):
+                raise TypeError("shape must either be iterable or "
+                        "castable to an integer")
+            s = shape
+            shape = (shape,)
+
+        self.context = context
+        self.queue = queue
+
+        self.shape = shape
+        self.dtype = numpy.dtype(dtype)
+
+        self.mem_size = self.size = s
+        self.nbytes = self.dtype.itemsize * self.size
+
+        self.allocator = allocator
+        if data is None:
+            if self.size:
+                self.data = self.allocator(self.size * self.dtype.itemsize)
+            else:
+                self.data = None
+
+            if base is not None:
+                raise ValueError("If data is specified, base must be None.")
+        else:
+            self.data = data
+
+        self.base = base
+
+        self._global_size, self._local_size = splay(context, self.mem_size)
+
+    def set(self, ary, queue=None, async=False):
+        assert ary.size == self.size
+        assert ary.dtype == self.dtype
+        if self.size:
+            evt = cl.enqueue_write_buffer(queue or self.queue, self.data, ary)
+
+            if not async:
+                evt.wait()
+
+    def get(self, queue=None, ary=None, async=False):
+        if ary is None:
+            ary = numpy.empty(self.shape, self.dtype)
+        else:
+            if ary.size != self.size:
+                raise TypeError("'ary' has non-matching type")
+            if ary.dtype != self.dtype:
+                raise TypeError("'ary' has non-matching size")
+
+        if self.size:
+            evt = cl.enqueue_read_buffer(queue or self.queue, self.data, ary)
+
+            if not async:
+                evt.wait()
+
+        return ary
+
+    def __str__(self):
+        return str(self.get())
+
+    def __repr__(self):
+        return repr(self.get())
+
+    def __hash__(self):
+        raise TypeError("pyopencl arrays are not hashable.")
+
+    # kernel invocation wrappers ----------------------------------------------
+    def _axpbyz(self, selffac, other, otherfac, out, queue=None):
+        """Compute ``out = selffac * self + otherfac*other``,
+        where `other` is a vector.."""
+        assert self.shape == other.shape
+
+        knl = elementwise.get_axpbyz_kernel(
+                self.context, self.dtype, other.dtype, out.dtype)
+
+        knl(queue or self.queue, self._global_size, self._local_size,
+                selffac, self.data, otherfac, other.data,
+                out.data, self.mem_size)
+
+        return out
+
+    def _axpbz(self, selffac, other, out, queue=None):
+        """Compute ``out = selffac * self + other``, where `other` is a scalar."""
+        knl = elementwise.get_axpbz_kernel(self.context, self.dtype)
+        knl(queue or self.queue, self._global_size, self._local_size,
+                selffac, self.data, other, out.data, self.mem_size)
+
+        return out
+
+    def _elwise_multiply(self, other, out, queue=None):
+        knl = elementwise.get_multiply_kernel(
+                self.context, self.dtype, other.dtype, out.dtype)
+        knl(queue or self.queue, self._global_size, self._local_size,
+                self.data, other.data,
+                out.data, self.mem_size)
+
+        return out
+
+    def _rdiv_scalar(self, other, out, queue=None):
+        """Divides an array by a scalar::
+
+           y = n / self
+        """
+
+        assert self.dtype == numpy.float32
+
+        knl = elementwise.get_rdivide_elwise_kernel(self.context, self.dtype)
+        knl(queue or self.queue, self._global_size, self._local_size,
+                self.data, other,
+                out.data, self.mem_size)
+
+        return out
+
+    def _div(self, other, out, queue=None):
+        """Divides an array by another array."""
+
+        assert self.shape == other.shape
+
+        knl = elementwise.get_divide_kernel(self.context,
+                self.dtype, other.dtype, out.dtype)
+        knl(queue or self.queue, self._global_size, self._local_size,
+                self.data, other.data, out.data, self.mem_size)
+
+        return out
+
+    def _new_like_me(self, dtype=None):
+        if dtype is None:
+            dtype = self.dtype
+        return self.__class__(self.context,
+                self.shape, dtype, allocator=self.allocator,
+                queue=self.queue)
+
+    # operators ---------------------------------------------------------------
+    def mul_add(self, selffac, other, otherfac, add_timer=None, queue=None):
+        """Return `selffac * self + otherfac*other`.
+        """
+        result = self._new_like_me(_get_common_dtype(self, other))
+        return self._axpbyz(selffac, other, otherfac, result, add_timer)
+
+    def __add__(self, other):
+        """Add an array with an array or an array with a scalar."""
+
+        if isinstance(other, Array):
+            # add another vector
+            result = self._new_like_me(_get_common_dtype(self, other))
+            return self._axpbyz(1, other, 1, result)
+        else:
+            # add a scalar
+            if other == 0:
+                return self
+            else:
+                result = self._new_like_me()
+                return self._axpbz(1, other, result)
+
+    __radd__ = __add__
+
+    def __sub__(self, other):
+        """Substract an array from an array or a scalar from an array."""
+
+        if isinstance(other, Array):
+            result = self._new_like_me(_get_common_dtype(self, other))
+            return self._axpbyz(1, other, -1, result)
+        else:
+            if other == 0:
+                return self
+            else:
+                # create a new array for the result
+                result = self._new_like_me()
+                return self._axpbz(1, -other, result)
+
+    def __rsub__(self,other):
+        """Substracts an array by a scalar or an array::
+
+           x = n - self
+        """
+        # other must be a scalar
+        result = self._new_like_me()
+        return self._axpbz(-1, other, result)
+
+    def __iadd__(self, other):
+        return self._axpbyz(1, other, 1, self)
+
+    def __isub__(self, other):
+        return self._axpbyz(1, other, -1, self)
+
+    def __neg__(self):
+        result = self._new_like_me()
+        return self._axpbz(-1, 0, result)
+
+    def __mul__(self, other):
+        if isinstance(other, Array):
+            result = self._new_like_me(_get_common_dtype(self, other))
+            return self._elwise_multiply(other, result)
+        else:
+            result = self._new_like_me()
+            return self._axpbz(other, 0, result)
+
+    def __rmul__(self, scalar):
+        result = self._new_like_me()
+        return self._axpbz(scalar, 0, result)
+
+    def __imul__(self, scalar):
+        return self._axpbz(scalar, 0, self)
+
+    def __div__(self, other):
+        """Divides an array by an array or a scalar::
+
+           x = self / n
+        """
+        if isinstance(other, Array):
+            result = self._new_like_me(_get_common_dtype(self, other))
+            return self._div(other, result)
+        else:
+            if other == 1:
+                return self
+            else:
+                # create a new array for the result
+                result = self._new_like_me()
+                return self._axpbz(1/other, 0, result)
+
+    __truediv__ = __div__
+
+    def __rdiv__(self,other):
+        """Divides an array by a scalar or an array::
+
+           x = n / self
+        """
+
+        if isinstance(other, Array):
+            result = self._new_like_me(_get_common_dtype(self, other))
+
+            knl = elementwise.get_divide_kernel(self.context)
+            knl(self.queue, self._global_size, self._local_size,
+                    other.data, self.data, result.data,
+                    self.mem_size)
+
+            return result
+        else:
+            if other == 1:
+                return self
+            else:
+                # create a new array for the result
+                result = self._new_like_me()
+                return self._rdiv_scalar(other, result)
+
+    __rtruediv__ = __div__
+
+    def fill(self, value, queue=None):
+        """fills the array with the specified value"""
+        knl = elementwise.get_fill_kernel(self.context, self.dtype)
+        knl(queue or self.queue, self._global_size, self._local_size,
+                value, self.data, self.mem_size)
+
+        return self
+
+    def __len__(self):
+        """Return the size of the leading dimension of self."""
+        if len(self.shape):
+            return self.shape[0]
+        else:
+            return 1
+
+    def __abs__(self):
+        """Return a `Array` of the absolute values of the elements
+        of `self`.
+        """
+
+        result = self._new_like_me()
+
+        if self.dtype.kind == "f":
+            fname = "fabs"
+        elif self.dtype.kind in ["u", "i"]:
+            fname = "abs"
+        else:
+            raise TypeError("unsupported dtype in __abs__()")
+
+        knl = elementwise.get_unary_func_kernel(
+                self.context, fname, self.dtype)
+        knl(self.queue, self._global_size, self._local_size,
+                self.data,result.data, self.mem_size)
+
+        return result
+
+    def __pow__(self, other):
+        """Exponentiation by a scalar or elementwise by another
+        :class:`Array`.
+        """
+
+        if isinstance(other, Array):
+            assert self.shape == other.shape
+
+            result = self._new_like_me(_get_common_dtype(self, other))
+
+            knl = elementwise.get_pow_array_kernel(self.context,
+                    self.dtype, other.dtype, result.dtype)
+
+            knl(self.queue, self._global_size, self._local_size,
+                    self.data, other.data, result.data,
+                    self.mem_size)
+
+            return result
+        else:
+            result = self._new_like_me()
+            knl = elementwise.get_pow_kernel(self.context, self.dtype)
+            knl(self.queue, self._global_size, self._local_size,
+                    other, self.data, result.data,
+                    self.mem_size)
+
+            return result
+
+    def reverse(self, queue=None):
+        """Return this array in reversed order. The array is treated
+        as one-dimensional.
+        """
+
+        result = self._new_like_me()
+
+        knl = elementwise.get_reverse_kernel(self.context, self.dtype)
+        knl(queue or self.queue, self._global_size, self._local_size,
+                self.data, result.data, self.mem_size)
+
+        return result
+
+    def astype(self, dtype, queue=None):
+        if dtype == self.dtype:
+            return self
+
+        result = self._new_like_me(dtype=dtype)
+
+        knl = elementwise.get_copy_kernel(self.context, dtype, self.dtype)
+        knl(queue or self.queue, self._global_size, self._local_size,
+                result.data, self.data, self.mem_size)
+
+        return result
+
+
+    # slicing -----------------------------------------------------------------
+    def __getitem__(self, idx):
+        if idx == ():
+            return self
+
+        if len(self.shape) > 1:
+            raise NotImplementedError("multi-d slicing is not yet implemented")
+
+        if not isinstance(idx, slice):
+            raise ValueError("non-slice indexing not supported: %s" % (idx,))
+
+        l, = self.shape
+        start, stop, stride = idx.indices(l)
+
+        if stride != 1:
+            raise NotImplementedError("strided slicing is not yet implemented")
+
+        return Array(
+                shape=((stop-start)//stride,),
+                dtype=self.dtype,
+                allocator=self.allocator,
+                base=self,
+                data=int(self.data) + start*self.dtype.itemsize)
+
+    # rich comparisons (or rather, lack thereof) ------------------------------
+    def __eq__(self, other):
+        raise NotImplementedError
+
+    def __ne__(self, other):
+        raise NotImplementedError
+
+    def __le__(self, other):
+        raise NotImplementedError
+
+    def __ge__(self, other):
+        raise NotImplementedError
+
+    def __lt__(self, other):
+        raise NotImplementedError
+
+    def __gt__(self, other):
+        raise NotImplementedError
+
+
+
+def to_device(context, queue, ary, allocator=None, async=False):
+    """Converts a numpy array to a :class:`Array`."""
+    result = Array(context, ary.shape, ary.dtype, allocator,
+            queue=queue)
+    result.set(ary, async=async)
+    return result
+
+
+empty = Array
+
+def zeros(context, queue, shape, dtype, allocator=None):
+    """Returns an array of the given shape and dtype filled with 0's."""
+
+    result = Array(context, shape, dtype, allocator, queue=queue)
+    result.fill(0)
+    return result
+
+def empty_like(ary):
+    result = Array(ary.context,
+            ary.shape, ary.dtype, ary.allocator, queue=ary.queue)
+    return result
+
+def zeros_like(ary):
+    result = Array(ary.context,
+            ary.shape, ary.dtype, ary.allocator, queue=ary.queue)
+    result.fill(0)
+    return result
+
+
+def arange(context, queue, *args, **kwargs):
+    """Create an array filled with numbers spaced `step` apart,
+    starting from `start` and ending at `stop`.
+
+    For floating point arguments, the length of the result is
+    `ceil((stop - start)/step)`.  This rule may result in the last
+    element of the result being greater than stop.
+    """
+
+    # argument processing -----------------------------------------------------
+
+    # Yuck. Thanks, numpy developers. ;)
+    from pytools import Record
+    class Info(Record):
+        pass
+
+    explicit_dtype = False
+
+    inf = Info()
+    inf.start = None
+    inf.stop = None
+    inf.step = None
+    inf.dtype = None
+
+    if isinstance(args[-1], numpy.dtype):
+        dtype = args[-1]
+        args = args[:-1]
+        explicit_dtype = True
+
+    argc = len(args)
+    if argc == 0:
+        raise ValueError, "stop argument required"
+    elif argc == 1:
+        inf.stop = args[0]
+    elif argc == 2:
+        inf.start = args[0]
+        inf.stop = args[1]
+    elif argc == 3:
+        inf.start = args[0]
+        inf.stop = args[1]
+        inf.step = args[2]
+    else:
+        raise ValueError, "too many arguments"
+
+    admissible_names = ["start", "stop", "step", "dtype"]
+    for k, v in kwargs.iteritems():
+        if k in admissible_names:
+            if getattr(inf, k) is None:
+                setattr(inf, k, v)
+                if k == "dtype":
+                    explicit_dtype = True
+            else:
+                raise ValueError, "may not specify '%s' by position and keyword" % k
+        else:
+            raise ValueError, "unexpected keyword argument '%s'" % k
+
+    if inf.start is None:
+        inf.start = 0
+    if inf.step is None:
+        inf.step = 1
+    if inf.dtype is None:
+        inf.dtype = numpy.array([inf.start, inf.stop, inf.step]).dtype
+
+    # actual functionality ----------------------------------------------------
+    dtype = numpy.dtype(inf.dtype)
+    start = dtype.type(inf.start)
+    step = dtype.type(inf.step)
+    stop = dtype.type(inf.stop)
+
+    if not explicit_dtype:
+        raise TypeError("arange requires dtype argument")
+
+    from math import ceil
+    size = int(ceil((stop-start)/step))
+
+    result = Array(context, (size,), dtype, queue=queue)
+
+    knl = elementwise.get_arange_kernel(context, dtype)
+    knl(queue, result._global_size, result._local_size,
+            result.data, start, step, size)
+
+    return result
+
+
+
+
+def take(a, indices, out=None, queue=None):
+    queue = queue or a.queue
+
+    if out is None:
+        out = Array(a.context, indices.shape, a.dtype, a.allocator,
+                queue=queue)
+
+    assert len(indices.shape) == 1
+
+    knl = elementwise.get_take_kernel(
+            a.context, a.dtype, indices.dtype)
+    knl(queue, a._global_size, a._local_size,
+            indices.data, a.data, out.data, indices.size)
+
+    return out
+
+
+
+
+def multi_take(arrays, indices, out=None, queue=None):
+    if not len(arrays):
+        return []
+
+    assert len(indices.shape) == 1
+
+    from pytools import single_valued
+    a_dtype = single_valued(a.dtype for a in arrays)
+    a_allocator = arrays[0].dtype
+    context = indices.context
+    queue = queue or indices.queue
+
+    vec_count = len(arrays)
+
+    if out is None:
+        out = [Array(context, queue, indices.shape, a_dtype, a_allocator)
+                for i in range(vec_count)]
+    else:
+        if len(out) != len(arrays):
+            raise ValueError("out and arrays must have the same length")
+
+    chunk_size = _builtin_min(vec_count, 10)
+
+    def make_func_for_chunk_size(chunk_size):
+        knl = elementwise.get_take_kernel(
+                indices.context, a_dtype, indices.dtype,
+                vec_count=chunk_size)
+        knl.set_block_shape(*indices._block)
+        return knl
+
+    knl = make_func_for_chunk_size(chunk_size)
+
+    for start_i in range(0, len(arrays), chunk_size):
+        chunk_slice = slice(start_i, start_i+chunk_size)
+
+        if start_i + chunk_size > vec_count:
+            knl = make_func_for_chunk_size(vec_count-start_i)
+
+        knl(queue, indices._global_size, indices._local_size,
+                indices.data,
+                *([i.data for i in arrays[chunk_slice]]
+                    + [o.data for o in out[chunk_slice]]
+                    + [indices.size]))
+
+    return out
+
+
+
+
+def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None,
+        out=None, queue=None, src_offsets=None):
+    if not len(arrays):
+        return []
+
+    from pytools import single_valued
+    a_dtype = single_valued(a.dtype for a in arrays)
+    a_allocator = arrays[0].allocator
+    context = src_indices.context
+    queue = queue or src_indices.queue
+
+    vec_count = len(arrays)
+
+    if out is None:
+        out = [Array(context, dest_shape, a_dtype, a_allocator, queue=queue)
+                for i in range(vec_count)]
+    else:
+        if a_dtype != single_valued(o.dtype for o in out):
+            raise TypeError("arrays and out must have the same dtype")
+        if len(out) != vec_count:
+            raise ValueError("out and arrays must have the same length")
+
+    if src_indices.dtype != dest_indices.dtype:
+        raise TypeError("src_indices and dest_indices must have the same dtype")
+
+    if len(src_indices.shape) != 1:
+        raise ValueError("src_indices must be 1D")
+
+    if src_indices.shape != dest_indices.shape:
+        raise ValueError("src_indices and dest_indices must have the same shape")
+
+    if src_offsets is None:
+        src_offsets_list = []
+    else:
+        src_offsets_list = src_offsets
+        if len(src_offsets) != vec_count:
+            raise ValueError("src_indices and src_offsets must have the same length")
+
+    max_chunk_size = 10
+
+    chunk_size = _builtin_min(vec_count, max_chunk_size)
+
+    def make_func_for_chunk_size(chunk_size):
+        return elementwise.get_take_put_kernel(context,
+                a_dtype, src_indices.dtype,
+                with_offsets=src_offsets is not None,
+                vec_count=chunk_size)
+
+    knl = make_func_for_chunk_size(chunk_size)
+
+    for start_i in range(0, len(arrays), chunk_size):
+        chunk_slice = slice(start_i, start_i+chunk_size)
+
+        if start_i + chunk_size > vec_count:
+            knl = make_func_for_chunk_size(vec_count-start_i)
+
+        knl(queue, src_indices._global_size, src_indices._local_size,
+                dest_indices.data, src_indices.data,
+                *([i.data for i in arrays[chunk_slice]]
+                    + [o.data for o in out[chunk_slice]]
+                    + src_offsets_list[chunk_slice]
+                    + [src_indices.size]))
+
+    return out
+
+
+
+
+def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None):
+    if not len(arrays):
+        return []
+
+    from pytools import single_valued
+    a_dtype = single_valued(a.dtype for a in arrays)
+    a_allocator = arrays[0].allocator
+    context = dest_indices.context
+    queue = queue or dest_indices.queue
+
+    vec_count = len(arrays)
+
+    if out is None:
+        out = [Array(context, dest_shape, a_dtype, a_allocator, queue=queue)
+                for i in range(vec_count)]
+    else:
+        if a_dtype != single_valued(o.dtype for o in out):
+            raise TypeError("arrays and out must have the same dtype")
+        if len(out) != vec_count:
+            raise ValueError("out and arrays must have the same length")
+
+    if len(dest_indices.shape) != 1:
+        raise ValueError("src_indices must be 1D")
+
+    chunk_size = _builtin_min(vec_count, 10)
+
+    def make_func_for_chunk_size(chunk_size):
+        knl = elementwise.get_put_kernel(
+                a_dtype, dest_indices.dtype, vec_count=chunk_size)
+        knl.set_block_shape(*dest_indices._block)
+        return knl
+
+    knl = make_func_for_chunk_size(chunk_size)
+
+    for start_i in range(0, len(arrays), chunk_size):
+        chunk_slice = slice(start_i, start_i+chunk_size)
+
+        if start_i + chunk_size > vec_count:
+            knl = make_func_for_chunk_size(vec_count-start_i)
+
+        knl(queue, dest_indices._global_size, dest_indices._local_size,
+                dest_indices.data,
+                *([o.data for o in out[chunk_slice]]
+                    + [i.data for i in arrays[chunk_slice]]
+                    + [dest_indices.size]))
+
+    return out
+
+
+
+
+def if_positive(criterion, then_, else_, out=None, queue=None):
+    if not (criterion.shape == then_.shape == else_.shape):
+        raise ValueError("shapes do not match")
+
+    if not (then_.dtype == else_.dtype):
+        raise ValueError("dtypes do not match")
+
+    knl = elementwise.get_if_positive_kernel(
+            criterion.context,
+            criterion.dtype, then_.dtype)
+
+    if out is None:
+        out = empty_like(then_)
+
+    knl(queue or criterion.queue, criterion._global_size, criterion._local_size,
+            criterion.data, then_.data, else_.data, out.data,
+            criterion.size)
+
+    return out
+
+
+
+
+def maximum(a, b, out=None, queue=None):
+    # silly, but functional
+    return if_positive(a.mul_add(1, b, -1, queue=queue), a, b,
+            queue=queue, out=out)
+
+
+
+
+def minimum(a, b, out=None, queue=None):
+    # silly, but functional
+    return if_positive(a.mul_add(1, b, -1, queue=queue), b, a,
+            queue=queue, out=out)
+
+
+
+
+# reductions ------------------------------------------------------------------
+_builtin_min = min
+_builtin_max = max
+
+if False:
+    # not yet
+    def sum(a, dtype=None, queue=None):
+        from pyopencl.reduction import get_sum_kernel
+        krnl = get_sum_kernel(dtype, a.dtype)
+        return krnl(a, queue=queue)
+
+    def dot(a, b, dtype=None, queue=None):
+        from pyopencl.reduction import get_dot_kernel
+        krnl = get_dot_kernel(dtype, a.dtype, b.dtype)
+        return krnl(a, b, queue=queue)
+
+    def subset_dot(subset, a, b, dtype=None, queue=None):
+        from pyopencl.reduction import get_subset_dot_kernel
+        krnl = get_subset_dot_kernel(dtype, a.dtype, b.dtype)
+        return krnl(subset, a, b, queue=queue)
+
+    def _make_minmax_kernel(what):
+        def f(a, queue=None):
+            from pyopencl.reduction import get_minmax_kernel
+            krnl = get_minmax_kernel(what, a.dtype)
+            return krnl(a,  queue=queue)
+
+        return f
+
+    min = _make_minmax_kernel("min")
+    max = _make_minmax_kernel("max")
+
+    def _make_subset_minmax_kernel(what):
+        def f(subset, a, queue=None):
+            from pyopencl.reduction import get_subset_minmax_kernel
+            import pyopencl.reduction
+            krnl = get_subset_minmax_kernel(what, a.dtype)
+            return krnl(subset, a,  queue=queue)
+
+        return f
+
+    subset_min = _make_subset_minmax_kernel("min")
+    subset_max = _make_subset_minmax_kernel("max")
+
+
+
+
+
+# vim: foldmethod=marker
diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py
new file mode 100644
index 00000000..0cc344d9
--- /dev/null
+++ b/pyopencl/clrandom.py
@@ -0,0 +1,259 @@
+import pyopencl as cl
+
+md5_code = """
+/*
+ **********************************************************************
+ ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. **
+ **                                                                  **
+ ** License to copy and use this software is granted provided that   **
+ ** it is identified as the "RSA Data Security, Inc. MD5 Message     **
+ ** Digest Algorithm" in all material mentioning or referencing this **
+ ** software or this function.                                       **
+ **                                                                  **
+ ** License is also granted to make and use derivative works         **
+ ** provided that such works are identified as "derived from the RSA **
+ ** Data Security, Inc. MD5 Message Digest Algorithm" in all         **
+ ** material mentioning or referencing the derived work.             **
+ **                                                                  **
+ ** RSA Data Security, Inc. makes no representations concerning      **
+ ** either the merchantability of this software or the suitability   **
+ ** of this software for any particular purpose.  It is provided "as **
+ ** is" without express or implied warranty of any kind.             **
+ **                                                                  **
+ ** These notices must be retained in any copies of any part of this **
+ ** documentation and/or software.                                   **
+ **********************************************************************
+ */
+
+/* F, G and H are basic MD5 functions: selection, majority, parity */
+#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
+#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | (~z)))
+
+/* ROTATE_LEFT rotates x left n bits */
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */
+/* Rotation is separate from addition to prevent recomputation */
+#define FF(a, b, c, d, x, s, ac) \
+  {(a) += F ((b), (c), (d)) + (x) + (ac); \
+   (a) = ROTATE_LEFT ((a), (s)); \
+   (a) += (b); \
+  }
+#define GG(a, b, c, d, x, s, ac) \
+  {(a) += G ((b), (c), (d)) + (x) + (ac); \
+   (a) = ROTATE_LEFT ((a), (s)); \
+   (a) += (b); \
+  }
+#define HH(a, b, c, d, x, s, ac) \
+  {(a) += H ((b), (c), (d)) + (x) + (ac); \
+   (a) = ROTATE_LEFT ((a), (s)); \
+   (a) += (b); \
+  }
+#define II(a, b, c, d, x, s, ac) \
+  {(a) += I ((b), (c), (d)) + (x) + (ac); \
+   (a) = ROTATE_LEFT ((a), (s)); \
+   (a) += (b); \
+  }
+
+#define X0 get_local_id(0)
+#define X1 get_local_id(1)
+#define X2 get_local_id(2)
+#define X3 get_group_id(0)
+#define X4 get_group_id(1)
+#define X5 get_group_id(2)
+#define X6 seed
+#define X7 i
+#define X8 n
+#define X9  get_local_size(0)
+#define X10 get_local_size(1)
+#define X11 get_local_size(2)
+#define X12 get_global_size(0)
+#define X13 get_global_size(1)
+#define X14 get_global_size(2)
+#define X15 0
+
+  unsigned int a = 0x67452301;
+  unsigned int b = 0xefcdab89;
+  unsigned int c = 0x98badcfe;
+  unsigned int d = 0x10325476;
+
+  /* Round 1 */
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+  FF ( a, b, c, d, X0 , S11, 3614090360); /* 1 */
+  FF ( d, a, b, c, X1 , S12, 3905402710); /* 2 */
+  FF ( c, d, a, b, X2 , S13,  606105819); /* 3 */
+  FF ( b, c, d, a, X3 , S14, 3250441966); /* 4 */
+  FF ( a, b, c, d, X4 , S11, 4118548399); /* 5 */
+  FF ( d, a, b, c, X5 , S12, 1200080426); /* 6 */
+  FF ( c, d, a, b, X6 , S13, 2821735955); /* 7 */
+  FF ( b, c, d, a, X7 , S14, 4249261313); /* 8 */
+  FF ( a, b, c, d, X8 , S11, 1770035416); /* 9 */
+  FF ( d, a, b, c, X9 , S12, 2336552879); /* 10 */
+  FF ( c, d, a, b, X10, S13, 4294925233); /* 11 */
+  FF ( b, c, d, a, X11, S14, 2304563134); /* 12 */
+  FF ( a, b, c, d, X12, S11, 1804603682); /* 13 */
+  FF ( d, a, b, c, X13, S12, 4254626195); /* 14 */
+  FF ( c, d, a, b, X14, S13, 2792965006); /* 15 */
+  FF ( b, c, d, a, X15, S14, 1236535329); /* 16 */
+
+  /* Round 2 */
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+  GG ( a, b, c, d, X1 , S21, 4129170786); /* 17 */
+  GG ( d, a, b, c, X6 , S22, 3225465664); /* 18 */
+  GG ( c, d, a, b, X11, S23,  643717713); /* 19 */
+  GG ( b, c, d, a, X0 , S24, 3921069994); /* 20 */
+  GG ( a, b, c, d, X5 , S21, 3593408605); /* 21 */
+  GG ( d, a, b, c, X10, S22,   38016083); /* 22 */
+  GG ( c, d, a, b, X15, S23, 3634488961); /* 23 */
+  GG ( b, c, d, a, X4 , S24, 3889429448); /* 24 */
+  GG ( a, b, c, d, X9 , S21,  568446438); /* 25 */
+  GG ( d, a, b, c, X14, S22, 3275163606); /* 26 */
+  GG ( c, d, a, b, X3 , S23, 4107603335); /* 27 */
+  GG ( b, c, d, a, X8 , S24, 1163531501); /* 28 */
+  GG ( a, b, c, d, X13, S21, 2850285829); /* 29 */
+  GG ( d, a, b, c, X2 , S22, 4243563512); /* 30 */
+  GG ( c, d, a, b, X7 , S23, 1735328473); /* 31 */
+  GG ( b, c, d, a, X12, S24, 2368359562); /* 32 */
+
+  /* Round 3 */
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+  HH ( a, b, c, d, X5 , S31, 4294588738); /* 33 */
+  HH ( d, a, b, c, X8 , S32, 2272392833); /* 34 */
+  HH ( c, d, a, b, X11, S33, 1839030562); /* 35 */
+  HH ( b, c, d, a, X14, S34, 4259657740); /* 36 */
+  HH ( a, b, c, d, X1 , S31, 2763975236); /* 37 */
+  HH ( d, a, b, c, X4 , S32, 1272893353); /* 38 */
+  HH ( c, d, a, b, X7 , S33, 4139469664); /* 39 */
+  HH ( b, c, d, a, X10, S34, 3200236656); /* 40 */
+  HH ( a, b, c, d, X13, S31,  681279174); /* 41 */
+  HH ( d, a, b, c, X0 , S32, 3936430074); /* 42 */
+  HH ( c, d, a, b, X3 , S33, 3572445317); /* 43 */
+  HH ( b, c, d, a, X6 , S34,   76029189); /* 44 */
+  HH ( a, b, c, d, X9 , S31, 3654602809); /* 45 */
+  HH ( d, a, b, c, X12, S32, 3873151461); /* 46 */
+  HH ( c, d, a, b, X15, S33,  530742520); /* 47 */
+  HH ( b, c, d, a, X2 , S34, 3299628645); /* 48 */
+
+  /* Round 4 */
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+  II ( a, b, c, d, X0 , S41, 4096336452); /* 49 */
+  II ( d, a, b, c, X7 , S42, 1126891415); /* 50 */
+  II ( c, d, a, b, X14, S43, 2878612391); /* 51 */
+  II ( b, c, d, a, X5 , S44, 4237533241); /* 52 */
+  II ( a, b, c, d, X12, S41, 1700485571); /* 53 */
+  II ( d, a, b, c, X3 , S42, 2399980690); /* 54 */
+  II ( c, d, a, b, X10, S43, 4293915773); /* 55 */
+  II ( b, c, d, a, X1 , S44, 2240044497); /* 56 */
+  II ( a, b, c, d, X8 , S41, 1873313359); /* 57 */
+  II ( d, a, b, c, X15, S42, 4264355552); /* 58 */
+  II ( c, d, a, b, X6 , S43, 2734768916); /* 59 */
+  II ( b, c, d, a, X13, S44, 1309151649); /* 60 */
+  II ( a, b, c, d, X4 , S41, 4149444226); /* 61 */
+  II ( d, a, b, c, X11, S42, 3174756917); /* 62 */
+  II ( c, d, a, b, X2 , S43,  718787259); /* 63 */
+  II ( b, c, d, a, X9 , S44, 3951481745); /* 64 */
+
+  a += 0x67452301;
+  b += 0xefcdab89;
+  c += 0x98badcfe;
+  d += 0x10325476;
+"""
+
+import numpy
+
+def rand(context, queue, shape, dtype):
+    from pyopencl.array import Array
+    from pyopencl.elementwise import get_elwise_kernel
+
+    result = Array(context, shape, dtype, queue=queue)
+
+    if dtype == numpy.float32:
+        func = get_elwise_kernel(context,
+            "float *dest, unsigned int seed",
+            md5_code + """
+            #define POW_2_M32 (1/4294967296.0f)
+            dest[i] = a*POW_2_M32;
+            if ((i += gsize) < n)
+                dest[i] = b*POW_2_M32;
+            if ((i += gsize) < n)
+                dest[i] = c*POW_2_M32;
+            if ((i += gsize) < n)
+                dest[i] = d*POW_2_M32;
+            """,
+            "md5_rng_float")
+    elif dtype == numpy.float64:
+        func = get_elwise_kernel(context,
+            "double *dest, unsigned int seed",
+            md5_code + """
+            #define POW_2_M32 (1/4294967296.0)
+            #define POW_2_M64 (1/18446744073709551616.)
+
+            dest[i] = a*POW_2_M32 + b*POW_2_M64;
+
+            if ((i += gsize) < n)
+            {
+              dest[i] = c*POW_2_M32 + d*POW_2_M64;
+            }
+            """,
+            "md5_rng_float")
+    elif dtype in [numpy.int32, numpy.uint32]:
+        func = get_elwise_kernel(context,
+            "unsigned int *dest, unsigned int seed",
+            md5_code + """
+            dest[i] = a;
+            if ((i += gsize) < n)
+                dest[i] = b;
+            if ((i += gsize) < n)
+                dest[i] = c;
+            if ((i += gsize) < n)
+                dest[i] = d;
+            """,
+            "md5_rng_int")
+    else:
+        raise NotImplementedError
+
+    func(queue, result._global_size, result._local_size,
+            result.data, numpy.random.randint(2**31-1),
+            result.size)
+
+    return result
+
+
+if __name__ == "__main__":
+    import sys
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(ctx)
+
+    if "generate" in sys.argv[1:]:
+        N = 256
+        print N, "MB"
+        r = rand(ctx, queue, (N*2**18,), numpy.uint32)
+        print "generated"
+        r.get().tofile("random.dat")
+        print "written"
+
+    else:
+        from pylab import plot, show, subplot
+        N = 250
+        r1 = rand(ctx, queue, (N,), numpy.uint32)
+        r2 = rand(ctx, queue, (N,), numpy.int32)
+        r3 = rand(ctx, queue, (N,), numpy.float32)
+
+        subplot(131); plot( r1.get(),"x-")
+        subplot(132); plot( r2.get(),"x-")
+        subplot(133); plot( r3.get(),"x-")
+        show()
diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
new file mode 100644
index 00000000..1a3f25ec
--- /dev/null
+++ b/pyopencl/elementwise.py
@@ -0,0 +1,466 @@
+"""Elementwise functionality."""
+
+from __future__ import division
+
+__copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+
+
+
+from pyopencl.tools import context_dependent_memoize
+import numpy
+import pyopencl as cl
+from pyopencl.tools import dtype_to_ctype, VectorArg, ScalarArg
+
+
+
+
+def get_elwise_program(context, arguments, operation,
+        name="elwise_kernel", keep=False, options=[],
+        preamble="", loop_prep="", after_loop=""):
+    from pyopencl import Program
+    return Program(context, """
+        %(preamble)s
+
+        __kernel void %(name)s(%(arguments)s)
+        {
+          unsigned lid = get_local_id(0);
+          unsigned gsize = get_global_size(0);
+          unsigned work_item_start = get_local_size(0)*get_group_id(0);
+          unsigned i;
+
+          %(loop_prep)s;
+
+          for (i = work_item_start + lid; i < n; i += gsize)
+          {
+            %(operation)s;
+          }
+
+          %(after_loop)s;
+        }
+        """ % {
+            "arguments": ", ".join(arg.declarator() for arg in arguments),
+            "operation": operation,
+            "name": name,
+            "preamble": preamble,
+            "loop_prep": loop_prep,
+            "after_loop": after_loop,
+            }).build(options=" ".join(options))
+
+
+
+
+def get_elwise_kernel_and_types(context, arguments, operation,
+        name="elwise_kernel", keep=False, options=[], preamble="", **kwargs):
+    if isinstance(arguments, str):
+        from pyopencl.tools import parse_c_arg
+        parsed_args = [parse_c_arg(arg) for arg in arguments.split(",")]
+    else:
+        parsed_args = arguments
+
+    for arg in parsed_args:
+        if numpy.float64  == arg.dtype:
+            preamble += "\n\n#pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
+            break
+
+    parsed_args.append(ScalarArg(numpy.uintp, "n"))
+
+    prg = get_elwise_program(context, parsed_args, operation, name,
+            keep, options, preamble, **kwargs)
+
+    scalar_arg_dtypes = []
+    for arg in parsed_args:
+        if isinstance(arg, ScalarArg):
+            scalar_arg_dtypes.append(arg.dtype)
+        else:
+            scalar_arg_dtypes.append(None)
+
+    kernel = getattr(prg, name)
+    kernel.set_scalar_arg_dtypes(scalar_arg_dtypes)
+
+    return kernel, parsed_args
+
+
+
+
+def get_elwise_kernel(context, arguments, operation,
+        name="elwise_kernel", options=[], **kwargs):
+    """Return a L{pyopencl.Kernel} that performs the same scalar operation
+    on one or several vectors.
+    """
+    func, arguments = get_elwise_kernel_and_types(context,
+            arguments, operation, name, options, **kwargs)
+
+    return func
+
+
+
+
+class ElementwiseKernel:
+    def __init__(self, context, arguments, operation,
+            name="elwise_kernel", options=[], **kwargs):
+        self.kernel, self.arguments = get_elwise_kernel_and_types(context,
+            arguments, operation, name, options, **kwargs)
+
+        if not [i for i, arg in enumerate(self.arguments)
+                if isinstance(arg, VectorArg)]:
+            raise RuntimeError(
+                "ElementwiseKernel can only be used with "
+                "functions that have at least one "
+                "vector argument")
+
+    def __call__(self, *args, **kwargs):
+        vectors = []
+
+        invocation_args = []
+        for arg, arg_descr in zip(args, self.arguments):
+            if isinstance(arg_descr, VectorArg):
+                vectors.append(arg)
+                invocation_args.append(arg.data)
+            else:
+                invocation_args.append(arg)
+
+        queue = kwargs.pop("queue", None)
+        if kwargs:
+            raise TypeError("too many/unknown keyword arguments")
+
+        repr_vec = vectors[0]
+        if queue is None:
+            queue = repr_vec.queue
+        invocation_args.append(repr_vec.mem_size)
+
+        self.kernel.set_args(*invocation_args)
+        return cl.enqueue_nd_range_kernel(queue, self.kernel,
+                repr_vec._global_size, repr_vec._local_size)
+
+
+
+
+@context_dependent_memoize
+def get_take_kernel(context, dtype, idx_dtype, vec_count=1):
+    ctx = {
+            "idx_tp": dtype_to_ctype(idx_dtype),
+            "tp": dtype_to_ctype(dtype),
+            }
+
+    args = ([VectorArg(idx_dtype, "idx")]
+            + [VectorArg(dtype, "src"+str(i))for i in range(vec_count)] 
+            + [VectorArg(dtype, "dest"+str(i))for i in range(vec_count)])
+    body = (
+            ("%(idx_tp)s src_idx = idx[i];\n" % ctx)
+            + "\n".join(
+            "dest%d[i] = src%d[src_idx];" % (i, i)
+            for i in range(vec_count)))
+
+    return get_elwise_kernel(context, args, body, "take")
+
+
+
+
+@context_dependent_memoize
+def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, vec_count=1):
+    ctx = {
+            "idx_tp": dtype_to_ctype(idx_dtype),
+            "tp": dtype_to_ctype(dtype),
+            }
+
+    args = [
+            VectorArg(idx_dtype, "gmem_dest_idx"),
+            VectorArg(idx_dtype, "gmem_src_idx"),
+            ] + [
+            VectorArg(dtype, "src%d" % i)
+                for i in range(vec_count)
+            ] + [
+            VectorArg(dtype, "dest%d" % i)
+                for i in range(vec_count)
+            ] + [
+            ScalarArg(idx_dtype, "offset%d" % i)
+                for i in range(vec_count) if with_offsets
+            ]
+
+    if with_offsets:
+        def get_copy_insn(i):
+            return ("dest%d[dest_idx] = "
+                    "src%d[src_idx+offset%d];"
+                    % (i, i, i))
+    else:
+        def get_copy_insn(i):
+            return ("dest%d[dest_idx] = "
+                    "src%d[src_idx];" % (i, i))
+
+    body = (("%(idx_tp)s src_idx = gmem_src_idx[i];\n"
+                "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx)
+            + "\n".join(get_copy_insn(i) for i in range(vec_count)))
+
+    return get_elwise_kernel(context, args, body, "take_put")
+
+
+
+
+@context_dependent_memoize
+def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
+    ctx = {
+            "idx_tp": dtype_to_ctype(idx_dtype),
+            "tp": dtype_to_ctype(dtype),
+            }
+
+    args = [
+            VectorArg(idx_dtype, "gmem_dest_idx"),
+            ] + [
+            VectorArg(dtype, "dest%d" % i)
+                for i in range(vec_count)
+            ] + [
+            VectorArg(dtype, "src%d" % i)
+                for i in range(vec_count)
+            ]
+
+    body = (
+            "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx
+            + "\n".join("dest%d[dest_idx] = src%d[i];" % (i, i)
+                for i in range(vec_count)))
+
+    return get_elwise_kernel(args, body, "put")
+
+
+
+
+@context_dependent_memoize
+def get_copy_kernel(context, dtype_dest, dtype_src):
+    return get_elwise_kernel(context,
+            "%(tp_dest)s *dest, %(tp_src)s *src" % {
+                "tp_dest": dtype_to_ctype(dtype_dest),
+                "tp_src": dtype_to_ctype(dtype_src),
+                },
+            "dest[i] = src[i]",
+            "copy")
+
+
+
+@context_dependent_memoize
+def get_linear_combination_kernel(summand_descriptors,
+        dtype_z):
+    from pycuda.tools import dtype_to_ctype
+    from pycuda.elementwise import \
+            VectorArg, ScalarArg, get_elwise_module
+
+    args = []
+    preamble = [ "#include <pycuda-helpers.hpp>\n\n" ]
+    loop_prep = []
+    summands = []
+    tex_names = []
+
+    for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in \
+            enumerate(summand_descriptors):
+        if is_gpu_scalar:
+            preamble.append(
+                    "texture <%s, 1, cudaReadModeElementType> tex_a%d;"
+                    % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True), i))
+            args.append(VectorArg(vector_dtype, "x%d" % i))
+            tex_names.append("tex_a%d" % i)
+            loop_prep.append(
+                    "%s a%d = fp_tex1Dfetch(tex_a%d, 0)"
+                    % (dtype_to_ctype(scalar_dtype), i, i))
+        else:
+            args.append(ScalarArg(scalar_dtype, "a%d" % i))
+            args.append(VectorArg(vector_dtype, "x%d" % i))
+
+        summands.append("a%d*x%d[i]" % (i, i))
+
+    args.append(VectorArg(dtype_z, "z"))
+    args.append(ScalarArg(numpy.uintp, "n"))
+
+    mod = get_elwise_module(args,
+            "z[i] = " + " + ".join(summands),
+            "linear_combination",
+            preamble="\n".join(preamble),
+            loop_prep=";\n".join(loop_prep))
+
+    func = mod.get_function("linear_combination")
+    tex_src = [mod.get_texref(tn) for tn in tex_names]
+    func.prepare("".join(arg.struct_char for arg in args),
+            (1,1,1), texrefs=tex_src)
+
+    return func, tex_src
+
+
+
+
+@context_dependent_memoize
+def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z):
+    return get_elwise_kernel(context,
+            "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z" % {
+                "tp_x": dtype_to_ctype(dtype_x),
+                "tp_y": dtype_to_ctype(dtype_y),
+                "tp_z": dtype_to_ctype(dtype_z),
+                },
+            "z[i] = a*x[i] + b*y[i]",
+            "axpbyz")
+
+@context_dependent_memoize
+def get_axpbz_kernel(context, dtype):
+    return get_elwise_kernel(context,
+            "%(tp)s a, %(tp)s *x,%(tp)s b, %(tp)s *z" % {
+                "tp": dtype_to_ctype(dtype)},
+            "z[i] = a * x[i] + b",
+            "axpb")
+
+@context_dependent_memoize
+def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z):
+    return get_elwise_kernel(context,
+            "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % {
+                "tp_x": dtype_to_ctype(dtype_x),
+                "tp_y": dtype_to_ctype(dtype_y),
+                "tp_z": dtype_to_ctype(dtype_z),
+                },
+            "z[i] = x[i] * y[i]",
+            "multiply")
+
+@context_dependent_memoize
+def get_divide_kernel(context, dtype_x, dtype_y, dtype_z):
+    return get_elwise_kernel(context,
+            "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % {
+                "tp_x": dtype_to_ctype(dtype_x),
+                "tp_y": dtype_to_ctype(dtype_y),
+                "tp_z": dtype_to_ctype(dtype_z),
+                },
+            "z[i] = x[i] / y[i]",
+            "divide")
+
+@context_dependent_memoize
+def get_rdivide_elwise_kernel(context, dtype):
+    return get_elwise_kernel(context,
+            "%(tp)s *x, %(tp)s y, %(tp)s *z" % {
+                "tp": dtype_to_ctype(dtype),
+                },
+            "z[i] = y / x[i]",
+            "divide_r")
+
+@context_dependent_memoize
+def get_fill_kernel(context, dtype):
+    return get_elwise_kernel(context,
+            "%(tp)s a, %(tp)s *z" % {
+                "tp": dtype_to_ctype(dtype),
+                },
+            "z[i] = a",
+            "fill")
+
+@context_dependent_memoize
+def get_reverse_kernel(context, dtype):
+    return get_elwise_kernel(context,
+            "%(tp)s *y, %(tp)s *z" % {
+                "tp": dtype_to_ctype(dtype),
+                },
+            "z[i] = y[n-1-i]",
+            "reverse")
+
+@context_dependent_memoize
+def get_arange_kernel(context, dtype):
+    return get_elwise_kernel(context,
+            "%(tp)s *z, %(tp)s start, %(tp)s step" % {
+                "tp": dtype_to_ctype(dtype),
+                },
+            "z[i] = start + i*step",
+            "arange")
+
+
+@context_dependent_memoize
+def get_pow_kernel(context, dtype):
+    return get_elwise_kernel(context,
+            "%(tp)s value, %(tp)s *y, %(tp)s *z" % {
+                "tp": dtype_to_ctype(dtype),
+                },
+            "z[i] = pow(y[i], value)",
+            "pow_method")
+
+@context_dependent_memoize
+def get_pow_array_kernel(context, dtype_x, dtype_y, dtype_z):
+    return get_elwise_kernel(context,
+            "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % {
+                "tp_x": dtype_to_ctype(dtype_x),
+                "tp_y": dtype_to_ctype(dtype_y),
+                "tp_z": dtype_to_ctype(dtype_z),
+                },
+            "z[i] = pow(x[i], y[i])",
+            "pow_method")
+
+@context_dependent_memoize
+def get_fmod_kernel(context):
+    return get_elwise_kernel(context,
+            "float *arg, float *mod, float *z",
+            "z[i] = fmod(arg[i], mod[i])",
+            "fmod_kernel")
+
+@context_dependent_memoize
+def get_modf_kernel(context):
+    return get_elwise_kernel(context,
+            "float *x, float *intpart ,float *fracpart",
+            "fracpart[i] = modf(x[i], &intpart[i])",
+            "modf_kernel")
+
+@context_dependent_memoize
+def get_frexp_kernel(context):
+    return get_elwise_kernel(context,
+            "float *x, float *significand, float *exponent",
+            """
+                int expt = 0;
+                significand[i] = frexp(x[i], &expt);
+                exponent[i] = expt;
+            """,
+            "frexp_kernel")
+
+@context_dependent_memoize
+def get_ldexp_kernel(context):
+    return get_elwise_kernel(context,
+            "float *sig, float *expt, float *z",
+            "z[i] = ldexp(sig[i], int(expt[i]))",
+            "ldexp_kernel")
+
+@context_dependent_memoize
+def get_unary_func_kernel(context, func_name, in_dtype, out_dtype=None):
+    if out_dtype is None:
+        out_dtype = in_dtype
+
+    return get_elwise_kernel(context,
+            "%(tp_in)s *y, %(tp_out)s *z" % {
+                "tp_in": dtype_to_ctype(in_dtype),
+                "tp_out": dtype_to_ctype(out_dtype),
+                },
+            "z[i] = %s(y[i])" % func_name,
+            "%s_kernel" % func_name)
+
+
+
+
+@context_dependent_memoize
+def get_if_positive_kernel(context, crit_dtype, dtype):
+    return get_elwise_kernel(context, [
+            VectorArg(crit_dtype, "crit"),
+            VectorArg(dtype, "then_"),
+            VectorArg(dtype, "else_"),
+            VectorArg(dtype, "result"),
+            ],
+            "result[i] = crit[i] > 0 ? then_[i] : else_[i]",
+            "if_positive")
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
new file mode 100644
index 00000000..aca8c4c9
--- /dev/null
+++ b/pyopencl/tools.py
@@ -0,0 +1,173 @@
+"""H."""
+
+from __future__ import division
+
+__copyright__ = "Copyright (C) 2010 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+
+
+import numpy
+from decorator import decorator
+
+
+
+
+@decorator
+def context_dependent_memoize(func, context, *args):
+    """Provides memoization for things that get created inside
+    a context, i.e. mainly programs and kernels. Assumes that
+    the first argument of the decorated function is the context.
+    """
+    dicname = "_ctx_memoize_dic_%s_%x" % (
+            func.__name__, hash(func))
+
+    try:
+        return getattr(context, dicname)[args]
+    except AttributeError:
+        result = func(context, *args)
+        setattr(context, dicname, {args: result})
+        return result
+    except KeyError:
+        result = func(context, *args)
+        getattr(context,dicname)[args] = result
+        return result
+
+
+
+# {{{ C code generation helpers -----------------------------------------------
+def dtype_to_ctype(dtype):
+    if dtype is None:
+        raise ValueError("dtype may not be None")
+
+    dtype = numpy.dtype(dtype)
+    if dtype == numpy.int64:
+        return "long"
+    elif dtype == numpy.uint64:
+        return "unsigned long"
+    elif dtype == numpy.int32:
+        return "int"
+    elif dtype == numpy.uint32:
+        return "unsigned int"
+    elif dtype == numpy.int16:
+        return "short int"
+    elif dtype == numpy.uint16:
+        return "short unsigned int"
+    elif dtype == numpy.int8:
+        return "signed char"
+    elif dtype == numpy.uint8:
+        return "unsigned char"
+    elif dtype == numpy.bool:
+        return "bool"
+    elif dtype == numpy.float32:
+        return "float"
+    elif dtype == numpy.float64:
+        return "double"
+    elif dtype == numpy.complex64:
+        return "complex float"
+    elif dtype == numpy.complex128:
+        return "complex double"
+    else:
+        raise ValueError, "unable to map dtype '%s'" % dtype
+
+# }}}
+
+
+
+
+# {{{ C argument lists --------------------------------------------------------
+class Argument:
+    def __init__(self, dtype, name):
+        self.dtype = numpy.dtype(dtype)
+        self.name = name
+
+    def __repr__(self):
+        return "%s(%r, %s)" % (
+                self.__class__.__name__,
+                self.name,
+                self.dtype)
+
+class VectorArg(Argument):
+    def declarator(self):
+        return "__global %s *%s" % (dtype_to_ctype(self.dtype), self.name)
+
+class ScalarArg(Argument):
+    def declarator(self):
+        return "%s %s" % (dtype_to_ctype(self.dtype), self.name)
+
+
+
+
+
+def parse_c_arg(c_arg):
+    c_arg = (c_arg
+            .replace("const", "")
+            .replace("volatile", "")
+            .replace("__global", "")
+            .replace("__local", "")
+            .replace("__constant", ""))
+
+    # process and remove declarator
+    import re
+    decl_re = re.compile(r"(\**)\s*([_a-zA-Z0-9]+)(\s*\[[ 0-9]*\])*\s*$")
+    decl_match = decl_re.search(c_arg)
+
+    if decl_match is None:
+        raise ValueError("couldn't parse C declarator '%s'" % c_arg)
+
+    name = decl_match.group(2)
+
+    if decl_match.group(1) or decl_match.group(3) is not None:
+        arg_class = VectorArg
+    else:
+        arg_class = ScalarArg
+
+    tp = c_arg[:decl_match.start()]
+    tp = " ".join(tp.split())
+
+    if tp == "float": dtype = numpy.float32
+    elif tp == "double": dtype = numpy.float64
+    elif tp == "pycuda::complex<float>": dtype = numpy.complex64
+    elif tp == "pycuda::complex<double>": dtype = numpy.complex128
+    elif tp in ["int", "signed int"]: dtype = numpy.int32
+    elif tp in ["unsigned", "unsigned int"]: dtype = numpy.uint32
+    elif tp in ["long", "long int"]: dtype = numpy.int64
+    elif tp in ["unsigned long", "unsigned long int"]:
+        dtype = numpy.uint64
+    elif tp in ["short", "short int"]: dtype = numpy.int16
+    elif tp in ["unsigned short", "unsigned short int"]: dtype = numpy.uint16
+    elif tp in ["char"]: dtype = numpy.int8
+    elif tp in ["unsigned char"]: dtype = numpy.uint8
+    elif tp in ["bool"]: dtype = numpy.bool
+    else: raise ValueError, "unknown type '%s'" % tp
+
+    return arg_class(dtype, name)
+
+# }}}
+
+
+
+
+# vim: foldmethod=marker
diff --git a/setup.py b/setup.py
index 0e488cd8..0088d0bd 100644
--- a/setup.py
+++ b/setup.py
@@ -128,7 +128,8 @@ def main():
 
             install_requires=[
                 "pytools>=7",
-                "py>=1.0.2"
+                "py>=1.0.2",
+                "decorator>=3.2.0",
                 ],
 
             ext_package="pyopencl",
diff --git a/test/test_array.py b/test/test_array.py
new file mode 100644
index 00000000..a1f9d263
--- /dev/null
+++ b/test/test_array.py
@@ -0,0 +1,593 @@
+#! /usr/bin/env python
+import numpy
+import numpy.linalg as la
+import sys
+import pytools.test
+
+
+
+
+def have_cl():
+    try:
+        import pyopencl
+        return True
+    except:
+        return False
+
+if have_cl():
+    import pyopencl.array as cl_array
+    import pyopencl as cl
+
+
+
+
+def has_double_support(dev):
+    for ext in dev.extensions.split(" "):
+        if ext == "cl_khr_fp64":
+            return True
+    return False
+
+
+
+
+@pytools.test.mark_test.opencl
+def test_pow_array(ctx_getter):
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    a = numpy.array([1,2,3,4,5]).astype(numpy.float32)
+    a_gpu = cl_array.to_device(context, queue, a)
+
+    result = pow(a_gpu,a_gpu).get()
+    assert (numpy.abs(a**a - result) < 1e-3).all()
+
+    result = (a_gpu**a_gpu).get()
+    assert (numpy.abs(pow(a, a) - result) < 1e-3).all()
+
+
+
+
+@pytools.test.mark_test.opencl
+def test_pow_number(ctx_getter):
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a_gpu = cl_array.to_device(context, queue, a)
+
+    result = pow(a_gpu, 2).get()
+    assert (numpy.abs(a**2 - result) < 1e-3).all()
+
+
+
+@pytools.test.mark_test.opencl
+def test_abs(ctx_getter):
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    a = -cl_array.arange(context, queue, 111, dtype=numpy.float32)
+    res = a.get()
+
+    for i in range(111):
+        assert res[i] <= 0
+
+    a = abs(a)
+
+    res = a.get()
+
+    for i in range (111):
+        assert abs(res[i]) >= 0
+        assert res[i] == i
+
+
+@pytools.test.mark_test.opencl
+def test_len(ctx_getter):
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a_cpu = cl_array.to_device(context, queue, a)
+    assert len(a_cpu) == 10
+
+
+
+
+@pytools.test.mark_test.opencl
+def test_multiply(ctx_getter):
+    """Test the muliplication of an array with a scalar. """
+
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+
+    for sz in [10, 50000]:
+        for dtype, scalars in [
+            (numpy.float32, [2]),
+            #(numpy.complex64, [2, 2j])
+            ]:
+            for scalar in scalars:
+                a = numpy.arange(sz).astype(dtype)
+                a_gpu = cl_array.to_device(context, queue, a)
+                a_doubled = (scalar * a_gpu).get()
+
+                assert (a * scalar == a_doubled).all()
+
+@pytools.test.mark_test.opencl
+def test_multiply_array(ctx_getter):
+    """Test the multiplication of two arrays."""
+
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+
+    a_gpu = cl_array.to_device(context, queue, a)
+    b_gpu = cl_array.to_device(context, queue, a)
+
+    a_squared = (b_gpu*a_gpu).get()
+
+    assert (a*a == a_squared).all()
+
+
+
+
+@pytools.test.mark_test.opencl
+def test_addition_array(ctx_getter):
+    """Test the addition of two arrays."""
+
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a_gpu = cl_array.to_device(context, queue, a)
+    a_added = (a_gpu+a_gpu).get()
+
+    assert (a+a == a_added).all()
+
+
+
+
+@pytools.test.mark_test.opencl
+def test_addition_scalar(ctx_getter):
+    """Test the addition of an array and a scalar."""
+
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a_gpu = cl_array.to_device(context, queue, a)
+    a_added = (7+a_gpu).get()
+
+    assert (7+a == a_added).all()
+
+
+
+
+@pytools.test.mark_test.opencl
+def test_substract_array(ctx_getter):
+    """Test the substraction of two arrays."""
+    #test data
+    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    b = numpy.array([10,20,30,40,50,60,70,80,90,100]).astype(numpy.float32)
+
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    a_gpu = cl_array.to_device(context, queue, a)
+    b_gpu = cl_array.to_device(context, queue, b)
+
+    result = (a_gpu-b_gpu).get()
+    assert (a-b == result).all()
+
+    result = (b_gpu-a_gpu).get()
+    assert (b-a == result).all()
+
+
+
+
+@pytools.test.mark_test.opencl
+def test_substract_scalar(ctx_getter):
+    """Test the substraction of an array and a scalar."""
+
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    #test data
+    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+
+    #convert a to a gpu object
+    a_gpu = cl_array.to_device(context, queue, a)
+
+    result = (a_gpu-7).get()
+    assert (a-7 == result).all()
+
+    result = (7-a_gpu).get()
+    assert (7-a == result).all()
+
+
+
+
+@pytools.test.mark_test.opencl
+def test_divide_scalar(ctx_getter):
+    """Test the division of an array and a scalar."""
+
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a_gpu = cl_array.to_device(context, queue, a)
+
+    result = (a_gpu/2).get()
+    assert (a/2 == result).all()
+
+    result = (2/a_gpu).get()
+    assert (2/a == result).all()
+
+
+
+
+@pytools.test.mark_test.opencl
+def test_divide_array(ctx_getter):
+    """Test the division of an array and a scalar. """
+
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    #test data
+    a = numpy.array([10,20,30,40,50,60,70,80,90,100]).astype(numpy.float32)
+    b = numpy.array([10,10,10,10,10,10,10,10,10,10]).astype(numpy.float32)
+
+    a_gpu = cl_array.to_device(context, queue, a)
+    b_gpu = cl_array.to_device(context, queue, b)
+
+    a_divide = (a_gpu/b_gpu).get()
+    assert (numpy.abs(a/b - a_divide) < 1e-3).all()
+
+    a_divide = (b_gpu/a_gpu).get()
+    assert (numpy.abs(b/a - a_divide) < 1e-3).all()
+
+
+
+
+@pytools.test.mark_test.opencl
+def test_random(ctx_getter):
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.clrandom import rand as clrand
+
+    if has_double_support(context.devices[0]):
+        dtypes = [numpy.float32, numpy.float64]
+    else:
+        dtypes = [numpy.float32]
+
+    for dtype in dtypes:
+        a = clrand(context, queue, (10, 100), dtype=dtype).get()
+
+        assert (0 <= a).all()
+        assert (a < 1).all()
+
+
+
+
+@pytools.test.mark_test.opencl
+def test_nan_arithmetic(ctx_getter):
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    def make_nan_contaminated_vector(size):
+        shape = (size,)
+        a = numpy.random.randn(*shape).astype(numpy.float32)
+        #for i in range(0, shape[0], 3):
+            #a[i] = float('nan')
+        from random import randrange
+        for i in range(size//10):
+            a[randrange(0, size)] = float('nan')
+        return a
+
+    size = 1 << 20
+
+    a = make_nan_contaminated_vector(size)
+    a_gpu = cl_array.to_device(context, queue, a)
+    b = make_nan_contaminated_vector(size)
+    b_gpu = cl_array.to_device(context, queue, b)
+
+    ab = a*b
+    ab_gpu = (a_gpu*b_gpu).get()
+
+    for i in range(size):
+        assert numpy.isnan(ab[i]) == numpy.isnan(ab_gpu[i])
+
+
+
+
+@pytools.test.mark_test.opencl
+def test_elwise_kernel(ctx_getter):
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.clrandom import rand as clrand
+
+    a_gpu = clrand(context, queue, (50,), numpy.float32)
+    b_gpu = clrand(context, queue, (50,), numpy.float32)
+
+    from pyopencl.elementwise import ElementwiseKernel
+    lin_comb = ElementwiseKernel(context,
+            "float a, float *x, float b, float *y, float *z",
+            "z[i] = a*x[i] + b*y[i]",
+            "linear_combination")
+
+    c_gpu = cl_array.empty_like(a_gpu)
+    lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
+
+    assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
+
+
+
+
+@pytools.test.mark_test.opencl
+def test_take(ctx_getter):
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    idx = cl_array.arange(context, queue, 0, 200000, 2, dtype=numpy.uint32)
+    a = cl_array.arange(context, queue, 0, 600000, 3, dtype=numpy.float32)
+    result = cl_array.take(a, idx)
+    assert ((3*idx).get() == result.get()).all()
+
+
+
+
+@pytools.test.mark_test.opencl
+def test_arange(ctx_getter):
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    n = 5000
+    a = cl_array.arange(context, queue, n, dtype=numpy.float32)
+    assert (numpy.arange(n, dtype=numpy.float32) == a.get()).all()
+
+
+
+
+@pytools.test.mark_test.opencl
+def test_reverse(ctx_getter):
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    n = 5000
+    a = numpy.arange(n).astype(numpy.float32)
+    a_gpu = cl_array.to_device(context, queue, a)
+
+    a_gpu = a_gpu.reverse()
+
+    assert (a[::-1] == a_gpu.get()).all()
+
+if False:
+    # not yet
+
+    @pytools.test.mark_test.opencl
+    def test_sum(ctx_getter):
+        context = ctx_getter()
+        queue = cl.CommandQueue(context)
+
+        from pyopencl.clrandom import rand as clrand
+
+        a_gpu = clrand(context, queue, (200000,))
+        a = a_gpu.get()
+
+        sum_a = numpy.sum(a)
+
+        from pycuda.reduction import get_sum_kernel
+        sum_a_gpu = cl_array.sum(a_gpu).get()
+
+        assert abs(sum_a_gpu-sum_a)/abs(sum_a) < 1e-4
+
+    @pytools.test.mark_test.opencl
+    def test_minmax(ctx_getter):
+        context = ctx_getter()
+        queue = cl.CommandQueue(context)
+
+        from pyopencl.clrandom import rand as clrand
+
+        if has_double_support():
+            dtypes = [numpy.float64, numpy.float32, numpy.int32]
+        else:
+            dtypes = [numpy.float32, numpy.int32]
+
+        for what in ["min", "max"]:
+            for dtype in dtypes:
+                a_gpu = clrand(context, queue, (200000,), dtype)
+                a = a_gpu.get()
+
+                op_a = getattr(numpy, what)(a)
+                op_a_gpu = getattr(cl_array, what)(a_gpu).get()
+
+                assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)
+
+    @pytools.test.mark_test.opencl
+    def test_subset_minmax(ctx_getter):
+        context = ctx_getter()
+        queue = cl.CommandQueue(context)
+
+        from pyopencl.clrandom import rand as clrand
+
+        l_a = 200000
+        gran = 5
+        l_m = l_a - l_a // gran + 1
+
+        if has_double_support():
+            dtypes = [numpy.float64, numpy.float32, numpy.int32]
+        else:
+            dtypes = [numpy.float32, numpy.int32]
+
+        for dtype in dtypes:
+            a_gpu = clrand(context, queue, (l_a,), dtype)
+            a = a_gpu.get()
+
+            meaningful_indices_gpu = cl_array.zeros(l_m, dtype=numpy.int32)
+            meaningful_indices = meaningful_indices_gpu.get()
+            j = 0
+            for i in range(len(meaningful_indices)):
+                meaningful_indices[i] = j
+                j = j + 1
+                if j % gran == 0:
+                    j = j + 1
+
+            meaningful_indices_gpu = cl_array.to_device(meaningful_indices)
+            b = a[meaningful_indices]
+
+            min_a = numpy.min(b)
+            min_a_gpu = cl_array.subset_min(meaningful_indices_gpu, a_gpu).get()
+
+            assert min_a_gpu == min_a
+
+    @pytools.test.mark_test.opencl
+    def test_dot(ctx_getter):
+        from pyopencl.clrandom import rand as clrand
+        a_gpu = clrand(context, queue, (200000,))
+        a = a_gpu.get()
+        b_gpu = clrand(context, queue, (200000,))
+        b = b_gpu.get()
+
+        dot_ab = numpy.dot(a, b)
+
+        dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()
+
+        assert abs(dot_ab_gpu-dot_ab)/abs(dot_ab) < 1e-4
+
+    @pytools.test.mark_test.opencl
+    def test_slice(ctx_getter):
+        from pyopencl.clrandom import rand as clrand
+
+        l = 20000
+        a_gpu = clrand(context, queue, (l,))
+        a = a_gpu.get()
+
+        from random import randrange
+        for i in range(200):
+            start = randrange(l)
+            end = randrange(start, l)
+
+            a_gpu_slice = a_gpu[start:end]
+            a_slice = a[start:end]
+
+            assert la.norm(a_gpu_slice.get()-a_slice) == 0
+
+@pytools.test.mark_test.opencl
+def test_if_positive(ctx_getter):
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.clrandom import rand as clrand
+
+    l = 20000
+    a_gpu = clrand(context, queue, (l,), numpy.float32)
+    b_gpu = clrand(context, queue, (l,), numpy.float32)
+    a = a_gpu.get()
+    b = b_gpu.get()
+
+    max_a_b_gpu = cl_array.maximum(a_gpu, b_gpu)
+    min_a_b_gpu = cl_array.minimum(a_gpu, b_gpu)
+
+    print max_a_b_gpu
+    print numpy.maximum(a, b)
+
+    assert la.norm(max_a_b_gpu.get()- numpy.maximum(a, b)) == 0
+    assert la.norm(min_a_b_gpu.get()- numpy.minimum(a, b)) == 0
+
+@pytools.test.mark_test.opencl
+def test_take_put(ctx_getter):
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    for n in [5, 17, 333]:
+        one_field_size = 8
+        buf_gpu = cl_array.zeros(context, queue,
+                n*one_field_size, dtype=numpy.float32)
+        dest_indices = cl_array.to_device(context, queue,
+                numpy.array([ 0,  1,  2,  3, 32, 33, 34, 35], dtype=numpy.uint32))
+        read_map = cl_array.to_device(context, queue,
+                numpy.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=numpy.uint32))
+
+        cl_array.multi_take_put(
+                arrays=[buf_gpu for i in range(n)],
+                dest_indices=dest_indices,
+                src_indices=read_map,
+                src_offsets=[i*one_field_size for i in range(n)],
+                dest_shape=(96,))
+
+@pytools.test.mark_test.opencl
+def test_astype(ctx_getter):
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.clrandom import rand as clrand
+
+    if not has_double_support(context.devices[0]):
+        return
+
+    a_gpu = clrand(context, queue, (2000,), dtype=numpy.float32)
+
+    a = a_gpu.get().astype(numpy.float64)
+    a2 = a_gpu.astype(numpy.float64).get()
+
+    assert a2.dtype == numpy.float64
+    assert la.norm(a - a2) == 0, (a, a2)
+
+    a_gpu = clrand(context, queue, (2000,), dtype=numpy.float64)
+
+    a = a_gpu.get().astype(numpy.float32)
+    a2 = a_gpu.astype(numpy.float32).get()
+
+    assert a2.dtype == numpy.float32
+    assert la.norm(a - a2)/la.norm(a) < 1e-7
+
+
+
+
+def pytest_generate_tests(metafunc):
+    if have_cl():
+        import pyopencl as cl
+    else:
+        # will still show "skipped" messages
+        return
+
+    if ("device" in metafunc.funcargnames
+            or "ctx_getter" in metafunc.funcargnames):
+        arg_dict = {}
+
+        for platform in cl.get_platforms():
+            if "platform" in metafunc.funcargnames:
+                arg_dict["platform"] = platform
+
+            for device in platform.get_devices():
+                if "device" in metafunc.funcargnames:
+                    arg_dict["device"] = device
+
+                if "ctx_getter" in metafunc.funcargnames:
+                    arg_dict["ctx_getter"] = lambda: cl.Context([device])
+
+                metafunc.addcall(funcargs=arg_dict.copy(),
+                        id=", ".join("%s=%s" % (arg, value) 
+                                for arg, value in arg_dict.iteritems()))
+
+    elif "platform" in metafunc.funcargnames:
+        for platform in cl.get_platforms():
+            metafunc.addcall(
+                    funcargs=dict(platform=platform),
+                    id=str(platform))
+
+
+
+
+if __name__ == "__main__":
+    # make sure that import failures get reported, instead of skipping the tests.
+    import pyopencl as cl
+
+    import sys
+    if len(sys.argv) > 1:
+        exec sys.argv[1]
+    else:
+        from py.test.cmdline import main
+        main([__file__])
-- 
GitLab