From 0ac073fedcf4d33a2d8afaf32005d4240438008a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Sun, 27 Jun 2010 00:30:46 -0400 Subject: [PATCH] Port array functionality from PyCUDA. --- examples/demo_elementwise.py | 26 ++ pyopencl/array.py | 875 +++++++++++++++++++++++++++++++++++ pyopencl/clrandom.py | 259 +++++++++++ pyopencl/elementwise.py | 466 +++++++++++++++++++ pyopencl/tools.py | 173 +++++++ setup.py | 3 +- test/test_array.py | 593 ++++++++++++++++++++++++ 7 files changed, 2394 insertions(+), 1 deletion(-) create mode 100644 examples/demo_elementwise.py create mode 100644 pyopencl/array.py create mode 100644 pyopencl/clrandom.py create mode 100644 pyopencl/elementwise.py create mode 100644 pyopencl/tools.py create mode 100644 test/test_array.py diff --git a/examples/demo_elementwise.py b/examples/demo_elementwise.py new file mode 100644 index 00000000..cf89121a --- /dev/null +++ b/examples/demo_elementwise.py @@ -0,0 +1,26 @@ +import pyopencl as cl +import pyopencl.array as cl_array +import numpy + +ctx = cl.create_some_context() +queue = cl.CommandQueue(ctx) + +n = 10 +a_gpu = cl_array.to_device( + ctx, queue, numpy.random.randn(n).astype(numpy.float32)) +b_gpu = cl_array.to_device( + ctx, queue, numpy.random.randn(n).astype(numpy.float32)) + +from pyopencl.elementwise import ElementwiseKernel +lin_comb = ElementwiseKernel(ctx, + "float a, float *x, " + "float b, float *y, " + "float *z", + "z[i] = a*x[i] + b*y[i]", + "linear_combination") + +c_gpu = cl_array.empty_like(a_gpu) +lin_comb(5, a_gpu, 6, b_gpu, c_gpu) + +import numpy.linalg as la +assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5 diff --git a/pyopencl/array.py b/pyopencl/array.py new file mode 100644 index 00000000..474a6bc7 --- /dev/null +++ b/pyopencl/array.py @@ -0,0 +1,875 @@ +"""CL device arrays.""" + +from __future__ import division + +__copyright__ = "Copyright (C) 2009 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. 
+""" + + + + +import numpy +import pyopencl.elementwise as elementwise +import pyopencl as cl + + + +def splay(ctx, n): + min_work_items = 32 + max_work_items = 128 + max_groups = max( + 4 * dev.max_compute_units * 8 + # 4 to overfill the device + # 8 is an Nvidia constant--that's how many + # groups fit onto one compute device + for dev in ctx.devices) + + if n < min_work_items: + group_count = 1 + work_items_per_group = min_work_items + elif n < (max_groups * min_work_items): + group_count = (n + min_work_items - 1) // min_work_items + work_items_per_group = min_work_items + elif n < (max_groups * max_work_items): + group_count = max_groups + grp = (n + min_work_items - 1) // min_work_items + work_items_per_group = ((grp + max_groups -1) // max_groups) * min_work_items + else: + group_count = max_groups + work_items_per_group = max_work_items + + #print "n:%d gc:%d wipg:%d" % (n, group_count, work_items_per_group) + return (group_count*work_items_per_group, 1, 1), \ + (work_items_per_group, 1, 1) + + + + +def _get_common_dtype(obj1, obj2): + return (obj1.dtype.type(0) + obj2.dtype.type(0)).dtype + + + + +class DefaultAllocator: + def __init__(self, context, flags=cl.mem_flags.READ_WRITE): + self.context = context + self.flags = flags + + def __call__(self, size): + return cl.Buffer(self.context, self.flags, size) + + + + +class Array(object): + """A :mod:`pyopencl` Array is used to do array-based calculation on + a compute device. + + This is mostly supposed to be a :mod:`numpy`-workalike. Operators + work on an element-by-element basis, just like :class:`numpy.ndarray`. + """ + + def __init__(self, context, shape, dtype, allocator=None, + base=None, data=None, queue=None): + if allocator is None: + allocator = DefaultAllocator(context) + + try: + s = 1 + for dim in shape: + s *= dim + except TypeError: + if not isinstance(shape, int): + raise TypeError("shape must either be iterable or " + "castable to an integer") + s = shape + shape = (shape,) + + self.context = context + self.queue = queue + + self.shape = shape + self.dtype = numpy.dtype(dtype) + + self.mem_size = self.size = s + self.nbytes = self.dtype.itemsize * self.size + + self.allocator = allocator + if data is None: + if self.size: + self.data = self.allocator(self.size * self.dtype.itemsize) + else: + self.data = None + + if base is not None: + raise ValueError("If data is specified, base must be None.") + else: + self.data = data + + self.base = base + + self._global_size, self._local_size = splay(context, self.mem_size) + + def set(self, ary, queue=None, async=False): + assert ary.size == self.size + assert ary.dtype == self.dtype + if self.size: + evt = cl.enqueue_write_buffer(queue or self.queue, self.data, ary) + + if not async: + evt.wait() + + def get(self, queue=None, ary=None, async=False): + if ary is None: + ary = numpy.empty(self.shape, self.dtype) + else: + if ary.size != self.size: + raise TypeError("'ary' has non-matching type") + if ary.dtype != self.dtype: + raise TypeError("'ary' has non-matching size") + + if self.size: + evt = cl.enqueue_read_buffer(queue or self.queue, self.data, ary) + + if not async: + evt.wait() + + return ary + + def __str__(self): + return str(self.get()) + + def __repr__(self): + return repr(self.get()) + + def __hash__(self): + raise TypeError("pyopencl arrays are not hashable.") + + # kernel invocation wrappers ---------------------------------------------- + def _axpbyz(self, selffac, other, otherfac, out, queue=None): + """Compute ``out = selffac * self + otherfac*other``, + 
where `other` is a vector..""" + assert self.shape == other.shape + + knl = elementwise.get_axpbyz_kernel( + self.context, self.dtype, other.dtype, out.dtype) + + knl(queue or self.queue, self._global_size, self._local_size, + selffac, self.data, otherfac, other.data, + out.data, self.mem_size) + + return out + + def _axpbz(self, selffac, other, out, queue=None): + """Compute ``out = selffac * self + other``, where `other` is a scalar.""" + knl = elementwise.get_axpbz_kernel(self.context, self.dtype) + knl(queue or self.queue, self._global_size, self._local_size, + selffac, self.data, other, out.data, self.mem_size) + + return out + + def _elwise_multiply(self, other, out, queue=None): + knl = elementwise.get_multiply_kernel( + self.context, self.dtype, other.dtype, out.dtype) + knl(queue or self.queue, self._global_size, self._local_size, + self.data, other.data, + out.data, self.mem_size) + + return out + + def _rdiv_scalar(self, other, out, queue=None): + """Divides an array by a scalar:: + + y = n / self + """ + + assert self.dtype == numpy.float32 + + knl = elementwise.get_rdivide_elwise_kernel(self.context, self.dtype) + knl(queue or self.queue, self._global_size, self._local_size, + self.data, other, + out.data, self.mem_size) + + return out + + def _div(self, other, out, queue=None): + """Divides an array by another array.""" + + assert self.shape == other.shape + + knl = elementwise.get_divide_kernel(self.context, + self.dtype, other.dtype, out.dtype) + knl(queue or self.queue, self._global_size, self._local_size, + self.data, other.data, out.data, self.mem_size) + + return out + + def _new_like_me(self, dtype=None): + if dtype is None: + dtype = self.dtype + return self.__class__(self.context, + self.shape, dtype, allocator=self.allocator, + queue=self.queue) + + # operators --------------------------------------------------------------- + def mul_add(self, selffac, other, otherfac, add_timer=None, queue=None): + """Return `selffac * self + otherfac*other`. 
+ """ + result = self._new_like_me(_get_common_dtype(self, other)) + return self._axpbyz(selffac, other, otherfac, result, add_timer) + + def __add__(self, other): + """Add an array with an array or an array with a scalar.""" + + if isinstance(other, Array): + # add another vector + result = self._new_like_me(_get_common_dtype(self, other)) + return self._axpbyz(1, other, 1, result) + else: + # add a scalar + if other == 0: + return self + else: + result = self._new_like_me() + return self._axpbz(1, other, result) + + __radd__ = __add__ + + def __sub__(self, other): + """Substract an array from an array or a scalar from an array.""" + + if isinstance(other, Array): + result = self._new_like_me(_get_common_dtype(self, other)) + return self._axpbyz(1, other, -1, result) + else: + if other == 0: + return self + else: + # create a new array for the result + result = self._new_like_me() + return self._axpbz(1, -other, result) + + def __rsub__(self,other): + """Substracts an array by a scalar or an array:: + + x = n - self + """ + # other must be a scalar + result = self._new_like_me() + return self._axpbz(-1, other, result) + + def __iadd__(self, other): + return self._axpbyz(1, other, 1, self) + + def __isub__(self, other): + return self._axpbyz(1, other, -1, self) + + def __neg__(self): + result = self._new_like_me() + return self._axpbz(-1, 0, result) + + def __mul__(self, other): + if isinstance(other, Array): + result = self._new_like_me(_get_common_dtype(self, other)) + return self._elwise_multiply(other, result) + else: + result = self._new_like_me() + return self._axpbz(other, 0, result) + + def __rmul__(self, scalar): + result = self._new_like_me() + return self._axpbz(scalar, 0, result) + + def __imul__(self, scalar): + return self._axpbz(scalar, 0, self) + + def __div__(self, other): + """Divides an array by an array or a scalar:: + + x = self / n + """ + if isinstance(other, Array): + result = self._new_like_me(_get_common_dtype(self, other)) + return self._div(other, result) + else: + if other == 1: + return self + else: + # create a new array for the result + result = self._new_like_me() + return self._axpbz(1/other, 0, result) + + __truediv__ = __div__ + + def __rdiv__(self,other): + """Divides an array by a scalar or an array:: + + x = n / self + """ + + if isinstance(other, Array): + result = self._new_like_me(_get_common_dtype(self, other)) + + knl = elementwise.get_divide_kernel(self.context) + knl(self.queue, self._global_size, self._local_size, + other.data, self.data, result.data, + self.mem_size) + + return result + else: + if other == 1: + return self + else: + # create a new array for the result + result = self._new_like_me() + return self._rdiv_scalar(other, result) + + __rtruediv__ = __div__ + + def fill(self, value, queue=None): + """fills the array with the specified value""" + knl = elementwise.get_fill_kernel(self.context, self.dtype) + knl(queue or self.queue, self._global_size, self._local_size, + value, self.data, self.mem_size) + + return self + + def __len__(self): + """Return the size of the leading dimension of self.""" + if len(self.shape): + return self.shape[0] + else: + return 1 + + def __abs__(self): + """Return a `Array` of the absolute values of the elements + of `self`. 
+ """ + + result = self._new_like_me() + + if self.dtype.kind == "f": + fname = "fabs" + elif self.dtype.kind in ["u", "i"]: + fname = "abs" + else: + raise TypeError("unsupported dtype in __abs__()") + + knl = elementwise.get_unary_func_kernel( + self.context, fname, self.dtype) + knl(self.queue, self._global_size, self._local_size, + self.data,result.data, self.mem_size) + + return result + + def __pow__(self, other): + """Exponentiation by a scalar or elementwise by another + :class:`Array`. + """ + + if isinstance(other, Array): + assert self.shape == other.shape + + result = self._new_like_me(_get_common_dtype(self, other)) + + knl = elementwise.get_pow_array_kernel(self.context, + self.dtype, other.dtype, result.dtype) + + knl(self.queue, self._global_size, self._local_size, + self.data, other.data, result.data, + self.mem_size) + + return result + else: + result = self._new_like_me() + knl = elementwise.get_pow_kernel(self.context, self.dtype) + knl(self.queue, self._global_size, self._local_size, + other, self.data, result.data, + self.mem_size) + + return result + + def reverse(self, queue=None): + """Return this array in reversed order. The array is treated + as one-dimensional. + """ + + result = self._new_like_me() + + knl = elementwise.get_reverse_kernel(self.context, self.dtype) + knl(queue or self.queue, self._global_size, self._local_size, + self.data, result.data, self.mem_size) + + return result + + def astype(self, dtype, queue=None): + if dtype == self.dtype: + return self + + result = self._new_like_me(dtype=dtype) + + knl = elementwise.get_copy_kernel(self.context, dtype, self.dtype) + knl(queue or self.queue, self._global_size, self._local_size, + result.data, self.data, self.mem_size) + + return result + + + # slicing ----------------------------------------------------------------- + def __getitem__(self, idx): + if idx == (): + return self + + if len(self.shape) > 1: + raise NotImplementedError("multi-d slicing is not yet implemented") + + if not isinstance(idx, slice): + raise ValueError("non-slice indexing not supported: %s" % (idx,)) + + l, = self.shape + start, stop, stride = idx.indices(l) + + if stride != 1: + raise NotImplementedError("strided slicing is not yet implemented") + + return Array( + shape=((stop-start)//stride,), + dtype=self.dtype, + allocator=self.allocator, + base=self, + data=int(self.data) + start*self.dtype.itemsize) + + # rich comparisons (or rather, lack thereof) ------------------------------ + def __eq__(self, other): + raise NotImplementedError + + def __ne__(self, other): + raise NotImplementedError + + def __le__(self, other): + raise NotImplementedError + + def __ge__(self, other): + raise NotImplementedError + + def __lt__(self, other): + raise NotImplementedError + + def __gt__(self, other): + raise NotImplementedError + + + +def to_device(context, queue, ary, allocator=None, async=False): + """Converts a numpy array to a :class:`Array`.""" + result = Array(context, ary.shape, ary.dtype, allocator, + queue=queue) + result.set(ary, async=async) + return result + + +empty = Array + +def zeros(context, queue, shape, dtype, allocator=None): + """Returns an array of the given shape and dtype filled with 0's.""" + + result = Array(context, shape, dtype, allocator, queue=queue) + result.fill(0) + return result + +def empty_like(ary): + result = Array(ary.context, + ary.shape, ary.dtype, ary.allocator, queue=ary.queue) + return result + +def zeros_like(ary): + result = Array(ary.context, + ary.shape, ary.dtype, ary.allocator, 
queue=ary.queue) + result.fill(0) + return result + + +def arange(context, queue, *args, **kwargs): + """Create an array filled with numbers spaced `step` apart, + starting from `start` and ending at `stop`. + + For floating point arguments, the length of the result is + `ceil((stop - start)/step)`. This rule may result in the last + element of the result being greater than stop. + """ + + # argument processing ----------------------------------------------------- + + # Yuck. Thanks, numpy developers. ;) + from pytools import Record + class Info(Record): + pass + + explicit_dtype = False + + inf = Info() + inf.start = None + inf.stop = None + inf.step = None + inf.dtype = None + + if isinstance(args[-1], numpy.dtype): + dtype = args[-1] + args = args[:-1] + explicit_dtype = True + + argc = len(args) + if argc == 0: + raise ValueError, "stop argument required" + elif argc == 1: + inf.stop = args[0] + elif argc == 2: + inf.start = args[0] + inf.stop = args[1] + elif argc == 3: + inf.start = args[0] + inf.stop = args[1] + inf.step = args[2] + else: + raise ValueError, "too many arguments" + + admissible_names = ["start", "stop", "step", "dtype"] + for k, v in kwargs.iteritems(): + if k in admissible_names: + if getattr(inf, k) is None: + setattr(inf, k, v) + if k == "dtype": + explicit_dtype = True + else: + raise ValueError, "may not specify '%s' by position and keyword" % k + else: + raise ValueError, "unexpected keyword argument '%s'" % k + + if inf.start is None: + inf.start = 0 + if inf.step is None: + inf.step = 1 + if inf.dtype is None: + inf.dtype = numpy.array([inf.start, inf.stop, inf.step]).dtype + + # actual functionality ---------------------------------------------------- + dtype = numpy.dtype(inf.dtype) + start = dtype.type(inf.start) + step = dtype.type(inf.step) + stop = dtype.type(inf.stop) + + if not explicit_dtype: + raise TypeError("arange requires dtype argument") + + from math import ceil + size = int(ceil((stop-start)/step)) + + result = Array(context, (size,), dtype, queue=queue) + + knl = elementwise.get_arange_kernel(context, dtype) + knl(queue, result._global_size, result._local_size, + result.data, start, step, size) + + return result + + + + +def take(a, indices, out=None, queue=None): + queue = queue or a.queue + + if out is None: + out = Array(a.context, indices.shape, a.dtype, a.allocator, + queue=queue) + + assert len(indices.shape) == 1 + + knl = elementwise.get_take_kernel( + a.context, a.dtype, indices.dtype) + knl(queue, a._global_size, a._local_size, + indices.data, a.data, out.data, indices.size) + + return out + + + + +def multi_take(arrays, indices, out=None, queue=None): + if not len(arrays): + return [] + + assert len(indices.shape) == 1 + + from pytools import single_valued + a_dtype = single_valued(a.dtype for a in arrays) + a_allocator = arrays[0].dtype + context = indices.context + queue = queue or indices.queue + + vec_count = len(arrays) + + if out is None: + out = [Array(context, queue, indices.shape, a_dtype, a_allocator) + for i in range(vec_count)] + else: + if len(out) != len(arrays): + raise ValueError("out and arrays must have the same length") + + chunk_size = _builtin_min(vec_count, 10) + + def make_func_for_chunk_size(chunk_size): + knl = elementwise.get_take_kernel( + indices.context, a_dtype, indices.dtype, + vec_count=chunk_size) + knl.set_block_shape(*indices._block) + return knl + + knl = make_func_for_chunk_size(chunk_size) + + for start_i in range(0, len(arrays), chunk_size): + chunk_slice = slice(start_i, start_i+chunk_size) 
+ + if start_i + chunk_size > vec_count: + knl = make_func_for_chunk_size(vec_count-start_i) + + knl(queue, indices._global_size, indices._local_size, + indices.data, + *([i.data for i in arrays[chunk_slice]] + + [o.data for o in out[chunk_slice]] + + [indices.size])) + + return out + + + + +def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, + out=None, queue=None, src_offsets=None): + if not len(arrays): + return [] + + from pytools import single_valued + a_dtype = single_valued(a.dtype for a in arrays) + a_allocator = arrays[0].allocator + context = src_indices.context + queue = queue or src_indices.queue + + vec_count = len(arrays) + + if out is None: + out = [Array(context, dest_shape, a_dtype, a_allocator, queue=queue) + for i in range(vec_count)] + else: + if a_dtype != single_valued(o.dtype for o in out): + raise TypeError("arrays and out must have the same dtype") + if len(out) != vec_count: + raise ValueError("out and arrays must have the same length") + + if src_indices.dtype != dest_indices.dtype: + raise TypeError("src_indices and dest_indices must have the same dtype") + + if len(src_indices.shape) != 1: + raise ValueError("src_indices must be 1D") + + if src_indices.shape != dest_indices.shape: + raise ValueError("src_indices and dest_indices must have the same shape") + + if src_offsets is None: + src_offsets_list = [] + else: + src_offsets_list = src_offsets + if len(src_offsets) != vec_count: + raise ValueError("src_indices and src_offsets must have the same length") + + max_chunk_size = 10 + + chunk_size = _builtin_min(vec_count, max_chunk_size) + + def make_func_for_chunk_size(chunk_size): + return elementwise.get_take_put_kernel(context, + a_dtype, src_indices.dtype, + with_offsets=src_offsets is not None, + vec_count=chunk_size) + + knl = make_func_for_chunk_size(chunk_size) + + for start_i in range(0, len(arrays), chunk_size): + chunk_slice = slice(start_i, start_i+chunk_size) + + if start_i + chunk_size > vec_count: + knl = make_func_for_chunk_size(vec_count-start_i) + + knl(queue, src_indices._global_size, src_indices._local_size, + dest_indices.data, src_indices.data, + *([i.data for i in arrays[chunk_slice]] + + [o.data for o in out[chunk_slice]] + + src_offsets_list[chunk_slice] + + [src_indices.size])) + + return out + + + + +def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None): + if not len(arrays): + return [] + + from pytools import single_valued + a_dtype = single_valued(a.dtype for a in arrays) + a_allocator = arrays[0].allocator + context = dest_indices.context + queue = queue or dest_indices.queue + + vec_count = len(arrays) + + if out is None: + out = [Array(context, dest_shape, a_dtype, a_allocator, queue=queue) + for i in range(vec_count)] + else: + if a_dtype != single_valued(o.dtype for o in out): + raise TypeError("arrays and out must have the same dtype") + if len(out) != vec_count: + raise ValueError("out and arrays must have the same length") + + if len(dest_indices.shape) != 1: + raise ValueError("src_indices must be 1D") + + chunk_size = _builtin_min(vec_count, 10) + + def make_func_for_chunk_size(chunk_size): + knl = elementwise.get_put_kernel( + a_dtype, dest_indices.dtype, vec_count=chunk_size) + knl.set_block_shape(*dest_indices._block) + return knl + + knl = make_func_for_chunk_size(chunk_size) + + for start_i in range(0, len(arrays), chunk_size): + chunk_slice = slice(start_i, start_i+chunk_size) + + if start_i + chunk_size > vec_count: + knl = make_func_for_chunk_size(vec_count-start_i) + + 
knl(queue, dest_indices._global_size, dest_indices._local_size, + dest_indices.data, + *([o.data for o in out[chunk_slice]] + + [i.data for i in arrays[chunk_slice]] + + [dest_indices.size])) + + return out + + + + +def if_positive(criterion, then_, else_, out=None, queue=None): + if not (criterion.shape == then_.shape == else_.shape): + raise ValueError("shapes do not match") + + if not (then_.dtype == else_.dtype): + raise ValueError("dtypes do not match") + + knl = elementwise.get_if_positive_kernel( + criterion.context, + criterion.dtype, then_.dtype) + + if out is None: + out = empty_like(then_) + + knl(queue or criterion.queue, criterion._global_size, criterion._local_size, + criterion.data, then_.data, else_.data, out.data, + criterion.size) + + return out + + + + +def maximum(a, b, out=None, queue=None): + # silly, but functional + return if_positive(a.mul_add(1, b, -1, queue=queue), a, b, + queue=queue, out=out) + + + + +def minimum(a, b, out=None, queue=None): + # silly, but functional + return if_positive(a.mul_add(1, b, -1, queue=queue), b, a, + queue=queue, out=out) + + + + +# reductions ------------------------------------------------------------------ +_builtin_min = min +_builtin_max = max + +if False: + # not yet + def sum(a, dtype=None, queue=None): + from pyopencl.reduction import get_sum_kernel + krnl = get_sum_kernel(dtype, a.dtype) + return krnl(a, queue=queue) + + def dot(a, b, dtype=None, queue=None): + from pyopencl.reduction import get_dot_kernel + krnl = get_dot_kernel(dtype, a.dtype, b.dtype) + return krnl(a, b, queue=queue) + + def subset_dot(subset, a, b, dtype=None, queue=None): + from pyopencl.reduction import get_subset_dot_kernel + krnl = get_subset_dot_kernel(dtype, a.dtype, b.dtype) + return krnl(subset, a, b, queue=queue) + + def _make_minmax_kernel(what): + def f(a, queue=None): + from pyopencl.reduction import get_minmax_kernel + krnl = get_minmax_kernel(what, a.dtype) + return krnl(a, queue=queue) + + return f + + min = _make_minmax_kernel("min") + max = _make_minmax_kernel("max") + + def _make_subset_minmax_kernel(what): + def f(subset, a, queue=None): + from pyopencl.reduction import get_subset_minmax_kernel + import pyopencl.reduction + krnl = get_subset_minmax_kernel(what, a.dtype) + return krnl(subset, a, queue=queue) + + return f + + subset_min = _make_subset_minmax_kernel("min") + subset_max = _make_subset_minmax_kernel("max") + + + + + +# vim: foldmethod=marker diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py new file mode 100644 index 00000000..0cc344d9 --- /dev/null +++ b/pyopencl/clrandom.py @@ -0,0 +1,259 @@ +import pyopencl as cl + +md5_code = """ +/* + ********************************************************************** + ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. ** + ** ** + ** License to copy and use this software is granted provided that ** + ** it is identified as the "RSA Data Security, Inc. MD5 Message ** + ** Digest Algorithm" in all material mentioning or referencing this ** + ** software or this function. ** + ** ** + ** License is also granted to make and use derivative works ** + ** provided that such works are identified as "derived from the RSA ** + ** Data Security, Inc. MD5 Message Digest Algorithm" in all ** + ** material mentioning or referencing the derived work. ** + ** ** + ** RSA Data Security, Inc. makes no representations concerning ** + ** either the merchantability of this software or the suitability ** + ** of this software for any particular purpose. 
It is provided "as ** + ** is" without express or implied warranty of any kind. ** + ** ** + ** These notices must be retained in any copies of any part of this ** + ** documentation and/or software. ** + ********************************************************************** + */ + +/* F, G and H are basic MD5 functions: selection, majority, parity */ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +/* ROTATE_LEFT rotates x left n bits */ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */ +/* Rotation is separate from addition to prevent recomputation */ +#define FF(a, b, c, d, x, s, ac) \ + {(a) += F ((b), (c), (d)) + (x) + (ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) \ + {(a) += G ((b), (c), (d)) + (x) + (ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) \ + {(a) += H ((b), (c), (d)) + (x) + (ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) \ + {(a) += I ((b), (c), (d)) + (x) + (ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } + +#define X0 get_local_id(0) +#define X1 get_local_id(1) +#define X2 get_local_id(2) +#define X3 get_group_id(0) +#define X4 get_group_id(1) +#define X5 get_group_id(2) +#define X6 seed +#define X7 i +#define X8 n +#define X9 get_local_size(0) +#define X10 get_local_size(1) +#define X11 get_local_size(2) +#define X12 get_global_size(0) +#define X13 get_global_size(1) +#define X14 get_global_size(2) +#define X15 0 + + unsigned int a = 0x67452301; + unsigned int b = 0xefcdab89; + unsigned int c = 0x98badcfe; + unsigned int d = 0x10325476; + + /* Round 1 */ +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 + FF ( a, b, c, d, X0 , S11, 3614090360); /* 1 */ + FF ( d, a, b, c, X1 , S12, 3905402710); /* 2 */ + FF ( c, d, a, b, X2 , S13, 606105819); /* 3 */ + FF ( b, c, d, a, X3 , S14, 3250441966); /* 4 */ + FF ( a, b, c, d, X4 , S11, 4118548399); /* 5 */ + FF ( d, a, b, c, X5 , S12, 1200080426); /* 6 */ + FF ( c, d, a, b, X6 , S13, 2821735955); /* 7 */ + FF ( b, c, d, a, X7 , S14, 4249261313); /* 8 */ + FF ( a, b, c, d, X8 , S11, 1770035416); /* 9 */ + FF ( d, a, b, c, X9 , S12, 2336552879); /* 10 */ + FF ( c, d, a, b, X10, S13, 4294925233); /* 11 */ + FF ( b, c, d, a, X11, S14, 2304563134); /* 12 */ + FF ( a, b, c, d, X12, S11, 1804603682); /* 13 */ + FF ( d, a, b, c, X13, S12, 4254626195); /* 14 */ + FF ( c, d, a, b, X14, S13, 2792965006); /* 15 */ + FF ( b, c, d, a, X15, S14, 1236535329); /* 16 */ + + /* Round 2 */ +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 + GG ( a, b, c, d, X1 , S21, 4129170786); /* 17 */ + GG ( d, a, b, c, X6 , S22, 3225465664); /* 18 */ + GG ( c, d, a, b, X11, S23, 643717713); /* 19 */ + GG ( b, c, d, a, X0 , S24, 3921069994); /* 20 */ + GG ( a, b, c, d, X5 , S21, 3593408605); /* 21 */ + GG ( d, a, b, c, X10, S22, 38016083); /* 22 */ + GG ( c, d, a, b, X15, S23, 3634488961); /* 23 */ + GG ( b, c, d, a, X4 , S24, 3889429448); /* 24 */ + GG ( a, b, c, d, X9 , S21, 568446438); /* 25 */ + GG ( d, a, b, c, X14, S22, 3275163606); /* 26 */ + GG ( c, d, a, b, X3 , S23, 4107603335); /* 27 */ + GG ( b, c, d, a, X8 , S24, 1163531501); /* 28 */ + GG ( a, b, c, d, X13, S21, 2850285829); /* 29 */ + GG ( d, a, b, c, X2 , S22, 4243563512); /* 30 */ + GG ( c, d, a, b, X7 , 
S23, 1735328473); /* 31 */ + GG ( b, c, d, a, X12, S24, 2368359562); /* 32 */ + + /* Round 3 */ +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 + HH ( a, b, c, d, X5 , S31, 4294588738); /* 33 */ + HH ( d, a, b, c, X8 , S32, 2272392833); /* 34 */ + HH ( c, d, a, b, X11, S33, 1839030562); /* 35 */ + HH ( b, c, d, a, X14, S34, 4259657740); /* 36 */ + HH ( a, b, c, d, X1 , S31, 2763975236); /* 37 */ + HH ( d, a, b, c, X4 , S32, 1272893353); /* 38 */ + HH ( c, d, a, b, X7 , S33, 4139469664); /* 39 */ + HH ( b, c, d, a, X10, S34, 3200236656); /* 40 */ + HH ( a, b, c, d, X13, S31, 681279174); /* 41 */ + HH ( d, a, b, c, X0 , S32, 3936430074); /* 42 */ + HH ( c, d, a, b, X3 , S33, 3572445317); /* 43 */ + HH ( b, c, d, a, X6 , S34, 76029189); /* 44 */ + HH ( a, b, c, d, X9 , S31, 3654602809); /* 45 */ + HH ( d, a, b, c, X12, S32, 3873151461); /* 46 */ + HH ( c, d, a, b, X15, S33, 530742520); /* 47 */ + HH ( b, c, d, a, X2 , S34, 3299628645); /* 48 */ + + /* Round 4 */ +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + II ( a, b, c, d, X0 , S41, 4096336452); /* 49 */ + II ( d, a, b, c, X7 , S42, 1126891415); /* 50 */ + II ( c, d, a, b, X14, S43, 2878612391); /* 51 */ + II ( b, c, d, a, X5 , S44, 4237533241); /* 52 */ + II ( a, b, c, d, X12, S41, 1700485571); /* 53 */ + II ( d, a, b, c, X3 , S42, 2399980690); /* 54 */ + II ( c, d, a, b, X10, S43, 4293915773); /* 55 */ + II ( b, c, d, a, X1 , S44, 2240044497); /* 56 */ + II ( a, b, c, d, X8 , S41, 1873313359); /* 57 */ + II ( d, a, b, c, X15, S42, 4264355552); /* 58 */ + II ( c, d, a, b, X6 , S43, 2734768916); /* 59 */ + II ( b, c, d, a, X13, S44, 1309151649); /* 60 */ + II ( a, b, c, d, X4 , S41, 4149444226); /* 61 */ + II ( d, a, b, c, X11, S42, 3174756917); /* 62 */ + II ( c, d, a, b, X2 , S43, 718787259); /* 63 */ + II ( b, c, d, a, X9 , S44, 3951481745); /* 64 */ + + a += 0x67452301; + b += 0xefcdab89; + c += 0x98badcfe; + d += 0x10325476; +""" + +import numpy + +def rand(context, queue, shape, dtype): + from pyopencl.array import Array + from pyopencl.elementwise import get_elwise_kernel + + result = Array(context, shape, dtype, queue=queue) + + if dtype == numpy.float32: + func = get_elwise_kernel(context, + "float *dest, unsigned int seed", + md5_code + """ + #define POW_2_M32 (1/4294967296.0f) + dest[i] = a*POW_2_M32; + if ((i += gsize) < n) + dest[i] = b*POW_2_M32; + if ((i += gsize) < n) + dest[i] = c*POW_2_M32; + if ((i += gsize) < n) + dest[i] = d*POW_2_M32; + """, + "md5_rng_float") + elif dtype == numpy.float64: + func = get_elwise_kernel(context, + "double *dest, unsigned int seed", + md5_code + """ + #define POW_2_M32 (1/4294967296.0) + #define POW_2_M64 (1/18446744073709551616.) 
+ + dest[i] = a*POW_2_M32 + b*POW_2_M64; + + if ((i += gsize) < n) + { + dest[i] = c*POW_2_M32 + d*POW_2_M64; + } + """, + "md5_rng_float") + elif dtype in [numpy.int32, numpy.uint32]: + func = get_elwise_kernel(context, + "unsigned int *dest, unsigned int seed", + md5_code + """ + dest[i] = a; + if ((i += gsize) < n) + dest[i] = b; + if ((i += gsize) < n) + dest[i] = c; + if ((i += gsize) < n) + dest[i] = d; + """, + "md5_rng_int") + else: + raise NotImplementedError + + func(queue, result._global_size, result._local_size, + result.data, numpy.random.randint(2**31-1), + result.size) + + return result + + +if __name__ == "__main__": + import sys + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + if "generate" in sys.argv[1:]: + N = 256 + print N, "MB" + r = rand(ctx, queue, (N*2**18,), numpy.uint32) + print "generated" + r.get().tofile("random.dat") + print "written" + + else: + from pylab import plot, show, subplot + N = 250 + r1 = rand(ctx, queue, (N,), numpy.uint32) + r2 = rand(ctx, queue, (N,), numpy.int32) + r3 = rand(ctx, queue, (N,), numpy.float32) + + subplot(131); plot( r1.get(),"x-") + subplot(132); plot( r2.get(),"x-") + subplot(133); plot( r3.get(),"x-") + show() diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py new file mode 100644 index 00000000..1a3f25ec --- /dev/null +++ b/pyopencl/elementwise.py @@ -0,0 +1,466 @@ +"""Elementwise functionality.""" + +from __future__ import division + +__copyright__ = "Copyright (C) 2009 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. 
+""" + + + + +from pyopencl.tools import context_dependent_memoize +import numpy +import pyopencl as cl +from pyopencl.tools import dtype_to_ctype, VectorArg, ScalarArg + + + + +def get_elwise_program(context, arguments, operation, + name="elwise_kernel", keep=False, options=[], + preamble="", loop_prep="", after_loop=""): + from pyopencl import Program + return Program(context, """ + %(preamble)s + + __kernel void %(name)s(%(arguments)s) + { + unsigned lid = get_local_id(0); + unsigned gsize = get_global_size(0); + unsigned work_item_start = get_local_size(0)*get_group_id(0); + unsigned i; + + %(loop_prep)s; + + for (i = work_item_start + lid; i < n; i += gsize) + { + %(operation)s; + } + + %(after_loop)s; + } + """ % { + "arguments": ", ".join(arg.declarator() for arg in arguments), + "operation": operation, + "name": name, + "preamble": preamble, + "loop_prep": loop_prep, + "after_loop": after_loop, + }).build(options=" ".join(options)) + + + + +def get_elwise_kernel_and_types(context, arguments, operation, + name="elwise_kernel", keep=False, options=[], preamble="", **kwargs): + if isinstance(arguments, str): + from pyopencl.tools import parse_c_arg + parsed_args = [parse_c_arg(arg) for arg in arguments.split(",")] + else: + parsed_args = arguments + + for arg in parsed_args: + if numpy.float64 == arg.dtype: + preamble += "\n\n#pragma OPENCL EXTENSION cl_khr_fp64: enable\n" + break + + parsed_args.append(ScalarArg(numpy.uintp, "n")) + + prg = get_elwise_program(context, parsed_args, operation, name, + keep, options, preamble, **kwargs) + + scalar_arg_dtypes = [] + for arg in parsed_args: + if isinstance(arg, ScalarArg): + scalar_arg_dtypes.append(arg.dtype) + else: + scalar_arg_dtypes.append(None) + + kernel = getattr(prg, name) + kernel.set_scalar_arg_dtypes(scalar_arg_dtypes) + + return kernel, parsed_args + + + + +def get_elwise_kernel(context, arguments, operation, + name="elwise_kernel", options=[], **kwargs): + """Return a L{pyopencl.Kernel} that performs the same scalar operation + on one or several vectors. 
+ """ + func, arguments = get_elwise_kernel_and_types(context, + arguments, operation, name, options, **kwargs) + + return func + + + + +class ElementwiseKernel: + def __init__(self, context, arguments, operation, + name="elwise_kernel", options=[], **kwargs): + self.kernel, self.arguments = get_elwise_kernel_and_types(context, + arguments, operation, name, options, **kwargs) + + if not [i for i, arg in enumerate(self.arguments) + if isinstance(arg, VectorArg)]: + raise RuntimeError( + "ElementwiseKernel can only be used with " + "functions that have at least one " + "vector argument") + + def __call__(self, *args, **kwargs): + vectors = [] + + invocation_args = [] + for arg, arg_descr in zip(args, self.arguments): + if isinstance(arg_descr, VectorArg): + vectors.append(arg) + invocation_args.append(arg.data) + else: + invocation_args.append(arg) + + queue = kwargs.pop("queue", None) + if kwargs: + raise TypeError("too many/unknown keyword arguments") + + repr_vec = vectors[0] + if queue is None: + queue = repr_vec.queue + invocation_args.append(repr_vec.mem_size) + + self.kernel.set_args(*invocation_args) + return cl.enqueue_nd_range_kernel(queue, self.kernel, + repr_vec._global_size, repr_vec._local_size) + + + + +@context_dependent_memoize +def get_take_kernel(context, dtype, idx_dtype, vec_count=1): + ctx = { + "idx_tp": dtype_to_ctype(idx_dtype), + "tp": dtype_to_ctype(dtype), + } + + args = ([VectorArg(idx_dtype, "idx")] + + [VectorArg(dtype, "src"+str(i))for i in range(vec_count)] + + [VectorArg(dtype, "dest"+str(i))for i in range(vec_count)]) + body = ( + ("%(idx_tp)s src_idx = idx[i];\n" % ctx) + + "\n".join( + "dest%d[i] = src%d[src_idx];" % (i, i) + for i in range(vec_count))) + + return get_elwise_kernel(context, args, body, "take") + + + + +@context_dependent_memoize +def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, vec_count=1): + ctx = { + "idx_tp": dtype_to_ctype(idx_dtype), + "tp": dtype_to_ctype(dtype), + } + + args = [ + VectorArg(idx_dtype, "gmem_dest_idx"), + VectorArg(idx_dtype, "gmem_src_idx"), + ] + [ + VectorArg(dtype, "src%d" % i) + for i in range(vec_count) + ] + [ + VectorArg(dtype, "dest%d" % i) + for i in range(vec_count) + ] + [ + ScalarArg(idx_dtype, "offset%d" % i) + for i in range(vec_count) if with_offsets + ] + + if with_offsets: + def get_copy_insn(i): + return ("dest%d[dest_idx] = " + "src%d[src_idx+offset%d];" + % (i, i, i)) + else: + def get_copy_insn(i): + return ("dest%d[dest_idx] = " + "src%d[src_idx];" % (i, i)) + + body = (("%(idx_tp)s src_idx = gmem_src_idx[i];\n" + "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx) + + "\n".join(get_copy_insn(i) for i in range(vec_count))) + + return get_elwise_kernel(context, args, body, "take_put") + + + + +@context_dependent_memoize +def get_put_kernel(context, dtype, idx_dtype, vec_count=1): + ctx = { + "idx_tp": dtype_to_ctype(idx_dtype), + "tp": dtype_to_ctype(dtype), + } + + args = [ + VectorArg(idx_dtype, "gmem_dest_idx"), + ] + [ + VectorArg(dtype, "dest%d" % i) + for i in range(vec_count) + ] + [ + VectorArg(dtype, "src%d" % i) + for i in range(vec_count) + ] + + body = ( + "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx + + "\n".join("dest%d[dest_idx] = src%d[i];" % (i, i) + for i in range(vec_count))) + + return get_elwise_kernel(args, body, "put") + + + + +@context_dependent_memoize +def get_copy_kernel(context, dtype_dest, dtype_src): + return get_elwise_kernel(context, + "%(tp_dest)s *dest, %(tp_src)s *src" % { + "tp_dest": dtype_to_ctype(dtype_dest), + "tp_src": 
dtype_to_ctype(dtype_src), + }, + "dest[i] = src[i]", + "copy") + + + +@context_dependent_memoize +def get_linear_combination_kernel(summand_descriptors, + dtype_z): + from pycuda.tools import dtype_to_ctype + from pycuda.elementwise import \ + VectorArg, ScalarArg, get_elwise_module + + args = [] + preamble = [ "#include <pycuda-helpers.hpp>\n\n" ] + loop_prep = [] + summands = [] + tex_names = [] + + for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in \ + enumerate(summand_descriptors): + if is_gpu_scalar: + preamble.append( + "texture <%s, 1, cudaReadModeElementType> tex_a%d;" + % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True), i)) + args.append(VectorArg(vector_dtype, "x%d" % i)) + tex_names.append("tex_a%d" % i) + loop_prep.append( + "%s a%d = fp_tex1Dfetch(tex_a%d, 0)" + % (dtype_to_ctype(scalar_dtype), i, i)) + else: + args.append(ScalarArg(scalar_dtype, "a%d" % i)) + args.append(VectorArg(vector_dtype, "x%d" % i)) + + summands.append("a%d*x%d[i]" % (i, i)) + + args.append(VectorArg(dtype_z, "z")) + args.append(ScalarArg(numpy.uintp, "n")) + + mod = get_elwise_module(args, + "z[i] = " + " + ".join(summands), + "linear_combination", + preamble="\n".join(preamble), + loop_prep=";\n".join(loop_prep)) + + func = mod.get_function("linear_combination") + tex_src = [mod.get_texref(tn) for tn in tex_names] + func.prepare("".join(arg.struct_char for arg in args), + (1,1,1), texrefs=tex_src) + + return func, tex_src + + + + +@context_dependent_memoize +def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z): + return get_elwise_kernel(context, + "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z" % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = a*x[i] + b*y[i]", + "axpbyz") + +@context_dependent_memoize +def get_axpbz_kernel(context, dtype): + return get_elwise_kernel(context, + "%(tp)s a, %(tp)s *x,%(tp)s b, %(tp)s *z" % { + "tp": dtype_to_ctype(dtype)}, + "z[i] = a * x[i] + b", + "axpb") + +@context_dependent_memoize +def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z): + return get_elwise_kernel(context, + "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = x[i] * y[i]", + "multiply") + +@context_dependent_memoize +def get_divide_kernel(context, dtype_x, dtype_y, dtype_z): + return get_elwise_kernel(context, + "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = x[i] / y[i]", + "divide") + +@context_dependent_memoize +def get_rdivide_elwise_kernel(context, dtype): + return get_elwise_kernel(context, + "%(tp)s *x, %(tp)s y, %(tp)s *z" % { + "tp": dtype_to_ctype(dtype), + }, + "z[i] = y / x[i]", + "divide_r") + +@context_dependent_memoize +def get_fill_kernel(context, dtype): + return get_elwise_kernel(context, + "%(tp)s a, %(tp)s *z" % { + "tp": dtype_to_ctype(dtype), + }, + "z[i] = a", + "fill") + +@context_dependent_memoize +def get_reverse_kernel(context, dtype): + return get_elwise_kernel(context, + "%(tp)s *y, %(tp)s *z" % { + "tp": dtype_to_ctype(dtype), + }, + "z[i] = y[n-1-i]", + "reverse") + +@context_dependent_memoize +def get_arange_kernel(context, dtype): + return get_elwise_kernel(context, + "%(tp)s *z, %(tp)s start, %(tp)s step" % { + "tp": dtype_to_ctype(dtype), + }, + "z[i] = start + i*step", + "arange") + + +@context_dependent_memoize +def 
get_pow_kernel(context, dtype): + return get_elwise_kernel(context, + "%(tp)s value, %(tp)s *y, %(tp)s *z" % { + "tp": dtype_to_ctype(dtype), + }, + "z[i] = pow(y[i], value)", + "pow_method") + +@context_dependent_memoize +def get_pow_array_kernel(context, dtype_x, dtype_y, dtype_z): + return get_elwise_kernel(context, + "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = pow(x[i], y[i])", + "pow_method") + +@context_dependent_memoize +def get_fmod_kernel(context): + return get_elwise_kernel(context, + "float *arg, float *mod, float *z", + "z[i] = fmod(arg[i], mod[i])", + "fmod_kernel") + +@context_dependent_memoize +def get_modf_kernel(context): + return get_elwise_kernel(context, + "float *x, float *intpart ,float *fracpart", + "fracpart[i] = modf(x[i], &intpart[i])", + "modf_kernel") + +@context_dependent_memoize +def get_frexp_kernel(context): + return get_elwise_kernel(context, + "float *x, float *significand, float *exponent", + """ + int expt = 0; + significand[i] = frexp(x[i], &expt); + exponent[i] = expt; + """, + "frexp_kernel") + +@context_dependent_memoize +def get_ldexp_kernel(context): + return get_elwise_kernel(context, + "float *sig, float *expt, float *z", + "z[i] = ldexp(sig[i], int(expt[i]))", + "ldexp_kernel") + +@context_dependent_memoize +def get_unary_func_kernel(context, func_name, in_dtype, out_dtype=None): + if out_dtype is None: + out_dtype = in_dtype + + return get_elwise_kernel(context, + "%(tp_in)s *y, %(tp_out)s *z" % { + "tp_in": dtype_to_ctype(in_dtype), + "tp_out": dtype_to_ctype(out_dtype), + }, + "z[i] = %s(y[i])" % func_name, + "%s_kernel" % func_name) + + + + +@context_dependent_memoize +def get_if_positive_kernel(context, crit_dtype, dtype): + return get_elwise_kernel(context, [ + VectorArg(crit_dtype, "crit"), + VectorArg(dtype, "then_"), + VectorArg(dtype, "else_"), + VectorArg(dtype, "result"), + ], + "result[i] = crit[i] > 0 ? then_[i] : else_[i]", + "if_positive") diff --git a/pyopencl/tools.py b/pyopencl/tools.py new file mode 100644 index 00000000..aca8c4c9 --- /dev/null +++ b/pyopencl/tools.py @@ -0,0 +1,173 @@ +"""H.""" + +from __future__ import division + +__copyright__ = "Copyright (C) 2010 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. 
+""" + + + +import numpy +from decorator import decorator + + + + +@decorator +def context_dependent_memoize(func, context, *args): + """Provides memoization for things that get created inside + a context, i.e. mainly programs and kernels. Assumes that + the first argument of the decorated function is the context. + """ + dicname = "_ctx_memoize_dic_%s_%x" % ( + func.__name__, hash(func)) + + try: + return getattr(context, dicname)[args] + except AttributeError: + result = func(context, *args) + setattr(context, dicname, {args: result}) + return result + except KeyError: + result = func(context, *args) + getattr(context,dicname)[args] = result + return result + + + +# {{{ C code generation helpers ----------------------------------------------- +def dtype_to_ctype(dtype): + if dtype is None: + raise ValueError("dtype may not be None") + + dtype = numpy.dtype(dtype) + if dtype == numpy.int64: + return "long" + elif dtype == numpy.uint64: + return "unsigned long" + elif dtype == numpy.int32: + return "int" + elif dtype == numpy.uint32: + return "unsigned int" + elif dtype == numpy.int16: + return "short int" + elif dtype == numpy.uint16: + return "short unsigned int" + elif dtype == numpy.int8: + return "signed char" + elif dtype == numpy.uint8: + return "unsigned char" + elif dtype == numpy.bool: + return "bool" + elif dtype == numpy.float32: + return "float" + elif dtype == numpy.float64: + return "double" + elif dtype == numpy.complex64: + return "complex float" + elif dtype == numpy.complex128: + return "complex double" + else: + raise ValueError, "unable to map dtype '%s'" % dtype + +# }}} + + + + +# {{{ C argument lists -------------------------------------------------------- +class Argument: + def __init__(self, dtype, name): + self.dtype = numpy.dtype(dtype) + self.name = name + + def __repr__(self): + return "%s(%r, %s)" % ( + self.__class__.__name__, + self.name, + self.dtype) + +class VectorArg(Argument): + def declarator(self): + return "__global %s *%s" % (dtype_to_ctype(self.dtype), self.name) + +class ScalarArg(Argument): + def declarator(self): + return "%s %s" % (dtype_to_ctype(self.dtype), self.name) + + + + + +def parse_c_arg(c_arg): + c_arg = (c_arg + .replace("const", "") + .replace("volatile", "") + .replace("__global", "") + .replace("__local", "") + .replace("__constant", "")) + + # process and remove declarator + import re + decl_re = re.compile(r"(\**)\s*([_a-zA-Z0-9]+)(\s*\[[ 0-9]*\])*\s*$") + decl_match = decl_re.search(c_arg) + + if decl_match is None: + raise ValueError("couldn't parse C declarator '%s'" % c_arg) + + name = decl_match.group(2) + + if decl_match.group(1) or decl_match.group(3) is not None: + arg_class = VectorArg + else: + arg_class = ScalarArg + + tp = c_arg[:decl_match.start()] + tp = " ".join(tp.split()) + + if tp == "float": dtype = numpy.float32 + elif tp == "double": dtype = numpy.float64 + elif tp == "pycuda::complex<float>": dtype = numpy.complex64 + elif tp == "pycuda::complex<double>": dtype = numpy.complex128 + elif tp in ["int", "signed int"]: dtype = numpy.int32 + elif tp in ["unsigned", "unsigned int"]: dtype = numpy.uint32 + elif tp in ["long", "long int"]: dtype = numpy.int64 + elif tp in ["unsigned long", "unsigned long int"]: + dtype = numpy.uint64 + elif tp in ["short", "short int"]: dtype = numpy.int16 + elif tp in ["unsigned short", "unsigned short int"]: dtype = numpy.uint16 + elif tp in ["char"]: dtype = numpy.int8 + elif tp in ["unsigned char"]: dtype = numpy.uint8 + elif tp in ["bool"]: dtype = numpy.bool + else: raise 
ValueError, "unknown type '%s'" % tp + + return arg_class(dtype, name) + +# }}} + + + + +# vim: foldmethod=marker diff --git a/setup.py b/setup.py index 0e488cd8..0088d0bd 100644 --- a/setup.py +++ b/setup.py @@ -128,7 +128,8 @@ def main(): install_requires=[ "pytools>=7", - "py>=1.0.2" + "py>=1.0.2", + "decorator>=3.2.0", ], ext_package="pyopencl", diff --git a/test/test_array.py b/test/test_array.py new file mode 100644 index 00000000..a1f9d263 --- /dev/null +++ b/test/test_array.py @@ -0,0 +1,593 @@ +#! /usr/bin/env python +import numpy +import numpy.linalg as la +import sys +import pytools.test + + + + +def have_cl(): + try: + import pyopencl + return True + except: + return False + +if have_cl(): + import pyopencl.array as cl_array + import pyopencl as cl + + + + +def has_double_support(dev): + for ext in dev.extensions.split(" "): + if ext == "cl_khr_fp64": + return True + return False + + + + +@pytools.test.mark_test.opencl +def test_pow_array(ctx_getter): + context = ctx_getter() + queue = cl.CommandQueue(context) + + a = numpy.array([1,2,3,4,5]).astype(numpy.float32) + a_gpu = cl_array.to_device(context, queue, a) + + result = pow(a_gpu,a_gpu).get() + assert (numpy.abs(a**a - result) < 1e-3).all() + + result = (a_gpu**a_gpu).get() + assert (numpy.abs(pow(a, a) - result) < 1e-3).all() + + + + +@pytools.test.mark_test.opencl +def test_pow_number(ctx_getter): + context = ctx_getter() + queue = cl.CommandQueue(context) + + a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32) + a_gpu = cl_array.to_device(context, queue, a) + + result = pow(a_gpu, 2).get() + assert (numpy.abs(a**2 - result) < 1e-3).all() + + + +@pytools.test.mark_test.opencl +def test_abs(ctx_getter): + context = ctx_getter() + queue = cl.CommandQueue(context) + + a = -cl_array.arange(context, queue, 111, dtype=numpy.float32) + res = a.get() + + for i in range(111): + assert res[i] <= 0 + + a = abs(a) + + res = a.get() + + for i in range (111): + assert abs(res[i]) >= 0 + assert res[i] == i + + +@pytools.test.mark_test.opencl +def test_len(ctx_getter): + context = ctx_getter() + queue = cl.CommandQueue(context) + + a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32) + a_cpu = cl_array.to_device(context, queue, a) + assert len(a_cpu) == 10 + + + + +@pytools.test.mark_test.opencl +def test_multiply(ctx_getter): + """Test the muliplication of an array with a scalar. 
""" + + context = ctx_getter() + queue = cl.CommandQueue(context) + + + for sz in [10, 50000]: + for dtype, scalars in [ + (numpy.float32, [2]), + #(numpy.complex64, [2, 2j]) + ]: + for scalar in scalars: + a = numpy.arange(sz).astype(dtype) + a_gpu = cl_array.to_device(context, queue, a) + a_doubled = (scalar * a_gpu).get() + + assert (a * scalar == a_doubled).all() + +@pytools.test.mark_test.opencl +def test_multiply_array(ctx_getter): + """Test the multiplication of two arrays.""" + + context = ctx_getter() + queue = cl.CommandQueue(context) + + a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32) + + a_gpu = cl_array.to_device(context, queue, a) + b_gpu = cl_array.to_device(context, queue, a) + + a_squared = (b_gpu*a_gpu).get() + + assert (a*a == a_squared).all() + + + + +@pytools.test.mark_test.opencl +def test_addition_array(ctx_getter): + """Test the addition of two arrays.""" + + context = ctx_getter() + queue = cl.CommandQueue(context) + + a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32) + a_gpu = cl_array.to_device(context, queue, a) + a_added = (a_gpu+a_gpu).get() + + assert (a+a == a_added).all() + + + + +@pytools.test.mark_test.opencl +def test_addition_scalar(ctx_getter): + """Test the addition of an array and a scalar.""" + + context = ctx_getter() + queue = cl.CommandQueue(context) + + a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32) + a_gpu = cl_array.to_device(context, queue, a) + a_added = (7+a_gpu).get() + + assert (7+a == a_added).all() + + + + +@pytools.test.mark_test.opencl +def test_substract_array(ctx_getter): + """Test the substraction of two arrays.""" + #test data + a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32) + b = numpy.array([10,20,30,40,50,60,70,80,90,100]).astype(numpy.float32) + + context = ctx_getter() + queue = cl.CommandQueue(context) + + a_gpu = cl_array.to_device(context, queue, a) + b_gpu = cl_array.to_device(context, queue, b) + + result = (a_gpu-b_gpu).get() + assert (a-b == result).all() + + result = (b_gpu-a_gpu).get() + assert (b-a == result).all() + + + + +@pytools.test.mark_test.opencl +def test_substract_scalar(ctx_getter): + """Test the substraction of an array and a scalar.""" + + context = ctx_getter() + queue = cl.CommandQueue(context) + + #test data + a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32) + + #convert a to a gpu object + a_gpu = cl_array.to_device(context, queue, a) + + result = (a_gpu-7).get() + assert (a-7 == result).all() + + result = (7-a_gpu).get() + assert (7-a == result).all() + + + + +@pytools.test.mark_test.opencl +def test_divide_scalar(ctx_getter): + """Test the division of an array and a scalar.""" + + context = ctx_getter() + queue = cl.CommandQueue(context) + + a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32) + a_gpu = cl_array.to_device(context, queue, a) + + result = (a_gpu/2).get() + assert (a/2 == result).all() + + result = (2/a_gpu).get() + assert (2/a == result).all() + + + + +@pytools.test.mark_test.opencl +def test_divide_array(ctx_getter): + """Test the division of an array and a scalar. 
""" + + context = ctx_getter() + queue = cl.CommandQueue(context) + + #test data + a = numpy.array([10,20,30,40,50,60,70,80,90,100]).astype(numpy.float32) + b = numpy.array([10,10,10,10,10,10,10,10,10,10]).astype(numpy.float32) + + a_gpu = cl_array.to_device(context, queue, a) + b_gpu = cl_array.to_device(context, queue, b) + + a_divide = (a_gpu/b_gpu).get() + assert (numpy.abs(a/b - a_divide) < 1e-3).all() + + a_divide = (b_gpu/a_gpu).get() + assert (numpy.abs(b/a - a_divide) < 1e-3).all() + + + + +@pytools.test.mark_test.opencl +def test_random(ctx_getter): + context = ctx_getter() + queue = cl.CommandQueue(context) + + from pyopencl.clrandom import rand as clrand + + if has_double_support(context.devices[0]): + dtypes = [numpy.float32, numpy.float64] + else: + dtypes = [numpy.float32] + + for dtype in dtypes: + a = clrand(context, queue, (10, 100), dtype=dtype).get() + + assert (0 <= a).all() + assert (a < 1).all() + + + + +@pytools.test.mark_test.opencl +def test_nan_arithmetic(ctx_getter): + context = ctx_getter() + queue = cl.CommandQueue(context) + + def make_nan_contaminated_vector(size): + shape = (size,) + a = numpy.random.randn(*shape).astype(numpy.float32) + #for i in range(0, shape[0], 3): + #a[i] = float('nan') + from random import randrange + for i in range(size//10): + a[randrange(0, size)] = float('nan') + return a + + size = 1 << 20 + + a = make_nan_contaminated_vector(size) + a_gpu = cl_array.to_device(context, queue, a) + b = make_nan_contaminated_vector(size) + b_gpu = cl_array.to_device(context, queue, b) + + ab = a*b + ab_gpu = (a_gpu*b_gpu).get() + + for i in range(size): + assert numpy.isnan(ab[i]) == numpy.isnan(ab_gpu[i]) + + + + +@pytools.test.mark_test.opencl +def test_elwise_kernel(ctx_getter): + context = ctx_getter() + queue = cl.CommandQueue(context) + + from pyopencl.clrandom import rand as clrand + + a_gpu = clrand(context, queue, (50,), numpy.float32) + b_gpu = clrand(context, queue, (50,), numpy.float32) + + from pyopencl.elementwise import ElementwiseKernel + lin_comb = ElementwiseKernel(context, + "float a, float *x, float b, float *y, float *z", + "z[i] = a*x[i] + b*y[i]", + "linear_combination") + + c_gpu = cl_array.empty_like(a_gpu) + lin_comb(5, a_gpu, 6, b_gpu, c_gpu) + + assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5 + + + + +@pytools.test.mark_test.opencl +def test_take(ctx_getter): + context = ctx_getter() + queue = cl.CommandQueue(context) + + idx = cl_array.arange(context, queue, 0, 200000, 2, dtype=numpy.uint32) + a = cl_array.arange(context, queue, 0, 600000, 3, dtype=numpy.float32) + result = cl_array.take(a, idx) + assert ((3*idx).get() == result.get()).all() + + + + +@pytools.test.mark_test.opencl +def test_arange(ctx_getter): + context = ctx_getter() + queue = cl.CommandQueue(context) + + n = 5000 + a = cl_array.arange(context, queue, n, dtype=numpy.float32) + assert (numpy.arange(n, dtype=numpy.float32) == a.get()).all() + + + + +@pytools.test.mark_test.opencl +def test_reverse(ctx_getter): + context = ctx_getter() + queue = cl.CommandQueue(context) + + n = 5000 + a = numpy.arange(n).astype(numpy.float32) + a_gpu = cl_array.to_device(context, queue, a) + + a_gpu = a_gpu.reverse() + + assert (a[::-1] == a_gpu.get()).all() + +if False: + # not yet + + @pytools.test.mark_test.opencl + def test_sum(ctx_getter): + context = ctx_getter() + queue = cl.CommandQueue(context) + + from pyopencl.clrandom import rand as clrand + + a_gpu = clrand(context, queue, (200000,)) + a = a_gpu.get() + + sum_a = numpy.sum(a) + + from 
+        sum_a_gpu = cl_array.sum(a_gpu).get()
+
+        assert abs(sum_a_gpu-sum_a)/abs(sum_a) < 1e-4
+
+    @pytools.test.mark_test.opencl
+    def test_minmax(ctx_getter):
+        context = ctx_getter()
+        queue = cl.CommandQueue(context)
+
+        from pyopencl.clrandom import rand as clrand
+
+        if has_double_support(context.devices[0]):
+            dtypes = [numpy.float64, numpy.float32, numpy.int32]
+        else:
+            dtypes = [numpy.float32, numpy.int32]
+
+        for what in ["min", "max"]:
+            for dtype in dtypes:
+                a_gpu = clrand(context, queue, (200000,), dtype)
+                a = a_gpu.get()
+
+                op_a = getattr(numpy, what)(a)
+                op_a_gpu = getattr(cl_array, what)(a_gpu).get()
+
+                assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)
+
+    @pytools.test.mark_test.opencl
+    def test_subset_minmax(ctx_getter):
+        context = ctx_getter()
+        queue = cl.CommandQueue(context)
+
+        from pyopencl.clrandom import rand as clrand
+
+        l_a = 200000
+        gran = 5
+        l_m = l_a - l_a // gran + 1
+
+        if has_double_support(context.devices[0]):
+            dtypes = [numpy.float64, numpy.float32, numpy.int32]
+        else:
+            dtypes = [numpy.float32, numpy.int32]
+
+        for dtype in dtypes:
+            a_gpu = clrand(context, queue, (l_a,), dtype)
+            a = a_gpu.get()
+
+            meaningful_indices_gpu = cl_array.zeros(context, queue, l_m, dtype=numpy.int32)
+            meaningful_indices = meaningful_indices_gpu.get()
+            j = 0
+            for i in range(len(meaningful_indices)):
+                meaningful_indices[i] = j
+                j = j + 1
+                if j % gran == 0:
+                    j = j + 1
+
+            meaningful_indices_gpu = cl_array.to_device(context, queue, meaningful_indices)
+            b = a[meaningful_indices]
+
+            min_a = numpy.min(b)
+            min_a_gpu = cl_array.subset_min(meaningful_indices_gpu, a_gpu).get()
+
+            assert min_a_gpu == min_a
+
+    @pytools.test.mark_test.opencl
+    def test_dot(ctx_getter):
+        context = ctx_getter()
+        queue = cl.CommandQueue(context)
+
+        from pyopencl.clrandom import rand as clrand
+        a_gpu = clrand(context, queue, (200000,))
+        a = a_gpu.get()
+        b_gpu = clrand(context, queue, (200000,))
+        b = b_gpu.get()
+
+        dot_ab = numpy.dot(a, b)
+
+        dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()
+
+        assert abs(dot_ab_gpu-dot_ab)/abs(dot_ab) < 1e-4
+
+    @pytools.test.mark_test.opencl
+    def test_slice(ctx_getter):
+        context = ctx_getter()
+        queue = cl.CommandQueue(context)
+
+        from pyopencl.clrandom import rand as clrand
+
+        l = 20000
+        a_gpu = clrand(context, queue, (l,))
+        a = a_gpu.get()
+
+        from random import randrange
+        for i in range(200):
+            start = randrange(l)
+            end = randrange(start, l)
+
+            a_gpu_slice = a_gpu[start:end]
+            a_slice = a[start:end]
+
+            assert la.norm(a_gpu_slice.get()-a_slice) == 0
+
+@pytools.test.mark_test.opencl
+def test_if_positive(ctx_getter):
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.clrandom import rand as clrand
+
+    l = 20000
+    a_gpu = clrand(context, queue, (l,), numpy.float32)
+    b_gpu = clrand(context, queue, (l,), numpy.float32)
+    a = a_gpu.get()
+    b = b_gpu.get()
+
+    max_a_b_gpu = cl_array.maximum(a_gpu, b_gpu)
+    min_a_b_gpu = cl_array.minimum(a_gpu, b_gpu)
+
+    print max_a_b_gpu
+    print numpy.maximum(a, b)
+
+    assert la.norm(max_a_b_gpu.get()- numpy.maximum(a, b)) == 0
+    assert la.norm(min_a_b_gpu.get()- numpy.minimum(a, b)) == 0
+
+@pytools.test.mark_test.opencl
+def test_take_put(ctx_getter):
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    for n in [5, 17, 333]:
+        one_field_size = 8
+        buf_gpu = cl_array.zeros(context, queue,
+                n*one_field_size, dtype=numpy.float32)
+        dest_indices = cl_array.to_device(context, queue,
+                numpy.array([ 0, 1, 2, 3, 32, 33, 34, 35], dtype=numpy.uint32))
+        read_map = cl_array.to_device(context, queue,
+                numpy.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=numpy.uint32))
+
+        cl_array.multi_take_put(
+                arrays=[buf_gpu for i in range(n)],
+                dest_indices=dest_indices,
+                src_indices=read_map,
+                src_offsets=[i*one_field_size for i in range(n)],
+                dest_shape=(96,))
+
+@pytools.test.mark_test.opencl
+def test_astype(ctx_getter):
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.clrandom import rand as clrand
+
+    if not has_double_support(context.devices[0]):
+        return
+
+    a_gpu = clrand(context, queue, (2000,), dtype=numpy.float32)
+
+    a = a_gpu.get().astype(numpy.float64)
+    a2 = a_gpu.astype(numpy.float64).get()
+
+    assert a2.dtype == numpy.float64
+    assert la.norm(a - a2) == 0, (a, a2)
+
+    a_gpu = clrand(context, queue, (2000,), dtype=numpy.float64)
+
+    a = a_gpu.get().astype(numpy.float32)
+    a2 = a_gpu.astype(numpy.float32).get()
+
+    assert a2.dtype == numpy.float32
+    assert la.norm(a - a2)/la.norm(a) < 1e-7
+
+
+
+
+def pytest_generate_tests(metafunc):
+    if have_cl():
+        import pyopencl as cl
+    else:
+        # will still show "skipped" messages
+        return
+
+    if ("device" in metafunc.funcargnames
+            or "ctx_getter" in metafunc.funcargnames):
+        arg_dict = {}
+
+        for platform in cl.get_platforms():
+            if "platform" in metafunc.funcargnames:
+                arg_dict["platform"] = platform
+
+            for device in platform.get_devices():
+                if "device" in metafunc.funcargnames:
+                    arg_dict["device"] = device
+
+                if "ctx_getter" in metafunc.funcargnames:
+                    arg_dict["ctx_getter"] = lambda device=device: cl.Context([device])
+
+                metafunc.addcall(funcargs=arg_dict.copy(),
+                        id=", ".join("%s=%s" % (arg, value)
+                            for arg, value in arg_dict.iteritems()))
+
+    elif "platform" in metafunc.funcargnames:
+        for platform in cl.get_platforms():
+            metafunc.addcall(
+                    funcargs=dict(platform=platform),
+                    id=str(platform))
+
+
+
+
+if __name__ == "__main__":
+    # make sure that import failures get reported, instead of skipping the tests.
+    import pyopencl as cl
+
+    import sys
+    if len(sys.argv) > 1:
+        exec sys.argv[1]
+    else:
+        from py.test.cmdline import main
+        main([__file__])
-- 
GitLab