From 31f43762443d7a6d8b4da347a0f900ca232b16d5 Mon Sep 17 00:00:00 2001 From: Stefan van der Walt <stefan@sun.ac.za> Date: Sat, 3 Jul 2010 14:15:36 -0500 Subject: [PATCH] Begin working on kernel invocation refactor. --- pyopencl/array.py | 185 +++++++++++++++++++++++----------------- pyopencl/elementwise.py | 53 ++++++------ 2 files changed, 134 insertions(+), 104 deletions(-) diff --git a/pyopencl/array.py b/pyopencl/array.py index c0c74ef6..450aec8e 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -33,19 +33,19 @@ OTHER DEALINGS IN THE SOFTWARE. import numpy import pyopencl.elementwise as elementwise import pyopencl as cl +#from pytools import memoize_method +from decorator import decorator -def splay(ctx, n): - max_work_items = max(dev.max_work_group_size for dev in ctx.devices) - max_work_items = min(128, max_work_items) +def splay(queue, n): + dev = queue.device + max_work_items = min(128, dev.max_work_group_size) min_work_items = min(32, max_work_items) - max_groups = max( - 4 * dev.max_compute_units * 8 - # 4 to overfill the device - # 8 is an Nvidia constant--that's how many - # groups fit onto one compute device - for dev in ctx.devices) + max_groups = dev.max_compute_units * 4 * 8 + # 4 to overfill the device + # 8 is an Nvidia constant--that's how many + # groups fit onto one compute device if n < min_work_items: group_count = 1 @@ -74,6 +74,39 @@ def _get_common_dtype(obj1, obj2): +@decorator +def elwise_kernel_runner(kernel_getter, *args, **kwargs): + """Take a kernel getter of the same signature as the kernel + and return a function that invokes that kernel. + + Assumes that the zeroth entry in *args* is an :class:`Array`. + """ + + knl = kernel_getter(*args) + + repr_ary = args[0] + assert isinstance(repr_ary, Array) + + queue = kwargs.pop("queue", None) or repr_ary.queue + gs, ls = repr_ary.get_sizes(queue) + + if kwargs: + raise TypeError("only the 'queue' keyword argument is supported") + + actual_args = [] + for arg in args: + if isinstance(arg, Array): + actual_args.append(arg.data) + else: + actual_args.append(arg) + actual_args.append(repr_ary.mem_size) + + return knl(queue, gs, ls, *actual_args) + + + + + class DefaultAllocator: def __init__(self, context, flags=cl.mem_flags.READ_WRITE): self.context = context @@ -132,7 +165,9 @@ class Array(object): self.base = base - self._global_size, self._local_size = splay(context, self.mem_size) + #@memoize_method FIXME: reenable + def get_sizes(self, queue): + return splay(self.queue, self.mem_size) def set(self, ary, queue=None, async=False): assert ary.size == self.size @@ -170,38 +205,28 @@ class Array(object): raise TypeError("pyopencl arrays are not hashable.") # kernel invocation wrappers ---------------------------------------------- - def _axpbyz(self, selffac, other, otherfac, out, queue=None): + @elwise_kernel_runner + @staticmethod + def _axpbyz(out, afac, a, bfac, b, queue=None): """Compute ``out = selffac * self + otherfac*other``, where `other` is a vector..""" assert self.shape == other.shape - knl = elementwise.get_axpbyz_kernel( + return elementwise.get_axpbyz_kernel( self.context, self.dtype, other.dtype, out.dtype) - knl(queue or self.queue, self._global_size, self._local_size, - selffac, self.data, otherfac, other.data, - out.data, self.mem_size) - - return out - - def _axpbz(self, selffac, other, out, queue=None): + @elwise_kernel_runner + def _axpbz(self, out, selffac, other, queue=None): """Compute ``out = selffac * self + other``, where `other` is a scalar.""" - knl = elementwise.get_axpbz_kernel(self.context, self.dtype) - knl(queue or self.queue, self._global_size, self._local_size, - selffac, self.data, other, out.data, self.mem_size) + return elementwise.get_axpbz_kernel(self.context, self.dtype) - return out - - def _elwise_multiply(self, other, out, queue=None): - knl = elementwise.get_multiply_kernel( + @elwise_kernel_runner + def _elwise_multiply(self, out, other, queue=None): + return elementwise.get_multiply_kernel( self.context, self.dtype, other.dtype, out.dtype) - knl(queue or self.queue, self._global_size, self._local_size, - self.data, other.data, - out.data, self.mem_size) - - return out - def _rdiv_scalar(self, other, out, queue=None): + @elwise_kernel_runner + def _rdiv_scalar(self, out, other, queue=None): """Divides an array by a scalar:: y = n / self @@ -209,24 +234,29 @@ class Array(object): assert self.dtype == numpy.float32 - knl = elementwise.get_rdivide_elwise_kernel(self.context, self.dtype) - knl(queue or self.queue, self._global_size, self._local_size, - self.data, other, - out.data, self.mem_size) - - return out + return elementwise.get_rdivide_elwise_kernel(self.context, self.dtype) - def _div(self, other, out, queue=None): + @elwise_kernel_runner + def _div(self, out, other, queue=None): """Divides an array by another array.""" assert self.shape == other.shape - knl = elementwise.get_divide_kernel(self.context, + return elementwise.get_divide_kernel(self.context, self.dtype, other.dtype, out.dtype) - knl(queue or self.queue, self._global_size, self._local_size, - self.data, other.data, out.data, self.mem_size) - return out + @elwise_kernel_runner + def _fill(result, scalar): + return elementwise.get_fill_kernel(result.context, result.dtype) + + @elwise_kernel_runner + def _abs(result, arg): + return elementwise.get_unary_func_kernel( + arg.context, fname, arg.dtype) + + @elwise_kernel_runner + def _pow_scalar(self, ): + return elementwise.get_pow_kernel(self.context, self.dtype) def _new_like_me(self, dtype=None): if dtype is None: @@ -236,11 +266,12 @@ class Array(object): queue=self.queue) # operators --------------------------------------------------------------- - def mul_add(self, selffac, other, otherfac, add_timer=None, queue=None): + def mul_add(self, selffac, other, otherfac, queue=None): """Return `selffac * self + otherfac*other`. """ result = self._new_like_me(_get_common_dtype(self, other)) - return self._axpbyz(selffac, other, otherfac, result, add_timer) + self._axpbyz(result, selffac, self, other, otherfac) + return result def __add__(self, other): """Add an array with an array or an array with a scalar.""" @@ -248,14 +279,16 @@ class Array(object): if isinstance(other, Array): # add another vector result = self._new_like_me(_get_common_dtype(self, other)) - return self._axpbyz(1, other, 1, result) + self._axpbyz(result, 1, self, 1, other) + return result else: # add a scalar if other == 0: return self else: result = self._new_like_me() - return self._axpbz(1, other, result) + self._axpbz(result, 1, other) + return result __radd__ = __add__ @@ -264,14 +297,15 @@ class Array(object): if isinstance(other, Array): result = self._new_like_me(_get_common_dtype(self, other)) - return self._axpbyz(1, other, -1, result) + self._axpbyz(result, 1, other, -1) + return result else: if other == 0: return self else: # create a new array for the result result = self._new_like_me() - return self._axpbz(1, -other, result) + return self._axpbz(result, 1, -other) def __rsub__(self,other): """Substracts an array by a scalar or an array:: @@ -280,32 +314,37 @@ class Array(object): """ # other must be a scalar result = self._new_like_me() - return self._axpbz(-1, other, result) + return self._axpbz(result, -1, other) def __iadd__(self, other): - return self._axpbyz(1, other, 1, self) + self._axpbyz(self, 1, self, 1, other) + return self def __isub__(self, other): - return self._axpbyz(1, other, -1, self) + return self._axpbyz(self, 1, self, -1, other) def __neg__(self): result = self._new_like_me() - return self._axpbz(-1, 0, result) + return self._axpbz(result, -1, self) def __mul__(self, other): if isinstance(other, Array): result = self._new_like_me(_get_common_dtype(self, other)) - return self._elwise_multiply(other, result) + self._elwise_multiply(result, other) + return result else: result = self._new_like_me() - return self._axpbz(other, 0, result) + self._axpbz(result, other, self, 0) + return result def __rmul__(self, scalar): result = self._new_like_me() - return self._axpbz(scalar, 0, result) + self._axpbz(result, scalar, self, 0) + return result def __imul__(self, scalar): - return self._axpbz(scalar, 0, self) + self._axpbz(self, scalar, 0) + return self def __div__(self, other): """Divides an array by an array or a scalar:: @@ -314,14 +353,15 @@ class Array(object): """ if isinstance(other, Array): result = self._new_like_me(_get_common_dtype(self, other)) - return self._div(other, result) + return self._div(result, other) else: if other == 1: return self else: # create a new array for the result result = self._new_like_me() - return self._axpbz(1/other, 0, result) + self._axpbz(result, 1/other, 0) + return result __truediv__ = __div__ @@ -333,29 +373,22 @@ class Array(object): if isinstance(other, Array): result = self._new_like_me(_get_common_dtype(self, other)) - - knl = elementwise.get_divide_kernel(self.context) - knl(self.queue, self._global_size, self._local_size, - other.data, self.data, result.data, - self.mem_size) - - return result + other._div(result, self) else: if other == 1: return self else: # create a new array for the result result = self._new_like_me() - return self._rdiv_scalar(other, result) + self._rdiv_scalar(result, other) + + return result __rtruediv__ = __div__ def fill(self, value, queue=None): """fills the array with the specified value""" - knl = elementwise.get_fill_kernel(self.context, self.dtype) - knl(queue or self.queue, self._global_size, self._local_size, - value, self.data, self.mem_size) - + self._fill(self, value, queue=queue) return self def __len__(self): @@ -370,8 +403,6 @@ class Array(object): of `self`. """ - result = self._new_like_me() - if self.dtype.kind == "f": fname = "fabs" elif self.dtype.kind in ["u", "i"]: @@ -379,11 +410,8 @@ class Array(object): else: raise TypeError("unsupported dtype in __abs__()") - knl = elementwise.get_unary_func_kernel( - self.context, fname, self.dtype) - knl(self.queue, self._global_size, self._local_size, - self.data,result.data, self.mem_size) - + result = self._new_like_me() + self._abs(result, self) return result def __pow__(self, other): @@ -406,7 +434,6 @@ class Array(object): return result else: result = self._new_like_me() - knl = elementwise.get_pow_kernel(self.context, self.dtype) knl(self.queue, self._global_size, self._local_size, other, self.data, result.data, self.mem_size) diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index 0fc9aaed..4ec64b73 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -152,9 +152,9 @@ class ElementwiseKernel: queue = repr_vec.queue invocation_args.append(repr_vec.mem_size) + gs, ls = repr_vec.get_sizes(queue) self.kernel.set_args(*invocation_args) - return cl.enqueue_nd_range_kernel(queue, self.kernel, - repr_vec._global_size, repr_vec._local_size) + return cl.enqueue_nd_range_kernel(queue, self.kernel, gs, ls) @@ -166,9 +166,9 @@ def get_take_kernel(context, dtype, idx_dtype, vec_count=1): "tp": dtype_to_ctype(dtype), } - args = ([VectorArg(idx_dtype, "idx")] - + [VectorArg(dtype, "src"+str(i))for i in range(vec_count)] - + [VectorArg(dtype, "dest"+str(i))for i in range(vec_count)]) + args = ([VectorArg(dtype, "dest"+str(i))for i in range(vec_count)] + + [VectorArg(idx_dtype, "idx")] + + [VectorArg(dtype, "src"+str(i))for i in range(vec_count)] ) body = ( ("%(idx_tp)s src_idx = idx[i];\n" % ctx) + "\n".join( @@ -188,15 +188,15 @@ def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, vec_count=1): } args = [ + VectorArg(dtype, "dest%d" % i) + for i in range(vec_count) + ] + [ VectorArg(idx_dtype, "gmem_dest_idx"), VectorArg(idx_dtype, "gmem_src_idx"), ] + [ VectorArg(dtype, "src%d" % i) for i in range(vec_count) ] + [ - VectorArg(dtype, "dest%d" % i) - for i in range(vec_count) - ] + [ ScalarArg(idx_dtype, "offset%d" % i) for i in range(vec_count) if with_offsets ] @@ -228,11 +228,11 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1): } args = [ - VectorArg(idx_dtype, "gmem_dest_idx"), - ] + [ VectorArg(dtype, "dest%d" % i) for i in range(vec_count) ] + [ + VectorArg(idx_dtype, "gmem_dest_idx"), + ] + [ VectorArg(dtype, "src%d" % i) for i in range(vec_count) ] @@ -262,6 +262,9 @@ def get_copy_kernel(context, dtype_dest, dtype_src): @context_dependent_memoize def get_linear_combination_kernel(summand_descriptors, dtype_z): + # TODO: Port this! + raise NotImplementedError + from pycuda.tools import dtype_to_ctype from pycuda.elementwise import \ VectorArg, ScalarArg, get_elwise_module @@ -311,7 +314,7 @@ def get_linear_combination_kernel(summand_descriptors, @context_dependent_memoize def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z): return get_elwise_kernel(context, - "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z" % { + "%(tp_z)s *z, %(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), @@ -322,7 +325,7 @@ def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z): @context_dependent_memoize def get_axpbz_kernel(context, dtype): return get_elwise_kernel(context, - "%(tp)s a, %(tp)s *x,%(tp)s b, %(tp)s *z" % { + "%(tp)s *z, %(tp)s a, %(tp)s *x,%(tp)s b" % { "tp": dtype_to_ctype(dtype)}, "z[i] = a * x[i] + b", "axpb") @@ -330,7 +333,7 @@ def get_axpbz_kernel(context, dtype): @context_dependent_memoize def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z): return get_elwise_kernel(context, - "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { + "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), @@ -341,7 +344,7 @@ def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z): @context_dependent_memoize def get_divide_kernel(context, dtype_x, dtype_y, dtype_z): return get_elwise_kernel(context, - "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { + "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y, " % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), @@ -352,7 +355,7 @@ def get_divide_kernel(context, dtype_x, dtype_y, dtype_z): @context_dependent_memoize def get_rdivide_elwise_kernel(context, dtype): return get_elwise_kernel(context, - "%(tp)s *x, %(tp)s y, %(tp)s *z" % { + "%(tp)s *z, %(tp)s *x, %(tp)s y" % { "tp": dtype_to_ctype(dtype), }, "z[i] = y / x[i]", @@ -361,7 +364,7 @@ def get_rdivide_elwise_kernel(context, dtype): @context_dependent_memoize def get_fill_kernel(context, dtype): return get_elwise_kernel(context, - "%(tp)s a, %(tp)s *z" % { + "%(tp)s *z, %(tp)s a" % { "tp": dtype_to_ctype(dtype), }, "z[i] = a", @@ -370,7 +373,7 @@ def get_fill_kernel(context, dtype): @context_dependent_memoize def get_reverse_kernel(context, dtype): return get_elwise_kernel(context, - "%(tp)s *y, %(tp)s *z" % { + "%(tp)s *z, %(tp)s *y" % { "tp": dtype_to_ctype(dtype), }, "z[i] = y[n-1-i]", @@ -389,7 +392,7 @@ def get_arange_kernel(context, dtype): @context_dependent_memoize def get_pow_kernel(context, dtype): return get_elwise_kernel(context, - "%(tp)s value, %(tp)s *y, %(tp)s *z" % { + "%(tp)s *z, %(tp)s value, %(tp)s *y, " % { "tp": dtype_to_ctype(dtype), }, "z[i] = pow(y[i], value)", @@ -398,7 +401,7 @@ def get_pow_kernel(context, dtype): @context_dependent_memoize def get_pow_array_kernel(context, dtype_x, dtype_y, dtype_z): return get_elwise_kernel(context, - "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { + "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), @@ -409,21 +412,21 @@ def get_pow_array_kernel(context, dtype_x, dtype_y, dtype_z): @context_dependent_memoize def get_fmod_kernel(context): return get_elwise_kernel(context, - "float *arg, float *mod, float *z", + "float *z, float *arg, float *mod", "z[i] = fmod(arg[i], mod[i])", "fmod_kernel") @context_dependent_memoize def get_modf_kernel(context): return get_elwise_kernel(context, - "float *x, float *intpart ,float *fracpart", + "float *intpart ,float *fracpart, float *x", "fracpart[i] = modf(x[i], &intpart[i])", "modf_kernel") @context_dependent_memoize def get_frexp_kernel(context): return get_elwise_kernel(context, - "float *x, float *significand, float *exponent", + "float *significand, float *exponent, float *x", """ int expt = 0; significand[i] = frexp(x[i], &expt); @@ -434,7 +437,7 @@ def get_frexp_kernel(context): @context_dependent_memoize def get_ldexp_kernel(context): return get_elwise_kernel(context, - "float *sig, float *expt, float *z", + "float *z, float *sig, float *expt", "z[i] = ldexp(sig[i], (int) expt[i])", "ldexp_kernel") @@ -444,7 +447,7 @@ def get_unary_func_kernel(context, func_name, in_dtype, out_dtype=None): out_dtype = in_dtype return get_elwise_kernel(context, - "%(tp_in)s *y, %(tp_out)s *z" % { + "%(tp_out)s *z, %(tp_in)s *y" % { "tp_in": dtype_to_ctype(in_dtype), "tp_out": dtype_to_ctype(out_dtype), }, @@ -457,10 +460,10 @@ def get_unary_func_kernel(context, func_name, in_dtype, out_dtype=None): @context_dependent_memoize def get_if_positive_kernel(context, crit_dtype, dtype): return get_elwise_kernel(context, [ + VectorArg(dtype, "result"), VectorArg(crit_dtype, "crit"), VectorArg(dtype, "then_"), VectorArg(dtype, "else_"), - VectorArg(dtype, "result"), ], "result[i] = crit[i] > 0 ? then_[i] : else_[i]", "if_positive") -- GitLab