From ccf6fd131024306adf71d123394f972b614698fd Mon Sep 17 00:00:00 2001 From: Keegan Owsley Date: Mon, 28 Nov 2016 15:49:00 -0800 Subject: [PATCH 01/10] Allow noncontiguous arrays in elementwise ops --- pycuda/elementwise.py | 135 +++++++++++++++++++++++++++++++++++++++--- pycuda/gpuarray.py | 120 ++++++++++++++++++++++--------------- 2 files changed, 201 insertions(+), 54 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index feab0a6b..61fcc2ad 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -34,9 +34,117 @@ OTHER DEALINGS IN THE SOFTWARE. from pycuda.tools import context_dependent_memoize import numpy as np -from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg +from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg, Argument from pytools import memoize_method +import pycuda.driver as cuda + +import struct +import re + +class DimArrayCuda: + max_dims = 2 + itemsize = 4 + 4 # unsigned int and long int + def __init__(self, vector, n=0): # n unused + if hasattr(vector, '_dimholder'): + self.gpudata = vector._dimholder.gpudata + else: + self.gpudata = cuda.mem_alloc(self.max_dims*self.itemsize) + newstrides = tuple(v // vector.dtype.itemsize for v in vector.strides) + shape = vector.shape[::-1] + newstrides = newstrides[::-1] + if len(shape) < self.max_dims: + shape = shape + (1,)*(self.max_dims-len(shape)) + newstrides = newstrides + (0,)*(self.max_dims-len(newstrides)) + data = shape + newstrides + cuda.memcpy_htod(int(self.gpudata), struct.pack('I'*self.max_dims + 'l'*self.max_dims, *data)) + vector._dimholder = self + +class _DimHolder: + struct_char = 'P' + +def get_elwise_module_noncontig(arguments, operation, + name="kernel", keep=False, options=None, + preamble="", loop_prep="", after_loop="", max_dims=2): + from pycuda.compiler import SourceModule + + vecargs = tuple([a for a in arguments if isinstance(a, VectorArg)]) + names = [V.name for V in vecargs] + + def fix_indexing(t): + try: + n = names.index(t.group(1)) + except NameError: + return t.group(0) + return '{}[i2m({},*__dim{})]'.format(t.group(1),t.group(2),n) + + # regex = "([a-zA-Z_][a-zA-Z0-9_]*)\[((?:[^\[\](?:)]|(?R))*)\]" + # Above works recursively to match outer brackets, we can switch if we deem it necessary + # Requires regex module instead of re + regex = "([a-zA-Z_][a-zA-Z0-9_]*)\[(.+?)\]" + operation = re.sub(regex, fix_indexing, operation) + + dimdef = ''' + typedef struct + { + unsigned n[%(max_dims)i]; + long stride[%(max_dims)i]; + } dim; + + __device__ unsigned i2m(unsigned i, dim d) + { + unsigned m = 0; + unsigned j = i; + for(int k = 0; k < %(max_dims)i; k++) + { + m += d.stride[k] * (j%%d.n[k]); + j = j / d.n[k]; + } + return m; + } + ''' % {"max_dims":max_dims} + + args = list(arg.declarator() for arg in arguments) + args.extend('dim *__dim{}'.format(i) for i in range(len(vecargs))) + + + + src = """ + #include + + %(dimdef)s + + %(preamble)s + + __global__ void %(name)s(%(arguments)s) + { + + unsigned tid = threadIdx.x; + unsigned total_threads = gridDim.x*blockDim.x; + unsigned cta_start = blockDim.x*blockIdx.x; + unsigned i; + + %(loop_prep)s; + + for (i = cta_start + tid; i < n; i += total_threads) + { + %(operation)s; + } + + %(after_loop)s; + } + """ % { + "arguments": ", ".join(args), + "operation": operation, + "name": name, + "preamble": preamble, + "loop_prep": loop_prep, + "after_loop": after_loop, + "dimdef": dimdef + } + + return SourceModule(src, + options=options, keep=keep) def get_elwise_module(arguments, operation, name="kernel", keep=False, options=None, @@ -124,7 +232,7 @@ def get_elwise_range_module(arguments, operation, def get_elwise_kernel_and_types(arguments, operation, - name="kernel", keep=False, options=None, use_range=False, **kwargs): + name="kernel", keep=False, options=None, use_range=False, use_slices=False, **kwargs): if isinstance(arguments, str): from pycuda.tools import parse_c_arg arguments = [parse_c_arg(arg) for arg in arguments.split(",")] @@ -138,18 +246,26 @@ def get_elwise_kernel_and_types(arguments, operation, else: arguments.append(ScalarArg(np.uintp, "n")) + # + # if use_slices: + # module_builder = get_elwise_module_noncontig if use_range: module_builder = get_elwise_range_module else: - module_builder = get_elwise_module + module_builder = get_elwise_module_noncontig mod = module_builder(arguments, operation, name, keep, options, **kwargs) + vectors = [a for a in arguments if isinstance(a, VectorArg)] + + oldargs = arguments.copy() + for i,v in enumerate(vectors): + arguments.append(_DimHolder()) func = mod.get_function(name) func.prepare("".join(arg.struct_char for arg in arguments)) - return func, arguments + return func, oldargs def get_elwise_kernel(arguments, operation, @@ -200,15 +316,20 @@ class ElementwiseKernel: for arg, arg_descr in zip(args, arguments): if isinstance(arg_descr, VectorArg): - if not arg.flags.forc: - raise RuntimeError("elementwise kernel cannot " - "deal with non-contiguous arrays") + # if not arg.flags.forc: + # raise RuntimeError("elementwise kernel cannot " + # "deal with non-contiguous arrays") + + vectors.append(arg) invocation_args.append(arg.gpudata) else: invocation_args.append(arg) + for i,v in enumerate(vectors): + invocations_args.append(DimArrayCuda(v,i).gpudata) + repr_vec = vectors[0] if slice_ is not None: diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 896b2fca..21b7bde2 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -118,16 +118,16 @@ def splay(n, dev=None): def _make_binary_op(operator): def func(self, other): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + # if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") if isinstance(other, GPUArray): assert self.shape == other.shape - if not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + # if not other.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") result = self._new_like_me() func = elementwise.get_binary_op_kernel( @@ -135,7 +135,10 @@ def _make_binary_op(operator): operator) func.prepared_async_call(self._grid, self._block, None, self.gpudata, other.gpudata, result.gpudata, - self.mem_size) + self.mem_size, + elementwise.DimArrayCuda(self, 0).gpudata, + elementwise.DimArrayCuda(other, 1).gpudata, + elementwise.DimArrayCuda(result, 2).gpudata) return result else: # scalar operator @@ -144,7 +147,9 @@ def _make_binary_op(operator): self.dtype, result.dtype, operator) func.prepared_async_call(self._grid, self._block, None, self.gpudata, other, result.gpudata, - self.mem_size) + self.mem_size, + elementwise.DimArrayCuda(self, 0).gpudata, + elementwise.DimArrayCuda(result, 1).gpudata) return result return func @@ -297,47 +302,52 @@ class GPUArray(object): """Compute ``out = selffac * self + otherfac*other``, where `other` is a vector..""" assert self.shape == other.shape - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype) if add_timer is not None: add_timer(3*self.size, func.prepared_timed_call(self._grid, selffac, self.gpudata, otherfac, other.gpudata, - out.gpudata, self.mem_size)) + out.gpudata, self.mem_size, + elementwise.DimArrayCuda(self, 0).gpudata, + elementwise.DimArrayCuda(other, 1).gpudata, + elementwise.DimArrayCuda(out, 2).gpudata)) else: func.prepared_async_call(self._grid, self._block, stream, selffac, self.gpudata, otherfac, other.gpudata, - out.gpudata, self.mem_size) + out.gpudata, self.mem_size, + elementwise.DimArrayCuda(self, 0).gpudata, + elementwise.DimArrayCuda(other, 1).gpudata, + elementwise.DimArrayCuda(out, 2).gpudata) return out def _axpbz(self, selffac, other, out, stream=None): """Compute ``out = selffac * self + other``, where `other` is a scalar.""" - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") func = elementwise.get_axpbz_kernel(self.dtype, out.dtype) func.prepared_async_call(self._grid, self._block, stream, selffac, self.gpudata, - other, out.gpudata, self.mem_size) + other, out.gpudata, self.mem_size, + elementwise.DimArrayCuda(self).gpudata, + elementwise.DimArrayCuda(out).gpudata) return out def _elwise_multiply(self, other, out, stream=None): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + # if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "*") func.prepared_async_call(self._grid, self._block, stream, self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + out.gpudata, self.mem_size, + elementwise.DimArrayCuda(self).gpudata, + elementwise.DimArrayCuda(other).gpudata, + elementwise.DimArrayCuda(out).gpudata) return out @@ -354,24 +364,25 @@ class GPUArray(object): func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype) func.prepared_async_call(self._grid, self._block, stream, self.gpudata, other, - out.gpudata, self.mem_size) + out.gpudata, self.mem_size, + elementwise.DimArrayCuda(self).gpudata, + elementwise.DimArrayCuda(out).gpudata) return out def _div(self, other, out, stream=None): """Divides an array by another array.""" - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - assert self.shape == other.shape func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "/") func.prepared_async_call(self._grid, self._block, stream, self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + out.gpudata, self.mem_size, + elementwise.DimArrayCuda(self, 0).gpudata, + elementwise.DimArrayCuda(other, 1).gpudata, + elementwise.DimArrayCuda(out, 2).gpudata) return out @@ -381,7 +392,8 @@ class GPUArray(object): dtype = self.dtype else: if dtype == self.dtype: - strides = self.strides + # strides = self.strides + strides = None return self.__class__(self.shape, dtype, allocator=self.allocator, strides=strides) @@ -515,7 +527,8 @@ class GPUArray(object): """fills the array with the specified value""" func = elementwise.get_fill_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, stream, - value, self.gpudata, self.mem_size) + value, self.gpudata, self.mem_size, + elementwise.DimArrayCuda(self, 0).gpudata) return self @@ -601,7 +614,9 @@ class GPUArray(object): out_dtype=out_dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, self.mem_size) + self.gpudata, result.gpudata, self.mem_size, + elementwise.DimArrayCuda(self, 0).gpudata, + elementwise.DimArrayCuda(result, 1).gpudata) return result @@ -616,9 +631,9 @@ class GPUArray(object): """ if isinstance(other, GPUArray): - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + # if not self.flags.forc or not other.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") assert self.shape == other.shape @@ -629,19 +644,22 @@ class GPUArray(object): func.prepared_async_call(self._grid, self._block, None, self.gpudata, other.gpudata, result.gpudata, - self.mem_size) + self.mem_size, + elementwise.DimArrayCuda(self, 0).gpudata, + elementwise.DimArrayCuda(other, 1).gpudata, + elementwise.DimArrayCuda(result, 2).gpudata) return result else: - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + # if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") result = self._new_like_me() func = elementwise.get_pow_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, None, other, self.gpudata, result.gpudata, - self.mem_size) + self.mem_size, elementwise.DimArrayCuda(self, 0).gpudata, elementwise.DimArrayCuda(result, 1).gpudata) return result @@ -659,7 +677,9 @@ class GPUArray(object): func = elementwise.get_reverse_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, stream, self.gpudata, result.gpudata, - self.mem_size) + self.mem_size, + elementwise.DimArrayCuda(self).gpudata, + elementwise.DimArrayCuda(out).gpudata) return result @@ -676,7 +696,9 @@ class GPUArray(object): func = elementwise.get_copy_kernel(dtype, self.dtype) func.prepared_async_call(self._grid, self._block, stream, result.gpudata, self.gpudata, - self.mem_size) + self.mem_size, + elementwise.DimArrayCuda(result).gpudata, + elementwise.DimArrayCuda(self).gpudata) return result @@ -906,7 +928,9 @@ class GPUArray(object): func = elementwise.get_real_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, self.gpudata, result.gpudata, - self.mem_size) + self.mem_size, + elementwise.DimArrayCuda(self).gpudata, + elementwise.DimArrayCuda(result).gpudata) return result else: @@ -916,9 +940,6 @@ class GPUArray(object): def imag(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) @@ -928,7 +949,9 @@ class GPUArray(object): func = elementwise.get_imag_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, self.gpudata, result.gpudata, - self.mem_size) + self.mem_size, + elementwise.DimArrayCuda(self).gpudata, + elementwise.DimArrayCuda(result).gpudata) return result else: @@ -946,7 +969,9 @@ class GPUArray(object): func = elementwise.get_conj_kernel(dtype) func.prepared_async_call(self._grid, self._block, None, self.gpudata, result.gpudata, - self.mem_size) + self.mem_size, + elementwise.DimArrayCuda(self).gpudata, + elementwise.DimArrayCuda(result).gpudata) return result else: @@ -1092,7 +1117,8 @@ def arange(*args, **kwargs): func = elementwise.get_arange_kernel(dtype) func.prepared_async_call(result._grid, result._block, kwargs.get("stream"), - result.gpudata, start, step, size) + result.gpudata, start, step, size, + elementwise.DimArrayCuda(result).gpudata) return result -- GitLab From 22c49c8b6c988d91fc57399c68a183ccd25ed3d5 Mon Sep 17 00:00:00 2001 From: Keegan Owsley Date: Thu, 1 Dec 2016 14:31:37 -0800 Subject: [PATCH 02/10] Only use noncontig kernels as necessary --- pycuda/elementwise.py | 137 +++++++--------- pycuda/gpuarray.py | 369 +++++++++++++++++++++++++++--------------- 2 files changed, 302 insertions(+), 204 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 61fcc2ad..12b588d2 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -37,28 +37,10 @@ import numpy as np from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg, Argument from pytools import memoize_method -import pycuda.driver as cuda import struct import re -class DimArrayCuda: - max_dims = 2 - itemsize = 4 + 4 # unsigned int and long int - def __init__(self, vector, n=0): # n unused - if hasattr(vector, '_dimholder'): - self.gpudata = vector._dimholder.gpudata - else: - self.gpudata = cuda.mem_alloc(self.max_dims*self.itemsize) - newstrides = tuple(v // vector.dtype.itemsize for v in vector.strides) - shape = vector.shape[::-1] - newstrides = newstrides[::-1] - if len(shape) < self.max_dims: - shape = shape + (1,)*(self.max_dims-len(shape)) - newstrides = newstrides + (0,)*(self.max_dims-len(newstrides)) - data = shape + newstrides - cuda.memcpy_htod(int(self.gpudata), struct.pack('I'*self.max_dims + 'l'*self.max_dims, *data)) - vector._dimholder = self class _DimHolder: struct_char = 'P' @@ -142,6 +124,7 @@ def get_elwise_module_noncontig(arguments, operation, "after_loop": after_loop, "dimdef": dimdef } + print(src) return SourceModule(src, options=options, keep=keep) @@ -232,7 +215,7 @@ def get_elwise_range_module(arguments, operation, def get_elwise_kernel_and_types(arguments, operation, - name="kernel", keep=False, options=None, use_range=False, use_slices=False, **kwargs): + name="kernel", keep=False, options=None, use_range=False, use_strides=False, **kwargs): if isinstance(arguments, str): from pycuda.tools import parse_c_arg arguments = [parse_c_arg(arg) for arg in arguments.split(",")] @@ -247,23 +230,25 @@ def get_elwise_kernel_and_types(arguments, operation, arguments.append(ScalarArg(np.uintp, "n")) # - # if use_slices: - # module_builder = get_elwise_module_noncontig - if use_range: + if use_strides: + module_builder = get_elwise_module_noncontig + elif use_range: module_builder = get_elwise_range_module else: - module_builder = get_elwise_module_noncontig + module_builder = get_elwise_module mod = module_builder(arguments, operation, name, keep, options, **kwargs) - vectors = [a for a in arguments if isinstance(a, VectorArg)] - oldargs = arguments.copy() - for i,v in enumerate(vectors): - arguments.append(_DimHolder()) + if use_strides: + vectors = [a for a in arguments if isinstance(a, VectorArg)] + + for i,v in enumerate(vectors): + arguments.append(_DimHolder()) func = mod.get_function(name) func.prepare("".join(arg.struct_char for arg in arguments)) + func.stride_support = use_strides return func, oldargs @@ -327,8 +312,8 @@ class ElementwiseKernel: else: invocation_args.append(arg) - for i,v in enumerate(vectors): - invocations_args.append(DimArrayCuda(v,i).gpudata) + for v in vectors: + invocations_args.append(v.device_shape_and_strides) repr_vec = vectors[0] @@ -462,14 +447,14 @@ def get_put_kernel(dtype, idx_dtype, vec_count=1): @context_dependent_memoize -def get_copy_kernel(dtype_dest, dtype_src): +def get_copy_kernel(dtype_dest, dtype_src, use_strides=False): return get_elwise_kernel( "%(tp_dest)s *dest, %(tp_src)s *src" % { "tp_dest": dtype_to_ctype(dtype_dest), "tp_src": dtype_to_ctype(dtype_src), }, "dest[i] = src[i]", - "copy") + "copy", use_strides=use_strides) @context_dependent_memoize @@ -520,7 +505,7 @@ def get_linear_combination_kernel(summand_descriptors, @context_dependent_memoize -def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z): +def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z, use_strides=False): return get_elwise_kernel( "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), @@ -528,22 +513,22 @@ def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = a*x[i] + b*y[i]", - "axpbyz") + "axpbyz", use_strides=use_strides) @context_dependent_memoize -def get_axpbz_kernel(dtype_x, dtype_z): +def get_axpbz_kernel(dtype_x, dtype_z, use_strides=False): return get_elwise_kernel( "%(tp_z)s a, %(tp_x)s *x,%(tp_z)s b, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z) }, "z[i] = a * x[i] + b", - "axpb") + "axpb", use_strides=use_strides) @context_dependent_memoize -def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator): +def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator, use_strides=False): return get_elwise_kernel( "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), @@ -551,22 +536,22 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = x[i] %s y[i]" % operator, - "multiply") + "multiply", use_strides=use_strides) @context_dependent_memoize -def get_rdivide_elwise_kernel(dtype_x, dtype_z): +def get_rdivide_elwise_kernel(dtype_x, dtype_z, use_strides=False): return get_elwise_kernel( "%(tp_x)s *x, %(tp_z)s y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = y / x[i]", - "divide_r") + "divide_r", use_strides=use_strides) @context_dependent_memoize -def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z): +def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z, use_strides=False): return get_elwise_kernel( "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), @@ -574,11 +559,11 @@ def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s(x[i], y[i])" % func, - func+"_kernel") + func+"_kernel", use_strides=use_strides) @context_dependent_memoize -def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z): +def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z, use_strides=False): return get_elwise_kernel( "%(tp_x)s *x, %(tp_y)s y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), @@ -586,10 +571,10 @@ def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s(x[i], y)" % func, - func+"_kernel") + func+"_kernel", use_strides=use_strides) -def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar): +def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar, use_strides=False): if np.float64 not in [dtype_x, dtype_y]: func = func + "f" @@ -598,75 +583,75 @@ def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar): func = "f"+func if use_scalar: - return get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z) + return get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z, use_strides=use_strides) else: - return get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z) + return get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z, use_strides=use_strides) @context_dependent_memoize -def get_fill_kernel(dtype): +def get_fill_kernel(dtype, use_strides=False): return get_elwise_kernel( "%(tp)s a, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, "z[i] = a", - "fill") + "fill", use_strides=use_strides) @context_dependent_memoize -def get_reverse_kernel(dtype): +def get_reverse_kernel(dtype, use_strides=False): return get_elwise_kernel( "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, "z[i] = y[n-1-i]", - "reverse") + "reverse", use_strides=use_strides) @context_dependent_memoize -def get_real_kernel(dtype, real_dtype): +def get_real_kernel(dtype, real_dtype, use_strides=False): return get_elwise_kernel( "%(tp)s *y, %(real_tp)s *z" % { "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, "z[i] = real(y[i])", - "real") + "real", use_strides=use_strides) @context_dependent_memoize -def get_imag_kernel(dtype, real_dtype): +def get_imag_kernel(dtype, real_dtype, use_strides=False): return get_elwise_kernel( "%(tp)s *y, %(real_tp)s *z" % { "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, "z[i] = imag(y[i])", - "imag") + "imag", use_strides=use_strides) @context_dependent_memoize -def get_conj_kernel(dtype): +def get_conj_kernel(dtype, use_strides=False): return get_elwise_kernel( "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, "z[i] = pycuda::conj(y[i])", - "conj") + "conj", use_strides=use_strides) @context_dependent_memoize -def get_arange_kernel(dtype): +def get_arange_kernel(dtype, use_strides=False): return get_elwise_kernel( "%(tp)s *z, %(tp)s start, %(tp)s step" % { "tp": dtype_to_ctype(dtype), }, "z[i] = start + i*step", - "arange") + "arange", use_strides=use_strides) @context_dependent_memoize -def get_pow_kernel(dtype): +def get_pow_kernel(dtype, use_strides=False): if dtype == np.float32: func = "powf" else: @@ -677,11 +662,11 @@ def get_pow_kernel(dtype): "tp": dtype_to_ctype(dtype), }, "z[i] = %s(y[i], value)" % func, - "pow_method") + "pow_method", use_strides=use_strides) @context_dependent_memoize -def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): +def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, use_strides=False): if np.float64 in [dtype_x, dtype_y]: func = "pow" else: @@ -694,27 +679,27 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s(x[i], y[i])" % func, - "pow_method") + "pow_method", use_strides=use_strides) @context_dependent_memoize -def get_fmod_kernel(): +def get_fmod_kernel(use_strides=False): return get_elwise_kernel( "float *arg, float *mod, float *z", "z[i] = fmod(arg[i], mod[i])", - "fmod_kernel") + "fmod_kernel", use_strides=use_strides) @context_dependent_memoize -def get_modf_kernel(): +def get_modf_kernel(use_strides=False): return get_elwise_kernel( "float *x, float *intpart ,float *fracpart", "fracpart[i] = modf(x[i], &intpart[i])", - "modf_kernel") + "modf_kernel", use_strides=use_strides) @context_dependent_memoize -def get_frexp_kernel(): +def get_frexp_kernel(use_strides=False): return get_elwise_kernel( "float *x, float *significand, float *exponent", """ @@ -722,19 +707,19 @@ def get_frexp_kernel(): significand[i] = frexp(x[i], &expt); exponent[i] = expt; """, - "frexp_kernel") + "frexp_kernel", use_strides=use_strides) @context_dependent_memoize -def get_ldexp_kernel(): +def get_ldexp_kernel(use_strides=False): return get_elwise_kernel( "float *sig, float *expt, float *z", "z[i] = ldexp(sig[i], int(expt[i]))", - "ldexp_kernel") + "ldexp_kernel", use_strides=use_strides) @context_dependent_memoize -def get_unary_func_kernel(func_name, in_dtype, out_dtype=None): +def get_unary_func_kernel(func_name, in_dtype, out_dtype=None, use_strides=False): if out_dtype is None: out_dtype = in_dtype @@ -744,11 +729,11 @@ def get_unary_func_kernel(func_name, in_dtype, out_dtype=None): "tp_out": dtype_to_ctype(out_dtype), }, "z[i] = %s(y[i])" % func_name, - "%s_kernel" % func_name) + "%s_kernel" % func_name, use_strides=use_strides) @context_dependent_memoize -def get_if_positive_kernel(crit_dtype, dtype): +def get_if_positive_kernel(crit_dtype, dtype, use_strides=False): return get_elwise_kernel([ VectorArg(crit_dtype, "crit"), VectorArg(dtype, "then_"), @@ -756,11 +741,11 @@ def get_if_positive_kernel(crit_dtype, dtype): VectorArg(dtype, "result"), ], "result[i] = crit[i] > 0 ? then_[i] : else_[i]", - "if_positive") + "if_positive", use_strides=use_strides) @context_dependent_memoize -def get_scalar_op_kernel(dtype_x, dtype_y, operator): +def get_scalar_op_kernel(dtype_x, dtype_y, operator, use_strides=False): return get_elwise_kernel( "%(tp_x)s *x, %(tp_a)s a, %(tp_y)s *y" % { "tp_x": dtype_to_ctype(dtype_x), @@ -768,4 +753,4 @@ def get_scalar_op_kernel(dtype_x, dtype_y, operator): "tp_a": dtype_to_ctype(dtype_x), }, "y[i] = x[i] %s a" % operator, - "scalarop_kernel") + "scalarop_kernel", use_strides=use_strides) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 21b7bde2..d944271b 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -14,6 +14,7 @@ from pycuda.characterize import has_double_support import six from six.moves import range, zip, reduce import numbers +import struct def _get_common_dtype(obj1, obj2): @@ -129,31 +130,48 @@ def _make_binary_op(operator): # raise RuntimeError("only contiguous arrays may " # "be used as arguments to this operation") + noncontig = is_noncontig(self, other) + result = self._new_like_me() func = elementwise.get_binary_op_kernel( self.dtype, other.dtype, result.dtype, - operator) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, - self.mem_size, - elementwise.DimArrayCuda(self, 0).gpudata, - elementwise.DimArrayCuda(other, 1).gpudata, - elementwise.DimArrayCuda(result, 2).gpudata) + operator, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + other.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other.gpudata, result.gpudata, + self.mem_size) return result else: # scalar operator result = self._new_like_me() + noncontig = is_noncontig(self) func = elementwise.get_scalar_op_kernel( - self.dtype, result.dtype, operator) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other, result.gpudata, - self.mem_size, - elementwise.DimArrayCuda(self, 0).gpudata, - elementwise.DimArrayCuda(result, 1).gpudata) + self.dtype, result.dtype, operator, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other, result.gpudata, + self.mem_size) return result return func +def is_noncontig(*vecs): + r = False + for v in vecs: r = r or not v.flags.forc + return r class GPUArray(object): """A GPUArray is used to do array-based calculation on the GPU. @@ -163,6 +181,7 @@ class GPUArray(object): """ __array_priority__ = 100 + __max_dims__ = 2 def __init__(self, shape, dtype, allocator=drv.mem_alloc, base=None, gpudata=None, strides=None, order="C"): @@ -232,6 +251,22 @@ class GPUArray(object): def flags(self): return _ArrayFlags(self) + @property + def device_shape_and_strides(self): + itemsize = 8 # unsigned int and long int + + sar = drv.mem_alloc(self.__max_dims__*itemsize) + newstrides = tuple(v // self.dtype.itemsize for v in self.strides) + shape = self.shape[::-1] + newstrides = newstrides[::-1] + if len(shape) < self.__max_dims__: + shape = shape + (1,)*(self.__max_dims__-len(shape)) + newstrides = newstrides + (0,)*(self.__max_dims__-len(newstrides)) + data = shape + newstrides + drv.memcpy_htod(int(sar), struct.pack('I'*self.__max_dims__ + 'l'*self.__max_dims__, *data)) + self.__sar = sar # don't garbage collect yet + return sar + def set(self, ary, async=False, stream=None): if ary.size != self.size: raise ValueError("ary and self must be the same size") @@ -303,51 +338,73 @@ class GPUArray(object): where `other` is a vector..""" assert self.shape == other.shape - func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype) + noncontig = is_noncontig(self, other, out) - if add_timer is not None: - add_timer(3*self.size, func.prepared_timed_call(self._grid, - selffac, self.gpudata, otherfac, other.gpudata, - out.gpudata, self.mem_size, - elementwise.DimArrayCuda(self, 0).gpudata, - elementwise.DimArrayCuda(other, 1).gpudata, - elementwise.DimArrayCuda(out, 2).gpudata)) - else: - func.prepared_async_call(self._grid, self._block, stream, + func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype, use_strides=noncontig) + + if noncontig: + if add_timer is not None: + add_timer(3*self.size, func.prepared_timed_call(self._grid, selffac, self.gpudata, otherfac, other.gpudata, out.gpudata, self.mem_size, - elementwise.DimArrayCuda(self, 0).gpudata, - elementwise.DimArrayCuda(other, 1).gpudata, - elementwise.DimArrayCuda(out, 2).gpudata) + self.device_shape_and_strides, + other.device_shape_and_strides, + out.device_shape_and_strides)) + else: + func.prepared_async_call(self._grid, self._block, stream, + selffac, self.gpudata, otherfac, other.gpudata, + out.gpudata, self.mem_size, + self.device_shape_and_strides, + other.device_shape_and_strides, + out.device_shape_and_strides) + else: + if add_timer is not None: + add_timer(3*self.size, func.prepared_timed_call(self._grid, + selffac, self.gpudata, otherfac, other.gpudata, + out.gpudata, self.mem_size)) + else: + func.prepared_async_call(self._grid, self._block, stream, + selffac, self.gpudata, otherfac, other.gpudata, + out.gpudata, self.mem_size) + return out def _axpbz(self, selffac, other, out, stream=None): """Compute ``out = selffac * self + other``, where `other` is a scalar.""" - - func = elementwise.get_axpbz_kernel(self.dtype, out.dtype) - func.prepared_async_call(self._grid, self._block, stream, - selffac, self.gpudata, - other, out.gpudata, self.mem_size, - elementwise.DimArrayCuda(self).gpudata, - elementwise.DimArrayCuda(out).gpudata) + noncontig = is_noncontig(self, out) + func = elementwise.get_axpbz_kernel(self.dtype, out.dtype, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + selffac, self.gpudata, + other, out.gpudata, self.mem_size, + self.device_shape_and_strides, + out.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + selffac, self.gpudata, + other, out.gpudata, self.mem_size) return out def _elwise_multiply(self, other, out, stream=None): - # if not self.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") + + noncontig = is_noncontig(self, other, out) func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, - out.dtype, "*") - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size, - elementwise.DimArrayCuda(self).gpudata, - elementwise.DimArrayCuda(other).gpudata, - elementwise.DimArrayCuda(out).gpudata) + out.dtype, "*", use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other.gpudata, + out.gpudata, self.mem_size, + self.device_shape_and_strides, + other.device_shape_and_strides, + out.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other.gpudata, + out.gpudata, self.mem_size) return out @@ -357,16 +414,20 @@ class GPUArray(object): y = n / self """ - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + noncontig = is_noncontig(self, out) - func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype) - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other, - out.gpudata, self.mem_size, - elementwise.DimArrayCuda(self).gpudata, - elementwise.DimArrayCuda(out).gpudata) + func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype, use_strides=noncontig) + + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other, + out.gpudata, self.mem_size, + self.device_shape_and_strides, + out.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other, + out.gpudata, self.mem_size) return out @@ -375,14 +436,21 @@ class GPUArray(object): assert self.shape == other.shape + noncontig = is_noncontig(self, other, out) + func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, - out.dtype, "/") - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size, - elementwise.DimArrayCuda(self, 0).gpudata, - elementwise.DimArrayCuda(other, 1).gpudata, - elementwise.DimArrayCuda(out, 2).gpudata) + out.dtype, "/", use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other.gpudata, + out.gpudata, self.mem_size, + self.device_shape_and_strides, + other.device_shape_and_strides, + out.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other.gpudata, + out.gpudata, self.mem_size) return out @@ -525,10 +593,17 @@ class GPUArray(object): def fill(self, value, stream=None): """fills the array with the specified value""" - func = elementwise.get_fill_kernel(self.dtype) - func.prepared_async_call(self._grid, self._block, stream, - value, self.gpudata, self.mem_size, - elementwise.DimArrayCuda(self, 0).gpudata) + noncontig = is_noncontig(self) + + func = elementwise.get_fill_kernel(self.dtype, use_strides=noncontig) + + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + value, self.gpudata, self.mem_size, + self.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + value, self.gpudata, self.mem_size) return self @@ -610,13 +685,19 @@ class GPUArray(object): else: out_dtype = self.dtype + noncontig = is_noncontig(self, result) + func = elementwise.get_unary_func_kernel(fname, self.dtype, - out_dtype=out_dtype) + out_dtype=out_dtype, use_strides=noncontig) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, self.mem_size, - elementwise.DimArrayCuda(self, 0).gpudata, - elementwise.DimArrayCuda(result, 1).gpudata) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, self.mem_size) return result @@ -631,23 +712,26 @@ class GPUArray(object): """ if isinstance(other, GPUArray): - # if not self.flags.forc or not other.flags.forc: - # raise RuntimeError("only contiguous arrays may " - # "be used as arguments to this operation") - assert self.shape == other.shape result = self._new_like_me(_get_common_dtype(self, other)) - func = elementwise.get_pow_array_kernel( - self.dtype, other.dtype, result.dtype) + non_contig = is_noncontig(self, other, result) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, - self.mem_size, - elementwise.DimArrayCuda(self, 0).gpudata, - elementwise.DimArrayCuda(other, 1).gpudata, - elementwise.DimArrayCuda(result, 2).gpudata) + func = elementwise.get_pow_array_kernel( + self.dtype, other.dtype, result.dtype, use_strides=non_contig) + + if non_contig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + other.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other.gpudata, result.gpudata, + self.mem_size) return result else: @@ -656,10 +740,18 @@ class GPUArray(object): # "be used as arguments to this operation") result = self._new_like_me() - func = elementwise.get_pow_kernel(self.dtype) - func.prepared_async_call(self._grid, self._block, None, - other, self.gpudata, result.gpudata, - self.mem_size, elementwise.DimArrayCuda(self, 0).gpudata, elementwise.DimArrayCuda(result, 1).gpudata) + + noncontig = is_noncontig(self, result) + func = elementwise.get_pow_kernel(self.dtype, use_strides=noncontig) + + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + other, self.gpudata, result.gpudata, + self.mem_size, self.device_shape_and_strides, result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + other, self.gpudata, result.gpudata, + self.mem_size) return result @@ -668,37 +760,43 @@ class GPUArray(object): as one-dimensional. """ - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - result = self._new_like_me() - func = elementwise.get_reverse_kernel(self.dtype) - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, result.gpudata, - self.mem_size, - elementwise.DimArrayCuda(self).gpudata, - elementwise.DimArrayCuda(out).gpudata) + noncontig = is_noncontig(self, result) + + func = elementwise.get_reverse_kernel(self.dtype, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, result.gpudata, + self.mem_size) return result def astype(self, dtype, stream=None): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - if dtype == self.dtype: return self.copy() + noncontig = is_noncontig(self, result) + result = self._new_like_me(dtype=dtype) - func = elementwise.get_copy_kernel(dtype, self.dtype) - func.prepared_async_call(self._grid, self._block, stream, - result.gpudata, self.gpudata, - self.mem_size, - elementwise.DimArrayCuda(result).gpudata, - elementwise.DimArrayCuda(self).gpudata) + func = elementwise.get_copy_kernel(dtype, self.dtype, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + result.gpudata, self.gpudata, + self.mem_size, + result.device_shape_and_strides, + self.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + result.gpudata, self.gpudata, + self.mem_size) return result @@ -925,12 +1023,18 @@ class GPUArray(object): result = self._new_like_me(dtype=real_dtype) - func = elementwise.get_real_kernel(dtype, real_dtype) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, - self.mem_size, - elementwise.DimArrayCuda(self).gpudata, - elementwise.DimArrayCuda(result).gpudata) + noncontig = is_noncontig(self) + + func = elementwise.get_real_kernel(dtype, real_dtype, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size) return result else: @@ -946,12 +1050,19 @@ class GPUArray(object): result = self._new_like_me(dtype=real_dtype) - func = elementwise.get_imag_kernel(dtype, real_dtype) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, - self.mem_size, - elementwise.DimArrayCuda(self).gpudata, - elementwise.DimArrayCuda(result).gpudata) + noncontig = is_noncontig(self) + + func = elementwise.get_imag_kernel(dtype, real_dtype, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size) return result else: @@ -960,18 +1071,21 @@ class GPUArray(object): def conj(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") result = self._new_like_me() - - func = elementwise.get_conj_kernel(dtype) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, - self.mem_size, - elementwise.DimArrayCuda(self).gpudata, - elementwise.DimArrayCuda(result).gpudata) + noncontig = is_noncontig(self) + + func = elementwise.get_conj_kernel(dtype, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size) return result else: @@ -1117,8 +1231,7 @@ def arange(*args, **kwargs): func = elementwise.get_arange_kernel(dtype) func.prepared_async_call(result._grid, result._block, kwargs.get("stream"), - result.gpudata, start, step, size, - elementwise.DimArrayCuda(result).gpudata) + result.gpudata, start, step, size) return result -- GitLab From 692ad423499c85c99e552c805ef8f8402169235e Mon Sep 17 00:00:00 2001 From: Keegan Owsley Date: Thu, 1 Dec 2016 14:35:47 -0800 Subject: [PATCH 03/10] Allow noncontig arrays in ElementwiseKernel.__call__ --- pycuda/elementwise.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 12b588d2..8371028e 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -263,6 +263,10 @@ def get_elwise_kernel(arguments, operation, return func +def is_noncontig(*vecs): + r = False + for v in vecs: r = r or not v.flags.forc + return r class ElementwiseKernel: def __init__(self, arguments, operation, @@ -273,8 +277,8 @@ class ElementwiseKernel: operation=operation, arguments=arguments)) @memoize_method - def generate_stride_kernel_and_types(self, use_range): - knl, arguments = get_elwise_kernel_and_types(use_range=use_range, + def generate_stride_kernel_and_types(self, use_range, use_slices=False): + knl, arguments = get_elwise_kernel_and_types(use_range=use_range, use_slices=use_slices, **self.gen_kwargs) assert [i for i, arg in enumerate(arguments) @@ -295,9 +299,6 @@ class ElementwiseKernel: raise TypeError("invalid keyword arguments specified: " + ", ".join(six.iterkeys(kwargs))) - invocation_args = [] - func, arguments = self.generate_stride_kernel_and_types( - range_ is not None or slice_ is not None) for arg, arg_descr in zip(args, arguments): if isinstance(arg_descr, VectorArg): @@ -312,8 +313,15 @@ class ElementwiseKernel: else: invocation_args.append(arg) - for v in vectors: - invocations_args.append(v.device_shape_and_strides) + noncontig = is_noncontig(*vectors) + + invocation_args = [] + func, arguments = self.generate_stride_kernel_and_types( + range_ is not None or slice_ is not None, use_slices=noncontig) + + if noncontig: + for v in vectors: + invocations_args.append(v.device_shape_and_strides) repr_vec = vectors[0] -- GitLab From 0117ad599104c412a4de6217f1bf8063ac3b85bb Mon Sep 17 00:00:00 2001 From: Keegan Owsley Date: Thu, 1 Dec 2016 15:17:45 -0800 Subject: [PATCH 04/10] Fix bugs in astype and ElementwiseKernel.__call__ --- pycuda/elementwise.py | 13 ++++++------- pycuda/gpuarray.py | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 8371028e..f11ecbbe 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -277,8 +277,8 @@ class ElementwiseKernel: operation=operation, arguments=arguments)) @memoize_method - def generate_stride_kernel_and_types(self, use_range, use_slices=False): - knl, arguments = get_elwise_kernel_and_types(use_range=use_range, use_slices=use_slices, + def generate_stride_kernel_and_types(self, use_range, use_strides=False): + knl, arguments = get_elwise_kernel_and_types(use_range=use_range, use_strides=use_strides, **self.gen_kwargs) assert [i for i, arg in enumerate(arguments) @@ -299,9 +299,9 @@ class ElementwiseKernel: raise TypeError("invalid keyword arguments specified: " + ", ".join(six.iterkeys(kwargs))) - - for arg, arg_descr in zip(args, arguments): - if isinstance(arg_descr, VectorArg): + invocation_args = [] + for arg in args: + if hasattr(arg, 'flags'): # if not arg.flags.forc: # raise RuntimeError("elementwise kernel cannot " # "deal with non-contiguous arrays") @@ -315,9 +315,8 @@ class ElementwiseKernel: noncontig = is_noncontig(*vectors) - invocation_args = [] func, arguments = self.generate_stride_kernel_and_types( - range_ is not None or slice_ is not None, use_slices=noncontig) + range_ is not None or slice_ is not None, use_strides=noncontig) if noncontig: for v in vectors: diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index d944271b..a54ccab3 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -782,7 +782,7 @@ class GPUArray(object): if dtype == self.dtype: return self.copy() - noncontig = is_noncontig(self, result) + noncontig = is_noncontig(self) result = self._new_like_me(dtype=dtype) -- GitLab From 4644b8abb2ec55b18fe1a6740012eab6b4f7ca39 Mon Sep 17 00:00:00 2001 From: Keegan Owsley Date: Fri, 2 Dec 2016 11:13:16 -0800 Subject: [PATCH 05/10] Allow noncontiguous arrays for cumath functions --- pycuda/cumath.py | 100 +++++++++++++++++++++++++++++++---------------- 1 file changed, 67 insertions(+), 33 deletions(-) diff --git a/pycuda/cumath.py b/pycuda/cumath.py index dbae5bd6..1de0824e 100644 --- a/pycuda/cumath.py +++ b/pycuda/cumath.py @@ -6,6 +6,11 @@ import warnings from pycuda.driver import Stream +def is_noncontig(*vecs): + r = False + for v in vecs: r = r or not v.flags.forc + return r + def _make_unary_array_func(name): def f(array, stream_or_out=None, **kwargs): @@ -29,10 +34,6 @@ def _make_unary_array_func(name): else: func_name = name - if not array.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - if out is None: out = array._new_like_me() else: @@ -40,9 +41,18 @@ def _make_unary_array_func(name): assert out.strides == array.strides assert out.shape == array.shape - func = elementwise.get_unary_func_kernel(func_name, array.dtype) - func.prepared_async_call(array._grid, array._block, stream, - array.gpudata, out.gpudata, array.mem_size) + noncontig = is_noncontig(array, out) + + func = elementwise.get_unary_func_kernel(func_name, array.dtype, use_strides=noncontig) + + if noncontig: + func.prepared_async_call(array._grid, array._block, stream, + array.gpudata, out.gpudata, array.mem_size, + array.device_shape_and_strides, + out.device_shape_and_strides) + else: + func.prepared_async_call(array._grid, array._block, stream, + array.gpudata, out.gpudata, array.mem_size) return out return f @@ -71,13 +81,18 @@ def fmod(arg, mod, stream=None): for each element in `arg` and `mod`.""" result = gpuarray.GPUArray(arg.shape, arg.dtype) - if not arg.flags.forc or not mod.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + noncontig = is_noncontig(arg, mod, result) - func = elementwise.get_fmod_kernel() - func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size) + func = elementwise.get_fmod_kernel(use_strides=noncontig) + if noncontig: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size, + arg.device_shape_and_strides, + mod.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size) return result @@ -85,16 +100,22 @@ def frexp(arg, stream=None): """Return a tuple `(significands, exponents)` such that `arg == significand * 2**exponent`. """ - if not arg.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") sig = gpuarray.GPUArray(arg.shape, arg.dtype) expt = gpuarray.GPUArray(arg.shape, arg.dtype) - func = elementwise.get_frexp_kernel() - func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size) + noncontig = is_noncontig(arg) + + func = elementwise.get_frexp_kernel(use_strides=noncontig) + if noncontig: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size, + arg.device_shape_and_strides, + sig.device_shape_and_strides, + expt.device_shape_and_strides) + else: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size) return sig, expt @@ -103,16 +124,23 @@ def ldexp(significand, exponent, stream=None): entries of `significand` and `exponent`, paired together as `result = significand * 2**exponent`. """ - if not significand.flags.forc or not exponent.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + + noncontig = is_noncontig(significand, exponent) result = gpuarray.GPUArray(significand.shape, significand.dtype) - func = elementwise.get_ldexp_kernel() - func.prepared_async_call(significand._grid, significand._block, stream, - significand.gpudata, exponent.gpudata, result.gpudata, - significand.mem_size) + func = elementwise.get_ldexp_kernel(use_strides=noncontig) + if noncontig: + func.prepared_async_call(significand._grid, significand._block, stream, + significand.gpudata, exponent.gpudata, result.gpudata, + significand.mem_size, + significand.device_shape_and_strides, + exponent.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(significand._grid, significand._block, stream, + significand.gpudata, exponent.gpudata, result.gpudata, + significand.mem_size) return result @@ -120,16 +148,22 @@ def modf(arg, stream=None): """Return a tuple `(fracpart, intpart)` of arrays containing the integer and fractional parts of `arg`. """ - if not arg.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + noncontig = is_noncontig(arg) intpart = gpuarray.GPUArray(arg.shape, arg.dtype) fracpart = gpuarray.GPUArray(arg.shape, arg.dtype) - func = elementwise.get_modf_kernel() - func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, intpart.gpudata, fracpart.gpudata, - arg.mem_size) + func = elementwise.get_modf_kernel(use_strides=noncontig) + if noncontig: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, intpart.gpudata, fracpart.gpudata, + arg.mem_size, + arg.device_shape_and_strides, + intpart.device_shape_and_strides, + fracpart.device_shape_and_strides) + else: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, intpart.gpudata, fracpart.gpudata, + arg.mem_size) return fracpart, intpart -- GitLab From d4e4ef1102390ca6b6ca9011a9fd9c71e7f1d61b Mon Sep 17 00:00:00 2001 From: Keegan Owsley Date: Mon, 5 Dec 2016 10:55:03 -0800 Subject: [PATCH 06/10] Add flag for more robust regex --- pycuda/elementwise.py | 46 ++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index f11ecbbe..12637056 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -39,7 +39,13 @@ from pytools import memoize_method import struct -import re + +recursive_match_outer_brackets = False + +if recursive_match_outer_brackets: + import regex as re +else: + import re class _DimHolder: @@ -53,18 +59,28 @@ def get_elwise_module_noncontig(arguments, operation, vecargs = tuple([a for a in arguments if isinstance(a, VectorArg)]) names = [V.name for V in vecargs] - def fix_indexing(t): - try: - n = names.index(t.group(1)) - except NameError: - return t.group(0) - return '{}[i2m({},*__dim{})]'.format(t.group(1),t.group(2),n) + if recursive_match_outer_brackets: + def fix_indexing(t): + try: + n = names.index(t.group(1)) + except NameError: + return t.group(0) + second_grp = regex.sub(r, fix_indexing, t.group(2)) + + return '{}[i2m({},*__dim{})]'.format(t.group(1),second_grp,n) + + regex = "([a-zA-Z_][a-zA-Z0-9_]*)\[((?:[^\[\](?:)]|(?R))*)\]" + operation = re.sub(regex, fix_indexing, operation) + else: + def fix_indexing(t): + try: + n = names.index(t.group(1)) + except NameError: + return t.group(0) + return '{}[i2m({},*__dim{})]'.format(t.group(1),t.group(2),n) - # regex = "([a-zA-Z_][a-zA-Z0-9_]*)\[((?:[^\[\](?:)]|(?R))*)\]" - # Above works recursively to match outer brackets, we can switch if we deem it necessary - # Requires regex module instead of re - regex = "([a-zA-Z_][a-zA-Z0-9_]*)\[(.+?)\]" - operation = re.sub(regex, fix_indexing, operation) + regex = "([a-zA-Z_][a-zA-Z0-9_]*)\[(.+?)\]" + operation = re.sub(regex, fix_indexing, operation) dimdef = ''' typedef struct @@ -77,10 +93,12 @@ def get_elwise_module_noncontig(arguments, operation, { unsigned m = 0; unsigned j = i; + unsigned l = 0; for(int k = 0; k < %(max_dims)i; k++) { - m += d.stride[k] * (j%%d.n[k]); - j = j / d.n[k]; + l = j / d.n[k]; + m += d.stride[k] * (j-l*d.n[k]); + j = l; } return m; } -- GitLab From af33cddbd83e6af789cfe6670193cdc75d6cece5 Mon Sep 17 00:00:00 2001 From: Keegan Owsley Date: Mon, 5 Dec 2016 11:20:51 -0800 Subject: [PATCH 07/10] Use python2-friendly list copying --- pycuda/elementwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 12637056..8d11ef79 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -258,7 +258,7 @@ def get_elwise_kernel_and_types(arguments, operation, mod = module_builder(arguments, operation, name, keep, options, **kwargs) - oldargs = arguments.copy() + oldargs = list(arguments) if use_strides: vectors = [a for a in arguments if isinstance(a, VectorArg)] -- GitLab From 3c89c30136b47f1a4c6e2ac7dd89663265d68dd5 Mon Sep 17 00:00:00 2001 From: Keegan Owsley Date: Mon, 5 Dec 2016 11:48:31 -0800 Subject: [PATCH 08/10] Fix missing else block in gpuarray.real --- pycuda/gpuarray.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index a54ccab3..8d364005 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1032,6 +1032,7 @@ class GPUArray(object): self.mem_size, self.device_shape_and_strides, result.device_shape_and_strides) + else: func.prepared_async_call(self._grid, self._block, None, self.gpudata, result.gpudata, self.mem_size) -- GitLab From 910c7ecfb5bdc2309ea1183fc3734a650ffbc0ae Mon Sep 17 00:00:00 2001 From: Keegan Owsley Date: Mon, 5 Dec 2016 12:01:07 -0800 Subject: [PATCH 09/10] Outer bracket matching by default Fix typos in outer bracket matching code and use it by default --- pycuda/elementwise.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 8d11ef79..37945b3c 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -40,7 +40,7 @@ from pytools import memoize_method import struct -recursive_match_outer_brackets = False +recursive_match_outer_brackets = True if recursive_match_outer_brackets: import regex as re @@ -60,16 +60,16 @@ def get_elwise_module_noncontig(arguments, operation, names = [V.name for V in vecargs] if recursive_match_outer_brackets: + regex = "([a-zA-Z_][a-zA-Z0-9_]*)\[((?:[^\[\](?:)]|(?R))*)\]" def fix_indexing(t): try: n = names.index(t.group(1)) except NameError: return t.group(0) - second_grp = regex.sub(r, fix_indexing, t.group(2)) + second_grp = re.sub(regex, fix_indexing, t.group(2)) return '{}[i2m({},*__dim{})]'.format(t.group(1),second_grp,n) - regex = "([a-zA-Z_][a-zA-Z0-9_]*)\[((?:[^\[\](?:)]|(?R))*)\]" operation = re.sub(regex, fix_indexing, operation) else: def fix_indexing(t): -- GitLab From 4a098e85817ff0a28b2f724d1931026354cf6166 Mon Sep 17 00:00:00 2001 From: Keegan Owsley Date: Mon, 5 Dec 2016 12:10:49 -0800 Subject: [PATCH 10/10] Use simpler regex by default (passes the tests) --- pycuda/elementwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 37945b3c..2da202b8 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -40,7 +40,7 @@ from pytools import memoize_method import struct -recursive_match_outer_brackets = True +recursive_match_outer_brackets = False if recursive_match_outer_brackets: import regex as re -- GitLab