diff --git a/pycuda/cumath.py b/pycuda/cumath.py index dbae5bd62a8fdc2b4b6899b5c2f2a3267e5a9dd8..1de0824effbbdd4a6e47465a0c1704365aaa9d88 100644 --- a/pycuda/cumath.py +++ b/pycuda/cumath.py @@ -6,6 +6,11 @@ import warnings from pycuda.driver import Stream +def is_noncontig(*vecs): + r = False + for v in vecs: r = r or not v.flags.forc + return r + def _make_unary_array_func(name): def f(array, stream_or_out=None, **kwargs): @@ -29,10 +34,6 @@ def _make_unary_array_func(name): else: func_name = name - if not array.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - if out is None: out = array._new_like_me() else: @@ -40,9 +41,18 @@ def _make_unary_array_func(name): assert out.strides == array.strides assert out.shape == array.shape - func = elementwise.get_unary_func_kernel(func_name, array.dtype) - func.prepared_async_call(array._grid, array._block, stream, - array.gpudata, out.gpudata, array.mem_size) + noncontig = is_noncontig(array, out) + + func = elementwise.get_unary_func_kernel(func_name, array.dtype, use_strides=noncontig) + + if noncontig: + func.prepared_async_call(array._grid, array._block, stream, + array.gpudata, out.gpudata, array.mem_size, + array.device_shape_and_strides, + out.device_shape_and_strides) + else: + func.prepared_async_call(array._grid, array._block, stream, + array.gpudata, out.gpudata, array.mem_size) return out return f @@ -71,13 +81,18 @@ def fmod(arg, mod, stream=None): for each element in `arg` and `mod`.""" result = gpuarray.GPUArray(arg.shape, arg.dtype) - if not arg.flags.forc or not mod.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + noncontig = is_noncontig(arg, mod, result) - func = elementwise.get_fmod_kernel() - func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size) + func = elementwise.get_fmod_kernel(use_strides=noncontig) + if noncontig: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size, + arg.device_shape_and_strides, + mod.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size) return result @@ -85,16 +100,22 @@ def frexp(arg, stream=None): """Return a tuple `(significands, exponents)` such that `arg == significand * 2**exponent`. """ - if not arg.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") sig = gpuarray.GPUArray(arg.shape, arg.dtype) expt = gpuarray.GPUArray(arg.shape, arg.dtype) - func = elementwise.get_frexp_kernel() - func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size) + noncontig = is_noncontig(arg) + + func = elementwise.get_frexp_kernel(use_strides=noncontig) + if noncontig: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size, + arg.device_shape_and_strides, + sig.device_shape_and_strides, + expt.device_shape_and_strides) + else: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size) return sig, expt @@ -103,16 +124,23 @@ def ldexp(significand, exponent, stream=None): entries of `significand` and `exponent`, paired together as `result = significand * 2**exponent`. 
""" - if not significand.flags.forc or not exponent.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + + noncontig = is_noncontig(significand, exponent) result = gpuarray.GPUArray(significand.shape, significand.dtype) - func = elementwise.get_ldexp_kernel() - func.prepared_async_call(significand._grid, significand._block, stream, - significand.gpudata, exponent.gpudata, result.gpudata, - significand.mem_size) + func = elementwise.get_ldexp_kernel(use_strides=noncontig) + if noncontig: + func.prepared_async_call(significand._grid, significand._block, stream, + significand.gpudata, exponent.gpudata, result.gpudata, + significand.mem_size, + significand.device_shape_and_strides, + exponent.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(significand._grid, significand._block, stream, + significand.gpudata, exponent.gpudata, result.gpudata, + significand.mem_size) return result @@ -120,16 +148,22 @@ def modf(arg, stream=None): """Return a tuple `(fracpart, intpart)` of arrays containing the integer and fractional parts of `arg`. """ - if not arg.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + noncontig = is_noncontig(arg) intpart = gpuarray.GPUArray(arg.shape, arg.dtype) fracpart = gpuarray.GPUArray(arg.shape, arg.dtype) - func = elementwise.get_modf_kernel() - func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, intpart.gpudata, fracpart.gpudata, - arg.mem_size) + func = elementwise.get_modf_kernel(use_strides=noncontig) + if noncontig: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, intpart.gpudata, fracpart.gpudata, + arg.mem_size, + arg.device_shape_and_strides, + intpart.device_shape_and_strides, + fracpart.device_shape_and_strides) + else: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, intpart.gpudata, fracpart.gpudata, + arg.mem_size) return fracpart, intpart diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index feab0a6be89d54d72e1b6ec8759838910063a13d..2da202b8f51ee1e2d1ca84cd33f4d96b2f8c2ba2 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -34,10 +34,119 @@ OTHER DEALINGS IN THE SOFTWARE. 
 from pycuda.tools import context_dependent_memoize
 import numpy as np
-from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg
+from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg, Argument
 from pytools import memoize_method
+import struct
+
+recursive_match_outer_brackets = False
+
+if recursive_match_outer_brackets:
+    # the third-party "regex" module is needed for recursive patterns ((?R))
+    import regex as re
+else:
+    import re
+
+
+class _DimHolder:
+    # stand-in argument descriptor for the trailing "dim *" kernel parameters
+    struct_char = 'P'
+
+
+def get_elwise_module_noncontig(arguments, operation,
+        name="kernel", keep=False, options=None,
+        preamble="", loop_prep="", after_loop="", max_dims=2):
+    from pycuda.compiler import SourceModule
+
+    vecargs = tuple(a for a in arguments if isinstance(a, VectorArg))
+    names = [v.name for v in vecargs]
+
+    if recursive_match_outer_brackets:
+        regex = r"([a-zA-Z_][a-zA-Z0-9_]*)\[((?:[^\[\](?:)]|(?R))*)\]"
+
+        def fix_indexing(t):
+            try:
+                n = names.index(t.group(1))
+            except ValueError:
+                # not one of our vector arguments: leave the subscript alone
+                return t.group(0)
+            second_grp = re.sub(regex, fix_indexing, t.group(2))
+            return '{}[i2m({},*__dim{})]'.format(t.group(1), second_grp, n)
+
+        operation = re.sub(regex, fix_indexing, operation)
+    else:
+        def fix_indexing(t):
+            try:
+                n = names.index(t.group(1))
+            except ValueError:
+                # not one of our vector arguments: leave the subscript alone
+                return t.group(0)
+            return '{}[i2m({},*__dim{})]'.format(t.group(1), t.group(2), n)
+
+        regex = r"([a-zA-Z_][a-zA-Z0-9_]*)\[(.+?)\]"
+        operation = re.sub(regex, fix_indexing, operation)
+
+    dimdef = '''
+        typedef struct
+        {
+            unsigned n[%(max_dims)i];
+            long stride[%(max_dims)i];
+        } dim;
+
+        __device__ unsigned i2m(unsigned i, dim d)
+        {
+            unsigned m = 0;
+            unsigned j = i;
+            unsigned l = 0;
+            for(int k = 0; k < %(max_dims)i; k++)
+            {
+                l = j / d.n[k];
+                m += d.stride[k] * (j-l*d.n[k]);
+                j = l;
+            }
+            return m;
+        }
+        ''' % {"max_dims": max_dims}
+
+    args = [arg.declarator() for arg in arguments]
+    args.extend('dim *__dim{}'.format(i) for i in range(len(vecargs)))
+
+    src = """
+        #include <pycuda-complex.hpp>
+
+        %(dimdef)s
+
+        %(preamble)s
+
+        __global__ void %(name)s(%(arguments)s)
+        {
+          unsigned tid = threadIdx.x;
+          unsigned total_threads = gridDim.x*blockDim.x;
+          unsigned cta_start = blockDim.x*blockIdx.x;
+          unsigned i;
+
+          %(loop_prep)s;
+
+          for (i = cta_start + tid; i < n; i += total_threads)
+          {
+            %(operation)s;
+          }
+
+          %(after_loop)s;
+        }
+        """ % {
+                "arguments": ", ".join(args),
+                "operation": operation,
+                "name": name,
+                "preamble": preamble,
+                "loop_prep": loop_prep,
+                "after_loop": after_loop,
+                "dimdef": dimdef,
+                }
+
+    return SourceModule(src,
+            options=options, keep=keep)
+

 def get_elwise_module(arguments, operation,
         name="kernel", keep=False, options=None,
         preamble="", loop_prep="", after_loop=""):
@@ -124,7 +233,7 @@ def get_elwise_range_module(arguments, operation,


 def get_elwise_kernel_and_types(arguments, operation,
-        name="kernel", keep=False, options=None, use_range=False, **kwargs):
+        name="kernel", keep=False, options=None, use_range=False, use_strides=False, **kwargs):
     if isinstance(arguments, str):
         from pycuda.tools import parse_c_arg
         arguments = [parse_c_arg(arg) for arg in arguments.split(",")]
@@ -138,7 +247,10 @@ def get_elwise_kernel_and_types(arguments, operation,
     else:
         arguments.append(ScalarArg(np.uintp, "n"))

-    if use_range:
+    if use_strides:
+        module_builder = get_elwise_module_noncontig
+    elif use_range:
         module_builder = get_elwise_range_module
     else:
         module_builder = get_elwise_module
@@ -146,10 +258,17 @@

     mod = module_builder(arguments, operation, name,
             keep, options, **kwargs)

+    oldargs = list(arguments)
+    if use_strides:
+        # one trailing "dim *" argument per vector, in declaration order
+        vectors = [a for a in arguments if isinstance(a, VectorArg)]
+        for i, v in enumerate(vectors):
+            arguments.append(_DimHolder())
     func = mod.get_function(name)
     func.prepare("".join(arg.struct_char for arg in arguments))
+    func.stride_support = use_strides

-    return func, arguments
+    return func, oldargs


 def get_elwise_kernel(arguments, operation,
@@ -162,6 +281,10 @@

     return func

+
+def is_noncontig(*vecs):
+    r = False
+    for v in vecs: r = r or not v.flags.forc
+    return r

 class ElementwiseKernel:
     def __init__(self, arguments, operation,
@@ -172,8 +295,8 @@
                 operation=operation, arguments=arguments))

     @memoize_method
-    def generate_stride_kernel_and_types(self, use_range):
-        knl, arguments = get_elwise_kernel_and_types(use_range=use_range,
+    def generate_stride_kernel_and_types(self, use_range, use_strides=False):
+        knl, arguments = get_elwise_kernel_and_types(use_range=use_range, use_strides=use_strides,
                 **self.gen_kwargs)

         assert [i for i, arg in enumerate(arguments)
@@ -195,20 +318,28 @@
                     + ", ".join(six.iterkeys(kwargs)))

         invocation_args = []

-        func, arguments = self.generate_stride_kernel_and_types(
-                range_ is not None or slice_ is not None)
-
-        for arg, arg_descr in zip(args, arguments):
-            if isinstance(arg_descr, VectorArg):
-                if not arg.flags.forc:
-                    raise RuntimeError("elementwise kernel cannot "
-                            "deal with non-contiguous arrays")
-
+        for arg in args:
+            # GPU arrays are recognized by their device pointer; numpy
+            # scalars also carry a .flags attribute, so testing for
+            # .gpudata is the safer discriminator here
+            if hasattr(arg, 'gpudata'):
                 vectors.append(arg)
                 invocation_args.append(arg.gpudata)
             else:
                 invocation_args.append(arg)

+        noncontig = is_noncontig(*vectors)
+
+        func, arguments = self.generate_stride_kernel_and_types(
+                range_ is not None or slice_ is not None, use_strides=noncontig)
+
+        if noncontig:
+            for v in vectors:
+                invocation_args.append(v.device_shape_and_strides)
+
         repr_vec = vectors[0]

         if slice_ is not None:
@@ -341,14 +472,14 @@ def get_put_kernel(dtype, idx_dtype, vec_count=1):


 @context_dependent_memoize
-def get_copy_kernel(dtype_dest, dtype_src):
+def get_copy_kernel(dtype_dest, dtype_src, use_strides=False):
     return get_elwise_kernel(
             "%(tp_dest)s *dest, %(tp_src)s *src" % {
                 "tp_dest": dtype_to_ctype(dtype_dest),
                 "tp_src": dtype_to_ctype(dtype_src),
                 },
             "dest[i] = src[i]",
-            "copy")
+            "copy", use_strides=use_strides)


 @context_dependent_memoize
@@ -399,7 +530,7 @@ def get_linear_combination_kernel(summand_descriptors,


 @context_dependent_memoize
-def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z):
+def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z, use_strides=False):
     return get_elwise_kernel(
             "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z" % {
                 "tp_x": dtype_to_ctype(dtype_x),
@@ -407,22 +538,22 @@
                 "tp_z": dtype_to_ctype(dtype_z),
                 },
             "z[i] = a*x[i] + b*y[i]",
-            "axpbyz")
+            "axpbyz", use_strides=use_strides)


 @context_dependent_memoize
-def get_axpbz_kernel(dtype_x, dtype_z):
+def get_axpbz_kernel(dtype_x, dtype_z, use_strides=False):
     return get_elwise_kernel(
             "%(tp_z)s a, %(tp_x)s *x,%(tp_z)s b, %(tp_z)s *z" % {
                 "tp_x": dtype_to_ctype(dtype_x),
                 "tp_z": dtype_to_ctype(dtype_z)
                 },
             "z[i] = a * x[i] + b",
-            "axpb")
+            "axpb", use_strides=use_strides)


 @context_dependent_memoize
-def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator):
+def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator, use_strides=False):
     return get_elwise_kernel(
             "%(tp_x)s *x, %(tp_y)s
*y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), @@ -430,22 +561,22 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = x[i] %s y[i]" % operator, - "multiply") + "multiply", use_strides=use_strides) @context_dependent_memoize -def get_rdivide_elwise_kernel(dtype_x, dtype_z): +def get_rdivide_elwise_kernel(dtype_x, dtype_z, use_strides=False): return get_elwise_kernel( "%(tp_x)s *x, %(tp_z)s y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = y / x[i]", - "divide_r") + "divide_r", use_strides=use_strides) @context_dependent_memoize -def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z): +def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z, use_strides=False): return get_elwise_kernel( "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), @@ -453,11 +584,11 @@ def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s(x[i], y[i])" % func, - func+"_kernel") + func+"_kernel", use_strides=use_strides) @context_dependent_memoize -def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z): +def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z, use_strides=False): return get_elwise_kernel( "%(tp_x)s *x, %(tp_y)s y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), @@ -465,10 +596,10 @@ def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s(x[i], y)" % func, - func+"_kernel") + func+"_kernel", use_strides=use_strides) -def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar): +def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar, use_strides=False): if np.float64 not in [dtype_x, dtype_y]: func = func + "f" @@ -477,75 +608,75 @@ def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar): func = "f"+func if use_scalar: - return get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z) + return get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z, use_strides=use_strides) else: - return get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z) + return get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z, use_strides=use_strides) @context_dependent_memoize -def get_fill_kernel(dtype): +def get_fill_kernel(dtype, use_strides=False): return get_elwise_kernel( "%(tp)s a, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, "z[i] = a", - "fill") + "fill", use_strides=use_strides) @context_dependent_memoize -def get_reverse_kernel(dtype): +def get_reverse_kernel(dtype, use_strides=False): return get_elwise_kernel( "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, "z[i] = y[n-1-i]", - "reverse") + "reverse", use_strides=use_strides) @context_dependent_memoize -def get_real_kernel(dtype, real_dtype): +def get_real_kernel(dtype, real_dtype, use_strides=False): return get_elwise_kernel( "%(tp)s *y, %(real_tp)s *z" % { "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, "z[i] = real(y[i])", - "real") + "real", use_strides=use_strides) @context_dependent_memoize -def get_imag_kernel(dtype, real_dtype): +def get_imag_kernel(dtype, real_dtype, use_strides=False): return get_elwise_kernel( "%(tp)s *y, %(real_tp)s *z" % { "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, "z[i] = imag(y[i])", - "imag") + "imag", use_strides=use_strides) @context_dependent_memoize -def get_conj_kernel(dtype): +def get_conj_kernel(dtype, use_strides=False): 
return get_elwise_kernel( "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, "z[i] = pycuda::conj(y[i])", - "conj") + "conj", use_strides=use_strides) @context_dependent_memoize -def get_arange_kernel(dtype): +def get_arange_kernel(dtype, use_strides=False): return get_elwise_kernel( "%(tp)s *z, %(tp)s start, %(tp)s step" % { "tp": dtype_to_ctype(dtype), }, "z[i] = start + i*step", - "arange") + "arange", use_strides=use_strides) @context_dependent_memoize -def get_pow_kernel(dtype): +def get_pow_kernel(dtype, use_strides=False): if dtype == np.float32: func = "powf" else: @@ -556,11 +687,11 @@ def get_pow_kernel(dtype): "tp": dtype_to_ctype(dtype), }, "z[i] = %s(y[i], value)" % func, - "pow_method") + "pow_method", use_strides=use_strides) @context_dependent_memoize -def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): +def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, use_strides=False): if np.float64 in [dtype_x, dtype_y]: func = "pow" else: @@ -573,27 +704,27 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s(x[i], y[i])" % func, - "pow_method") + "pow_method", use_strides=use_strides) @context_dependent_memoize -def get_fmod_kernel(): +def get_fmod_kernel(use_strides=False): return get_elwise_kernel( "float *arg, float *mod, float *z", "z[i] = fmod(arg[i], mod[i])", - "fmod_kernel") + "fmod_kernel", use_strides=use_strides) @context_dependent_memoize -def get_modf_kernel(): +def get_modf_kernel(use_strides=False): return get_elwise_kernel( "float *x, float *intpart ,float *fracpart", "fracpart[i] = modf(x[i], &intpart[i])", - "modf_kernel") + "modf_kernel", use_strides=use_strides) @context_dependent_memoize -def get_frexp_kernel(): +def get_frexp_kernel(use_strides=False): return get_elwise_kernel( "float *x, float *significand, float *exponent", """ @@ -601,19 +732,19 @@ def get_frexp_kernel(): significand[i] = frexp(x[i], &expt); exponent[i] = expt; """, - "frexp_kernel") + "frexp_kernel", use_strides=use_strides) @context_dependent_memoize -def get_ldexp_kernel(): +def get_ldexp_kernel(use_strides=False): return get_elwise_kernel( "float *sig, float *expt, float *z", "z[i] = ldexp(sig[i], int(expt[i]))", - "ldexp_kernel") + "ldexp_kernel", use_strides=use_strides) @context_dependent_memoize -def get_unary_func_kernel(func_name, in_dtype, out_dtype=None): +def get_unary_func_kernel(func_name, in_dtype, out_dtype=None, use_strides=False): if out_dtype is None: out_dtype = in_dtype @@ -623,11 +754,11 @@ def get_unary_func_kernel(func_name, in_dtype, out_dtype=None): "tp_out": dtype_to_ctype(out_dtype), }, "z[i] = %s(y[i])" % func_name, - "%s_kernel" % func_name) + "%s_kernel" % func_name, use_strides=use_strides) @context_dependent_memoize -def get_if_positive_kernel(crit_dtype, dtype): +def get_if_positive_kernel(crit_dtype, dtype, use_strides=False): return get_elwise_kernel([ VectorArg(crit_dtype, "crit"), VectorArg(dtype, "then_"), @@ -635,11 +766,11 @@ def get_if_positive_kernel(crit_dtype, dtype): VectorArg(dtype, "result"), ], "result[i] = crit[i] > 0 ? 
then_[i] : else_[i]", - "if_positive") + "if_positive", use_strides=use_strides) @context_dependent_memoize -def get_scalar_op_kernel(dtype_x, dtype_y, operator): +def get_scalar_op_kernel(dtype_x, dtype_y, operator, use_strides=False): return get_elwise_kernel( "%(tp_x)s *x, %(tp_a)s a, %(tp_y)s *y" % { "tp_x": dtype_to_ctype(dtype_x), @@ -647,4 +778,4 @@ def get_scalar_op_kernel(dtype_x, dtype_y, operator): "tp_a": dtype_to_ctype(dtype_x), }, "y[i] = x[i] %s a" % operator, - "scalarop_kernel") + "scalarop_kernel", use_strides=use_strides) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 896b2fcab6ce9ff89bd01155292a39f61d374bf0..8d36400557608d990bdae778a8a03c09500dd3be 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -14,6 +14,7 @@ from pycuda.characterize import has_double_support import six from six.moves import range, zip, reduce import numbers +import struct def _get_common_dtype(obj1, obj2): @@ -118,37 +119,59 @@ def splay(n, dev=None): def _make_binary_op(operator): def func(self, other): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + # if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") if isinstance(other, GPUArray): assert self.shape == other.shape - if not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + # if not other.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") + + noncontig = is_noncontig(self, other) result = self._new_like_me() func = elementwise.get_binary_op_kernel( self.dtype, other.dtype, result.dtype, - operator) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, - self.mem_size) + operator, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + other.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other.gpudata, result.gpudata, + self.mem_size) return result else: # scalar operator result = self._new_like_me() + noncontig = is_noncontig(self) func = elementwise.get_scalar_op_kernel( - self.dtype, result.dtype, operator) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other, result.gpudata, - self.mem_size) + self.dtype, result.dtype, operator, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other, result.gpudata, + self.mem_size) return result return func +def is_noncontig(*vecs): + r = False + for v in vecs: r = r or not v.flags.forc + return r class GPUArray(object): """A GPUArray is used to do array-based calculation on the GPU. 
@@ -158,6 +181,7 @@ class GPUArray(object):
     """
     __array_priority__ = 100
+    __max_dims__ = 2

     def __init__(self, shape, dtype, allocator=drv.mem_alloc,
             base=None, gpudata=None, strides=None, order="C"):
@@ -227,6 +251,22 @@ class GPUArray(object):
     def flags(self):
         return _ArrayFlags(self)

+    @property
+    def device_shape_and_strides(self):
+        # one unsigned (shape) and one long (stride) per dimension; size the
+        # allocation with calcsize so alignment padding is accounted for
+        fmt = 'I'*self.__max_dims__ + 'l'*self.__max_dims__
+        sar = drv.mem_alloc(struct.calcsize(fmt))
+        newstrides = tuple(v // self.dtype.itemsize for v in self.strides)
+        shape = self.shape[::-1]
+        newstrides = newstrides[::-1]
+        if len(shape) < self.__max_dims__:
+            shape = shape + (1,)*(self.__max_dims__-len(shape))
+            newstrides = newstrides + (0,)*(self.__max_dims__-len(newstrides))
+        data = shape + newstrides
+        drv.memcpy_htod(int(sar), struct.pack(fmt, *data))
+        self.__sar = sar  # keep a reference so the buffer outlives the launch
+        return sar
+
     def set(self, ary, async=False, stream=None):
         if ary.size != self.size:
             raise ValueError("ary and self must be the same size")
@@ -297,47 +337,74 @@ class GPUArray(object):
         """Compute ``out = selffac * self + otherfac*other``,
         where `other` is a vector.."""
         assert self.shape == other.shape
-        if not self.flags.forc or not other.flags.forc:
-            raise RuntimeError("only contiguous arrays may "
-                    "be used as arguments to this operation")

-        func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype)
+        noncontig = is_noncontig(self, other, out)

-        if add_timer is not None:
-            add_timer(3*self.size, func.prepared_timed_call(self._grid,
-                selffac, self.gpudata, otherfac, other.gpudata,
-                out.gpudata, self.mem_size))
+        func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype, use_strides=noncontig)
+
+        if noncontig:
+            if add_timer is not None:
+                add_timer(3*self.size, func.prepared_timed_call(
+                    self._grid, self._block,
+                    selffac, self.gpudata, otherfac, other.gpudata,
+                    out.gpudata, self.mem_size,
+                    self.device_shape_and_strides,
+                    other.device_shape_and_strides,
+                    out.device_shape_and_strides))
+            else:
+                func.prepared_async_call(self._grid, self._block, stream,
+                    selffac, self.gpudata, otherfac, other.gpudata,
+                    out.gpudata, self.mem_size,
+                    self.device_shape_and_strides,
+                    other.device_shape_and_strides,
+                    out.device_shape_and_strides)
         else:
-            func.prepared_async_call(self._grid, self._block, stream,
+            if add_timer is not None:
+                add_timer(3*self.size, func.prepared_timed_call(
+                    self._grid, self._block,
                     selffac, self.gpudata, otherfac, other.gpudata,
-                out.gpudata, self.mem_size)
+                    out.gpudata, self.mem_size))
+            else:
+                func.prepared_async_call(self._grid, self._block, stream,
+                    selffac, self.gpudata, otherfac, other.gpudata,
+                    out.gpudata, self.mem_size)
+
         return out

     def _axpbz(self, selffac, other, out, stream=None):
         """Compute ``out = selffac * self + other``, where
         `other` is a scalar."""
-        if not self.flags.forc:
-            raise RuntimeError("only contiguous arrays may "
-                    "be used as arguments to this operation")
-
-        func = elementwise.get_axpbz_kernel(self.dtype, out.dtype)
-        func.prepared_async_call(self._grid, self._block, stream,
-                selffac, self.gpudata,
-                other, out.gpudata, self.mem_size)
+        noncontig = is_noncontig(self, out)
+        func = elementwise.get_axpbz_kernel(self.dtype, out.dtype, use_strides=noncontig)
+        if noncontig:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    selffac, self.gpudata,
+                    other, out.gpudata, self.mem_size,
+                    self.device_shape_and_strides,
+                    out.device_shape_and_strides)
+        else:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    selffac, self.gpudata,
+                    other,
out.gpudata, self.mem_size) return out def _elwise_multiply(self, other, out, stream=None): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + + noncontig = is_noncontig(self, other, out) func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, - out.dtype, "*") - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + out.dtype, "*", use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other.gpudata, + out.gpudata, self.mem_size, + self.device_shape_and_strides, + other.device_shape_and_strides, + out.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other.gpudata, + out.gpudata, self.mem_size) return out @@ -347,31 +414,43 @@ class GPUArray(object): y = n / self """ - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + noncontig = is_noncontig(self, out) - func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype) - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other, - out.gpudata, self.mem_size) + func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype, use_strides=noncontig) + + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other, + out.gpudata, self.mem_size, + self.device_shape_and_strides, + out.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other, + out.gpudata, self.mem_size) return out def _div(self, other, out, stream=None): """Divides an array by another array.""" - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - assert self.shape == other.shape + noncontig = is_noncontig(self, other, out) + func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, - out.dtype, "/") - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + out.dtype, "/", use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other.gpudata, + out.gpudata, self.mem_size, + self.device_shape_and_strides, + other.device_shape_and_strides, + out.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other.gpudata, + out.gpudata, self.mem_size) return out @@ -381,7 +460,8 @@ class GPUArray(object): dtype = self.dtype else: if dtype == self.dtype: - strides = self.strides + # strides = self.strides + strides = None return self.__class__(self.shape, dtype, allocator=self.allocator, strides=strides) @@ -513,9 +593,17 @@ class GPUArray(object): def fill(self, value, stream=None): """fills the array with the specified value""" - func = elementwise.get_fill_kernel(self.dtype) - func.prepared_async_call(self._grid, self._block, stream, - value, self.gpudata, self.mem_size) + noncontig = is_noncontig(self) + + func = elementwise.get_fill_kernel(self.dtype, use_strides=noncontig) + + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + value, self.gpudata, self.mem_size, + self.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + value, self.gpudata, self.mem_size) return self @@ -597,11 +685,19 @@ 
class GPUArray(object): else: out_dtype = self.dtype + noncontig = is_noncontig(self, result) + func = elementwise.get_unary_func_kernel(fname, self.dtype, - out_dtype=out_dtype) + out_dtype=out_dtype, use_strides=noncontig) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, self.mem_size) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, self.mem_size) return result @@ -616,32 +712,46 @@ class GPUArray(object): """ if isinstance(other, GPUArray): - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - assert self.shape == other.shape result = self._new_like_me(_get_common_dtype(self, other)) - func = elementwise.get_pow_array_kernel( - self.dtype, other.dtype, result.dtype) + non_contig = is_noncontig(self, other, result) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, - self.mem_size) + func = elementwise.get_pow_array_kernel( + self.dtype, other.dtype, result.dtype, use_strides=non_contig) + + if non_contig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + other.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other.gpudata, result.gpudata, + self.mem_size) return result else: - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + # if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") result = self._new_like_me() - func = elementwise.get_pow_kernel(self.dtype) - func.prepared_async_call(self._grid, self._block, None, - other, self.gpudata, result.gpudata, - self.mem_size) + + noncontig = is_noncontig(self, result) + func = elementwise.get_pow_kernel(self.dtype, use_strides=noncontig) + + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + other, self.gpudata, result.gpudata, + self.mem_size, self.device_shape_and_strides, result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + other, self.gpudata, result.gpudata, + self.mem_size) return result @@ -650,33 +760,43 @@ class GPUArray(object): as one-dimensional. 
""" - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - result = self._new_like_me() - func = elementwise.get_reverse_kernel(self.dtype) - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, result.gpudata, - self.mem_size) + noncontig = is_noncontig(self, result) + + func = elementwise.get_reverse_kernel(self.dtype, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, result.gpudata, + self.mem_size) return result def astype(self, dtype, stream=None): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - if dtype == self.dtype: return self.copy() + noncontig = is_noncontig(self) + result = self._new_like_me(dtype=dtype) - func = elementwise.get_copy_kernel(dtype, self.dtype) - func.prepared_async_call(self._grid, self._block, stream, - result.gpudata, self.gpudata, - self.mem_size) + func = elementwise.get_copy_kernel(dtype, self.dtype, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + result.gpudata, self.gpudata, + self.mem_size, + result.device_shape_and_strides, + self.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + result.gpudata, self.gpudata, + self.mem_size) return result @@ -903,10 +1023,19 @@ class GPUArray(object): result = self._new_like_me(dtype=real_dtype) - func = elementwise.get_real_kernel(dtype, real_dtype) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, - self.mem_size) + noncontig = is_noncontig(self) + + func = elementwise.get_real_kernel(dtype, real_dtype, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size) return result else: @@ -916,19 +1045,25 @@ class GPUArray(object): def imag(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) result = self._new_like_me(dtype=real_dtype) - func = elementwise.get_imag_kernel(dtype, real_dtype) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, - self.mem_size) + noncontig = is_noncontig(self) + + func = elementwise.get_imag_kernel(dtype, real_dtype, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size) return result else: @@ -937,16 +1072,21 @@ class GPUArray(object): def conj(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") 
result = self._new_like_me() - - func = elementwise.get_conj_kernel(dtype) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, - self.mem_size) + noncontig = is_noncontig(self) + + func = elementwise.get_conj_kernel(dtype, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size) return result else:
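
Reviewer notes (illustrative sketches, not part of the patch):

The i2m device function converts the kernel's linear iteration index into a memory offset by peeling off one coordinate per dimension, fastest-varying axis first; this is why device_shape_and_strides reverses shape and strides before packing. A host-side Python mirror, cross-checked against numpy (all names here are illustrative):

    import numpy as np

    def i2m(i, n, stride):
        # peel coordinates off the linear index, fastest axis first,
        # and accumulate per-axis element-stride offsets
        m, j = 0, i
        for nk, sk in zip(n, stride):
            j, r = divmod(j, nk)
            m += sk * r
        return m

    base = np.arange(24)
    a = base.reshape(4, 6)[::2, ::3]                  # non-contiguous 2x2 view
    n = a.shape[::-1]                                 # reversed: fastest first
    stride = tuple(s // a.itemsize for s in a.strides)[::-1]
    ref = a.reshape(-1)                               # C-order copy as reference
    assert all(base[i2m(i, n, stride)] == ref[i] for i in range(a.size))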
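The dim struct pairs unsigned n[max_dims] with long stride[max_dims]. With native alignment the packed blob is larger than max_dims*8 on common LP64 platforms (4 bytes of padding before the first long), which is why the allocation in device_shape_and_strides is sized with struct.calcsize rather than a hard-coded per-dimension itemsize. A quick check, assuming a 64-bit Linux build:

    import struct

    max_dims = 2
    fmt = 'I' * max_dims + 'l' * max_dims      # unsigned n[2], long stride[2]
    print(struct.calcsize(fmt))                # 24 on LP64, not max_dims*8 == 16

    shape, strides = (5, 1), (1, 0)            # reversed and padded, in elements
    blob = struct.pack(fmt, *(shape + strides))
    assert len(blob) == struct.calcsize(fmt)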
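The source-rewriting step can be checked in isolation: every subscript of a vector argument should be routed through i2m, while other names are left alone. A standalone sketch of the non-recursive branch (the operation string and names are illustrative):

    import re

    names = ['x', 'z']                         # vector arguments, in order

    def fix_indexing(t):
        try:
            n = names.index(t.group(1))
        except ValueError:                     # not a vector argument
            return t.group(0)
        return '{}[i2m({},*__dim{})]'.format(t.group(1), t.group(2), n)

    regex = r"([a-zA-Z_][a-zA-Z0-9_]*)\[(.+?)\]"
    print(re.sub(regex, fix_indexing, "z[i] = a * x[i] + b"))
    # -> z[i2m(i,*__dim1)] = a * x[i2m(i,*__dim0)] + b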
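If the patch behaves as intended, elementwise operations that previously raised "only contiguous arrays may be used as arguments to this operation" should now accept strided views. A hedged end-to-end smoke test; it assumes this branch is installed and that GPUArray slicing yields a non-contiguous view, as in recent PyCUDA releases:

    import numpy as np
    import pycuda.autoinit                     # noqa: F401 -- creates a context
    import pycuda.gpuarray as gpuarray
    import pycuda.cumath as cumath

    a = np.random.rand(8, 8).astype(np.float32)
    d = gpuarray.to_gpu(a)
    view = d[::2, ::2]                         # strided device view
    assert not view.flags.forc

    # previously a RuntimeError; now dispatched with use_strides=True
    np.testing.assert_allclose(cumath.exp(view).get(),
            np.exp(a[::2, ::2]), rtol=1e-5)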