From ccf6fd131024306adf71d123394f972b614698fd Mon Sep 17 00:00:00 2001
From: Keegan Owsley <keegano@gmail.com>
Date: Mon, 28 Nov 2016 15:49:00 -0800
Subject: [PATCH 01/10] Allow noncontiguous arrays in elementwise ops

---
 pycuda/elementwise.py | 135 +++++++++++++++++++++++++++++++++++++++---
 pycuda/gpuarray.py    | 120 ++++++++++++++++++++++---------------
 2 files changed, 201 insertions(+), 54 deletions(-)

diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index feab0a6b..61fcc2ad 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -34,9 +34,117 @@ OTHER DEALINGS IN THE SOFTWARE.
 
 from pycuda.tools import context_dependent_memoize
 import numpy as np
-from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg
+from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg, Argument
 from pytools import memoize_method
 
+import pycuda.driver as cuda
+
+import struct
+import re
+
+class DimArrayCuda:
+    max_dims = 2
+    itemsize = 4 + 4 # unsigned int and long int
+    def __init__(self, vector, n=0): # n unused
+        if hasattr(vector, '_dimholder'):
+            self.gpudata = vector._dimholder.gpudata
+        else:
+            self.gpudata = cuda.mem_alloc(self.max_dims*self.itemsize)
+            newstrides = tuple(v // vector.dtype.itemsize for v in vector.strides)
+            shape = vector.shape[::-1]
+            newstrides = newstrides[::-1]
+            if len(shape) < self.max_dims:
+                shape = shape + (1,)*(self.max_dims-len(shape))
+                newstrides = newstrides + (0,)*(self.max_dims-len(newstrides))
+            data = shape + newstrides
+            cuda.memcpy_htod(int(self.gpudata), struct.pack('I'*self.max_dims + 'l'*self.max_dims, *data))
+            vector._dimholder = self
+
+class _DimHolder:
+    struct_char = 'P'
+
+def get_elwise_module_noncontig(arguments, operation,
+                                name="kernel", keep=False, options=None,
+                                preamble="", loop_prep="", after_loop="", max_dims=2):
+    from pycuda.compiler import SourceModule
+
+    vecargs = tuple([a for a in arguments if isinstance(a, VectorArg)])
+    names = [V.name for V in vecargs]
+
+    def fix_indexing(t):
+        try:
+            n = names.index(t.group(1))
+        except NameError:
+            return t.group(0)
+        return '{}[i2m({},*__dim{})]'.format(t.group(1),t.group(2),n)
+
+    # regex = "([a-zA-Z_][a-zA-Z0-9_]*)\[((?:[^\[\](?:)]|(?R))*)\]"
+    # Above works recursively to match outer brackets, we can switch if we deem it necessary
+    # Requires regex module instead of re
+    regex = "([a-zA-Z_][a-zA-Z0-9_]*)\[(.+?)\]"
+    operation = re.sub(regex, fix_indexing, operation)
+
+    dimdef = '''
+    typedef struct
+    {
+        unsigned n[%(max_dims)i];
+        long stride[%(max_dims)i];
+    } dim;
+
+    __device__ unsigned i2m(unsigned i, dim d)
+    {
+        unsigned m = 0;
+        unsigned j = i;
+        for(int k = 0; k < %(max_dims)i; k++)
+        {
+            m += d.stride[k] * (j%%d.n[k]);
+            j = j / d.n[k];
+        }
+        return m;
+    }
+    ''' % {"max_dims":max_dims}
+
+    args = list(arg.declarator() for arg in arguments)
+    args.extend('dim *__dim{}'.format(i) for i in range(len(vecargs)))
+
+
+
+    src = """
+        #include <pycuda-complex.hpp>
+
+        %(dimdef)s
+
+        %(preamble)s
+
+        __global__ void %(name)s(%(arguments)s)
+        {
+
+          unsigned tid = threadIdx.x;
+          unsigned total_threads = gridDim.x*blockDim.x;
+          unsigned cta_start = blockDim.x*blockIdx.x;
+          unsigned i;
+
+          %(loop_prep)s;
+
+          for (i = cta_start + tid; i < n; i += total_threads)
+          {
+            %(operation)s;
+          }
+
+          %(after_loop)s;
+        }
+        """ % {
+            "arguments": ", ".join(args),
+            "operation": operation,
+            "name": name,
+            "preamble": preamble,
+            "loop_prep": loop_prep,
+            "after_loop": after_loop,
+            "dimdef": dimdef
+            }
+
+    return SourceModule(src,
+        options=options, keep=keep)
 
 def get_elwise_module(arguments, operation,
         name="kernel", keep=False, options=None,
@@ -124,7 +232,7 @@ def get_elwise_range_module(arguments, operation,
 
 
 def get_elwise_kernel_and_types(arguments, operation,
-        name="kernel", keep=False, options=None, use_range=False, **kwargs):
+        name="kernel", keep=False, options=None, use_range=False, use_slices=False, **kwargs):
     if isinstance(arguments, str):
         from pycuda.tools import parse_c_arg
         arguments = [parse_c_arg(arg) for arg in arguments.split(",")]
@@ -138,18 +246,26 @@ def get_elwise_kernel_and_types(arguments, operation,
     else:
         arguments.append(ScalarArg(np.uintp, "n"))
 
+    #
+    # if use_slices:
+    #     module_builder = get_elwise_module_noncontig
     if use_range:
         module_builder = get_elwise_range_module
     else:
-        module_builder = get_elwise_module
+        module_builder = get_elwise_module_noncontig
 
     mod = module_builder(arguments, operation, name,
             keep, options, **kwargs)
 
+    vectors = [a for a in arguments if isinstance(a, VectorArg)]
+
+    oldargs = arguments.copy()
+    for i,v in enumerate(vectors):
+        arguments.append(_DimHolder())
     func = mod.get_function(name)
     func.prepare("".join(arg.struct_char for arg in arguments))
 
-    return func, arguments
+    return func, oldargs
 
 
 def get_elwise_kernel(arguments, operation,
@@ -200,15 +316,20 @@ class ElementwiseKernel:
 
         for arg, arg_descr in zip(args, arguments):
             if isinstance(arg_descr, VectorArg):
-                if not arg.flags.forc:
-                    raise RuntimeError("elementwise kernel cannot "
-                            "deal with non-contiguous arrays")
+                # if not arg.flags.forc:
+                #     raise RuntimeError("elementwise kernel cannot "
+                #             "deal with non-contiguous arrays")
+
+
 
                 vectors.append(arg)
                 invocation_args.append(arg.gpudata)
             else:
                 invocation_args.append(arg)
 
+        for i,v in enumerate(vectors):
+            invocations_args.append(DimArrayCuda(v,i).gpudata)
+
         repr_vec = vectors[0]
 
         if slice_ is not None:
diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index 896b2fca..21b7bde2 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -118,16 +118,16 @@ def splay(n, dev=None):
 
 def _make_binary_op(operator):
     def func(self, other):
-        if not self.flags.forc:
-            raise RuntimeError("only contiguous arrays may "
-                    "be used as arguments to this operation")
+        # if not self.flags.forc:
+        #     raise RuntimeError("only contiguous arrays may "
+        #             "be used as arguments to this operation")
 
         if isinstance(other, GPUArray):
             assert self.shape == other.shape
 
-            if not other.flags.forc:
-                raise RuntimeError("only contiguous arrays may "
-                        "be used as arguments to this operation")
+            # if not other.flags.forc:
+            #     raise RuntimeError("only contiguous arrays may "
+            #             "be used as arguments to this operation")
 
             result = self._new_like_me()
             func = elementwise.get_binary_op_kernel(
@@ -135,7 +135,10 @@ def _make_binary_op(operator):
                     operator)
             func.prepared_async_call(self._grid, self._block, None,
                     self.gpudata, other.gpudata, result.gpudata,
-                    self.mem_size)
+                    self.mem_size,
+                    elementwise.DimArrayCuda(self, 0).gpudata,
+                    elementwise.DimArrayCuda(other, 1).gpudata,
+                    elementwise.DimArrayCuda(result, 2).gpudata)
 
             return result
         else:  # scalar operator
@@ -144,7 +147,9 @@ def _make_binary_op(operator):
                     self.dtype, result.dtype, operator)
             func.prepared_async_call(self._grid, self._block, None,
                     self.gpudata, other, result.gpudata,
-                    self.mem_size)
+                    self.mem_size,
+                    elementwise.DimArrayCuda(self, 0).gpudata,
+                    elementwise.DimArrayCuda(result, 1).gpudata)
             return result
 
     return func
@@ -297,47 +302,52 @@ class GPUArray(object):
         """Compute ``out = selffac * self + otherfac*other``,
         where `other` is a vector.."""
         assert self.shape == other.shape
-        if not self.flags.forc or not other.flags.forc:
-            raise RuntimeError("only contiguous arrays may "
-                    "be used as arguments to this operation")
 
         func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype)
 
         if add_timer is not None:
             add_timer(3*self.size, func.prepared_timed_call(self._grid,
                 selffac, self.gpudata, otherfac, other.gpudata,
-                out.gpudata, self.mem_size))
+                out.gpudata, self.mem_size,
+                elementwise.DimArrayCuda(self, 0).gpudata,
+                elementwise.DimArrayCuda(other, 1).gpudata,
+                elementwise.DimArrayCuda(out, 2).gpudata))
         else:
             func.prepared_async_call(self._grid, self._block, stream,
                     selffac, self.gpudata, otherfac, other.gpudata,
-                    out.gpudata, self.mem_size)
+                    out.gpudata, self.mem_size,
+                    elementwise.DimArrayCuda(self, 0).gpudata,
+                    elementwise.DimArrayCuda(other, 1).gpudata,
+                    elementwise.DimArrayCuda(out, 2).gpudata)
 
         return out
 
     def _axpbz(self, selffac, other, out, stream=None):
         """Compute ``out = selffac * self + other``, where `other` is a scalar."""
 
-        if not self.flags.forc:
-            raise RuntimeError("only contiguous arrays may "
-                    "be used as arguments to this operation")
 
         func = elementwise.get_axpbz_kernel(self.dtype, out.dtype)
         func.prepared_async_call(self._grid, self._block, stream,
                 selffac, self.gpudata,
-                other, out.gpudata, self.mem_size)
+                other, out.gpudata, self.mem_size,
+                elementwise.DimArrayCuda(self).gpudata,
+                elementwise.DimArrayCuda(out).gpudata)
 
         return out
 
     def _elwise_multiply(self, other, out, stream=None):
-        if not self.flags.forc:
-            raise RuntimeError("only contiguous arrays may "
-                    "be used as arguments to this operation")
+        # if not self.flags.forc:
+            # raise RuntimeError("only contiguous arrays may "
+            #         "be used as arguments to this operation")
 
         func = elementwise.get_binary_op_kernel(self.dtype, other.dtype,
                 out.dtype, "*")
         func.prepared_async_call(self._grid, self._block, stream,
                 self.gpudata, other.gpudata,
-                out.gpudata, self.mem_size)
+                out.gpudata, self.mem_size,
+                elementwise.DimArrayCuda(self).gpudata,
+                elementwise.DimArrayCuda(other).gpudata,
+                elementwise.DimArrayCuda(out).gpudata)
 
         return out
 
@@ -354,24 +364,25 @@ class GPUArray(object):
         func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype)
         func.prepared_async_call(self._grid, self._block, stream,
                 self.gpudata, other,
-                out.gpudata, self.mem_size)
+                out.gpudata, self.mem_size,
+                elementwise.DimArrayCuda(self).gpudata,
+                elementwise.DimArrayCuda(out).gpudata)
 
         return out
 
     def _div(self, other, out, stream=None):
         """Divides an array by another array."""
 
-        if not self.flags.forc or not other.flags.forc:
-            raise RuntimeError("only contiguous arrays may "
-                    "be used as arguments to this operation")
-
         assert self.shape == other.shape
 
         func = elementwise.get_binary_op_kernel(self.dtype, other.dtype,
                 out.dtype, "/")
         func.prepared_async_call(self._grid, self._block, stream,
                 self.gpudata, other.gpudata,
-                out.gpudata, self.mem_size)
+                out.gpudata, self.mem_size,
+                elementwise.DimArrayCuda(self, 0).gpudata,
+                elementwise.DimArrayCuda(other, 1).gpudata,
+                elementwise.DimArrayCuda(out, 2).gpudata)
 
         return out
 
@@ -381,7 +392,8 @@ class GPUArray(object):
             dtype = self.dtype
         else:
             if dtype == self.dtype:
-                strides = self.strides
+                # strides = self.strides
+                strides = None
 
         return self.__class__(self.shape, dtype,
                 allocator=self.allocator, strides=strides)
@@ -515,7 +527,8 @@ class GPUArray(object):
         """fills the array with the specified value"""
         func = elementwise.get_fill_kernel(self.dtype)
         func.prepared_async_call(self._grid, self._block, stream,
-                value, self.gpudata, self.mem_size)
+                value, self.gpudata, self.mem_size,
+                elementwise.DimArrayCuda(self, 0).gpudata)
 
         return self
 
@@ -601,7 +614,9 @@ class GPUArray(object):
                 out_dtype=out_dtype)
 
         func.prepared_async_call(self._grid, self._block, None,
-                self.gpudata, result.gpudata, self.mem_size)
+                self.gpudata, result.gpudata, self.mem_size,
+                    elementwise.DimArrayCuda(self, 0).gpudata,
+                    elementwise.DimArrayCuda(result, 1).gpudata)
 
         return result
 
@@ -616,9 +631,9 @@ class GPUArray(object):
         """
 
         if isinstance(other, GPUArray):
-            if not self.flags.forc or not other.flags.forc:
-                raise RuntimeError("only contiguous arrays may "
-                        "be used as arguments to this operation")
+            # if not self.flags.forc or not other.flags.forc:
+            #     raise RuntimeError("only contiguous arrays may "
+            #             "be used as arguments to this operation")
 
             assert self.shape == other.shape
 
@@ -629,19 +644,22 @@ class GPUArray(object):
 
             func.prepared_async_call(self._grid, self._block, None,
                     self.gpudata, other.gpudata, result.gpudata,
-                    self.mem_size)
+                    self.mem_size,
+                    elementwise.DimArrayCuda(self, 0).gpudata,
+                    elementwise.DimArrayCuda(other, 1).gpudata,
+                    elementwise.DimArrayCuda(result, 2).gpudata)
 
             return result
         else:
-            if not self.flags.forc:
-                raise RuntimeError("only contiguous arrays may "
-                        "be used as arguments to this operation")
+            # if not self.flags.forc:
+            #     raise RuntimeError("only contiguous arrays may "
+            #             "be used as arguments to this operation")
 
             result = self._new_like_me()
             func = elementwise.get_pow_kernel(self.dtype)
             func.prepared_async_call(self._grid, self._block, None,
                     other, self.gpudata, result.gpudata,
-                    self.mem_size)
+                    self.mem_size, elementwise.DimArrayCuda(self, 0).gpudata, elementwise.DimArrayCuda(result, 1).gpudata)
 
             return result
 
@@ -659,7 +677,9 @@ class GPUArray(object):
         func = elementwise.get_reverse_kernel(self.dtype)
         func.prepared_async_call(self._grid, self._block, stream,
                 self.gpudata, result.gpudata,
-                self.mem_size)
+                self.mem_size,
+                elementwise.DimArrayCuda(self).gpudata,
+                elementwise.DimArrayCuda(out).gpudata)
 
         return result
 
@@ -676,7 +696,9 @@ class GPUArray(object):
         func = elementwise.get_copy_kernel(dtype, self.dtype)
         func.prepared_async_call(self._grid, self._block, stream,
                 result.gpudata, self.gpudata,
-                self.mem_size)
+                self.mem_size,
+                elementwise.DimArrayCuda(result).gpudata,
+                elementwise.DimArrayCuda(self).gpudata)
 
         return result
 
@@ -906,7 +928,9 @@ class GPUArray(object):
             func = elementwise.get_real_kernel(dtype, real_dtype)
             func.prepared_async_call(self._grid, self._block, None,
                     self.gpudata, result.gpudata,
-                    self.mem_size)
+                    self.mem_size,
+                    elementwise.DimArrayCuda(self).gpudata,
+                    elementwise.DimArrayCuda(result).gpudata)
 
             return result
         else:
@@ -916,9 +940,6 @@ class GPUArray(object):
     def imag(self):
         dtype = self.dtype
         if issubclass(self.dtype.type, np.complexfloating):
-            if not self.flags.forc:
-                raise RuntimeError("only contiguous arrays may "
-                        "be used as arguments to this operation")
 
             from pytools import match_precision
             real_dtype = match_precision(np.dtype(np.float64), dtype)
@@ -928,7 +949,9 @@ class GPUArray(object):
             func = elementwise.get_imag_kernel(dtype, real_dtype)
             func.prepared_async_call(self._grid, self._block, None,
                     self.gpudata, result.gpudata,
-                    self.mem_size)
+                    self.mem_size,
+                    elementwise.DimArrayCuda(self).gpudata,
+                    elementwise.DimArrayCuda(result).gpudata)
 
             return result
         else:
@@ -946,7 +969,9 @@ class GPUArray(object):
             func = elementwise.get_conj_kernel(dtype)
             func.prepared_async_call(self._grid, self._block, None,
                     self.gpudata, result.gpudata,
-                    self.mem_size)
+                    self.mem_size,
+                    elementwise.DimArrayCuda(self).gpudata,
+                    elementwise.DimArrayCuda(result).gpudata)
 
             return result
         else:
@@ -1092,7 +1117,8 @@ def arange(*args, **kwargs):
 
     func = elementwise.get_arange_kernel(dtype)
     func.prepared_async_call(result._grid, result._block, kwargs.get("stream"),
-            result.gpudata, start, step, size)
+            result.gpudata, start, step, size,
+            elementwise.DimArrayCuda(result).gpudata)
 
     return result
 
-- 
GitLab


From 22c49c8b6c988d91fc57399c68a183ccd25ed3d5 Mon Sep 17 00:00:00 2001
From: Keegan Owsley <keegano@gmail.com>
Date: Thu, 1 Dec 2016 14:31:37 -0800
Subject: [PATCH 02/10] Only use noncontig kernels as necessary

---
 pycuda/elementwise.py | 137 +++++++---------
 pycuda/gpuarray.py    | 369 +++++++++++++++++++++++++++---------------
 2 files changed, 302 insertions(+), 204 deletions(-)

diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index 61fcc2ad..12b588d2 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -37,28 +37,10 @@ import numpy as np
 from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg, Argument
 from pytools import memoize_method
 
-import pycuda.driver as cuda
 
 import struct
 import re
 
-class DimArrayCuda:
-    max_dims = 2
-    itemsize = 4 + 4 # unsigned int and long int
-    def __init__(self, vector, n=0): # n unused
-        if hasattr(vector, '_dimholder'):
-            self.gpudata = vector._dimholder.gpudata
-        else:
-            self.gpudata = cuda.mem_alloc(self.max_dims*self.itemsize)
-            newstrides = tuple(v // vector.dtype.itemsize for v in vector.strides)
-            shape = vector.shape[::-1]
-            newstrides = newstrides[::-1]
-            if len(shape) < self.max_dims:
-                shape = shape + (1,)*(self.max_dims-len(shape))
-                newstrides = newstrides + (0,)*(self.max_dims-len(newstrides))
-            data = shape + newstrides
-            cuda.memcpy_htod(int(self.gpudata), struct.pack('I'*self.max_dims + 'l'*self.max_dims, *data))
-            vector._dimholder = self
 
 class _DimHolder:
     struct_char = 'P'
@@ -142,6 +124,7 @@ def get_elwise_module_noncontig(arguments, operation,
             "after_loop": after_loop,
             "dimdef": dimdef
             }
+    print(src)
 
     return SourceModule(src,
         options=options, keep=keep)
@@ -232,7 +215,7 @@ def get_elwise_range_module(arguments, operation,
 
 
 def get_elwise_kernel_and_types(arguments, operation,
-        name="kernel", keep=False, options=None, use_range=False, use_slices=False, **kwargs):
+                                name="kernel", keep=False, options=None, use_range=False, use_strides=False, **kwargs):
     if isinstance(arguments, str):
         from pycuda.tools import parse_c_arg
         arguments = [parse_c_arg(arg) for arg in arguments.split(",")]
@@ -247,23 +230,25 @@ def get_elwise_kernel_and_types(arguments, operation,
         arguments.append(ScalarArg(np.uintp, "n"))
 
     #
-    # if use_slices:
-    #     module_builder = get_elwise_module_noncontig
-    if use_range:
+    if use_strides:
+        module_builder = get_elwise_module_noncontig
+    elif use_range:
         module_builder = get_elwise_range_module
     else:
-        module_builder = get_elwise_module_noncontig
+        module_builder = get_elwise_module
 
     mod = module_builder(arguments, operation, name,
             keep, options, **kwargs)
 
-    vectors = [a for a in arguments if isinstance(a, VectorArg)]
-
     oldargs = arguments.copy()
-    for i,v in enumerate(vectors):
-        arguments.append(_DimHolder())
+    if use_strides:
+        vectors = [a for a in arguments if isinstance(a, VectorArg)]
+
+        for i,v in enumerate(vectors):
+            arguments.append(_DimHolder())
     func = mod.get_function(name)
     func.prepare("".join(arg.struct_char for arg in arguments))
+    func.stride_support = use_strides
 
     return func, oldargs
 
@@ -327,8 +312,8 @@ class ElementwiseKernel:
             else:
                 invocation_args.append(arg)
 
-        for i,v in enumerate(vectors):
-            invocations_args.append(DimArrayCuda(v,i).gpudata)
+        for v in vectors:
+            invocations_args.append(v.device_shape_and_strides)
 
         repr_vec = vectors[0]
 
@@ -462,14 +447,14 @@ def get_put_kernel(dtype, idx_dtype, vec_count=1):
 
 
 @context_dependent_memoize
-def get_copy_kernel(dtype_dest, dtype_src):
+def get_copy_kernel(dtype_dest, dtype_src, use_strides=False):
     return get_elwise_kernel(
             "%(tp_dest)s *dest, %(tp_src)s *src" % {
                 "tp_dest": dtype_to_ctype(dtype_dest),
                 "tp_src": dtype_to_ctype(dtype_src),
                 },
             "dest[i] = src[i]",
-            "copy")
+            "copy", use_strides=use_strides)
 
 
 @context_dependent_memoize
@@ -520,7 +505,7 @@ def get_linear_combination_kernel(summand_descriptors,
 
 
 @context_dependent_memoize
-def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z):
+def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z, use_strides=False):
     return get_elwise_kernel(
             "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z" % {
                 "tp_x": dtype_to_ctype(dtype_x),
@@ -528,22 +513,22 @@ def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z):
                 "tp_z": dtype_to_ctype(dtype_z),
                 },
             "z[i] = a*x[i] + b*y[i]",
-            "axpbyz")
+            "axpbyz", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_axpbz_kernel(dtype_x, dtype_z):
+def get_axpbz_kernel(dtype_x, dtype_z, use_strides=False):
     return get_elwise_kernel(
             "%(tp_z)s a, %(tp_x)s *x,%(tp_z)s b, %(tp_z)s *z" % {
                 "tp_x": dtype_to_ctype(dtype_x),
                 "tp_z": dtype_to_ctype(dtype_z)
                 },
             "z[i] = a * x[i] + b",
-            "axpb")
+            "axpb", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator):
+def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator, use_strides=False):
     return get_elwise_kernel(
             "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % {
                 "tp_x": dtype_to_ctype(dtype_x),
@@ -551,22 +536,22 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator):
                 "tp_z": dtype_to_ctype(dtype_z),
                 },
             "z[i] = x[i] %s y[i]" % operator,
-            "multiply")
+            "multiply", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_rdivide_elwise_kernel(dtype_x, dtype_z):
+def get_rdivide_elwise_kernel(dtype_x, dtype_z, use_strides=False):
     return get_elwise_kernel(
             "%(tp_x)s *x, %(tp_z)s y, %(tp_z)s *z" % {
                 "tp_x": dtype_to_ctype(dtype_x),
                 "tp_z": dtype_to_ctype(dtype_z),
                 },
             "z[i] = y / x[i]",
-            "divide_r")
+            "divide_r", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z):
+def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z, use_strides=False):
     return get_elwise_kernel(
             "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % {
                 "tp_x": dtype_to_ctype(dtype_x),
@@ -574,11 +559,11 @@ def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z):
                 "tp_z": dtype_to_ctype(dtype_z),
                 },
             "z[i] = %s(x[i], y[i])" % func,
-            func+"_kernel")
+            func+"_kernel", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z):
+def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z, use_strides=False):
     return get_elwise_kernel(
             "%(tp_x)s *x, %(tp_y)s y, %(tp_z)s *z" % {
                 "tp_x": dtype_to_ctype(dtype_x),
@@ -586,10 +571,10 @@ def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z):
                 "tp_z": dtype_to_ctype(dtype_z),
                 },
             "z[i] = %s(x[i], y)" % func,
-            func+"_kernel")
+            func+"_kernel", use_strides=use_strides)
 
 
-def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar):
+def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar, use_strides=False):
     if np.float64 not in [dtype_x, dtype_y]:
         func = func + "f"
 
@@ -598,75 +583,75 @@ def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar):
         func = "f"+func
 
     if use_scalar:
-        return get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z)
+        return get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z, use_strides=use_strides)
     else:
-        return get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z)
+        return get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z, use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_fill_kernel(dtype):
+def get_fill_kernel(dtype, use_strides=False):
     return get_elwise_kernel(
             "%(tp)s a, %(tp)s *z" % {
                 "tp": dtype_to_ctype(dtype),
                 },
             "z[i] = a",
-            "fill")
+            "fill", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_reverse_kernel(dtype):
+def get_reverse_kernel(dtype, use_strides=False):
     return get_elwise_kernel(
             "%(tp)s *y, %(tp)s *z" % {
                 "tp": dtype_to_ctype(dtype),
                 },
             "z[i] = y[n-1-i]",
-            "reverse")
+            "reverse", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_real_kernel(dtype, real_dtype):
+def get_real_kernel(dtype, real_dtype, use_strides=False):
     return get_elwise_kernel(
             "%(tp)s *y, %(real_tp)s *z" % {
                 "tp": dtype_to_ctype(dtype),
                 "real_tp": dtype_to_ctype(real_dtype),
                 },
             "z[i] = real(y[i])",
-            "real")
+            "real", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_imag_kernel(dtype, real_dtype):
+def get_imag_kernel(dtype, real_dtype, use_strides=False):
     return get_elwise_kernel(
             "%(tp)s *y, %(real_tp)s *z" % {
                 "tp": dtype_to_ctype(dtype),
                 "real_tp": dtype_to_ctype(real_dtype),
                 },
             "z[i] = imag(y[i])",
-            "imag")
+            "imag", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_conj_kernel(dtype):
+def get_conj_kernel(dtype, use_strides=False):
     return get_elwise_kernel(
             "%(tp)s *y, %(tp)s *z" % {
                 "tp": dtype_to_ctype(dtype),
                 },
             "z[i] = pycuda::conj(y[i])",
-            "conj")
+            "conj", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_arange_kernel(dtype):
+def get_arange_kernel(dtype, use_strides=False):
     return get_elwise_kernel(
             "%(tp)s *z, %(tp)s start, %(tp)s step" % {
                 "tp": dtype_to_ctype(dtype),
                 },
             "z[i] = start + i*step",
-            "arange")
+            "arange", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_pow_kernel(dtype):
+def get_pow_kernel(dtype, use_strides=False):
     if dtype == np.float32:
         func = "powf"
     else:
@@ -677,11 +662,11 @@ def get_pow_kernel(dtype):
                 "tp": dtype_to_ctype(dtype),
                 },
             "z[i] = %s(y[i], value)" % func,
-            "pow_method")
+            "pow_method", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_pow_array_kernel(dtype_x, dtype_y, dtype_z):
+def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, use_strides=False):
     if np.float64 in [dtype_x, dtype_y]:
         func = "pow"
     else:
@@ -694,27 +679,27 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z):
                 "tp_z": dtype_to_ctype(dtype_z),
                 },
             "z[i] = %s(x[i], y[i])" % func,
-            "pow_method")
+            "pow_method", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_fmod_kernel():
+def get_fmod_kernel(use_strides=False):
     return get_elwise_kernel(
             "float *arg, float *mod, float *z",
             "z[i] = fmod(arg[i], mod[i])",
-            "fmod_kernel")
+            "fmod_kernel", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_modf_kernel():
+def get_modf_kernel(use_strides=False):
     return get_elwise_kernel(
             "float *x, float *intpart ,float *fracpart",
             "fracpart[i] = modf(x[i], &intpart[i])",
-            "modf_kernel")
+            "modf_kernel", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_frexp_kernel():
+def get_frexp_kernel(use_strides=False):
     return get_elwise_kernel(
             "float *x, float *significand, float *exponent",
             """
@@ -722,19 +707,19 @@ def get_frexp_kernel():
                 significand[i] = frexp(x[i], &expt);
                 exponent[i] = expt;
             """,
-            "frexp_kernel")
+            "frexp_kernel", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_ldexp_kernel():
+def get_ldexp_kernel(use_strides=False):
     return get_elwise_kernel(
             "float *sig, float *expt, float *z",
             "z[i] = ldexp(sig[i], int(expt[i]))",
-            "ldexp_kernel")
+            "ldexp_kernel", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_unary_func_kernel(func_name, in_dtype, out_dtype=None):
+def get_unary_func_kernel(func_name, in_dtype, out_dtype=None, use_strides=False):
     if out_dtype is None:
         out_dtype = in_dtype
 
@@ -744,11 +729,11 @@ def get_unary_func_kernel(func_name, in_dtype, out_dtype=None):
                 "tp_out": dtype_to_ctype(out_dtype),
                 },
             "z[i] = %s(y[i])" % func_name,
-            "%s_kernel" % func_name)
+            "%s_kernel" % func_name, use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_if_positive_kernel(crit_dtype, dtype):
+def get_if_positive_kernel(crit_dtype, dtype, use_strides=False):
     return get_elwise_kernel([
             VectorArg(crit_dtype, "crit"),
             VectorArg(dtype, "then_"),
@@ -756,11 +741,11 @@ def get_if_positive_kernel(crit_dtype, dtype):
             VectorArg(dtype, "result"),
             ],
             "result[i] = crit[i] > 0 ? then_[i] : else_[i]",
-            "if_positive")
+            "if_positive", use_strides=use_strides)
 
 
 @context_dependent_memoize
-def get_scalar_op_kernel(dtype_x, dtype_y, operator):
+def get_scalar_op_kernel(dtype_x, dtype_y, operator, use_strides=False):
     return get_elwise_kernel(
             "%(tp_x)s *x, %(tp_a)s a, %(tp_y)s *y" % {
                 "tp_x": dtype_to_ctype(dtype_x),
@@ -768,4 +753,4 @@ def get_scalar_op_kernel(dtype_x, dtype_y, operator):
                 "tp_a": dtype_to_ctype(dtype_x),
                 },
             "y[i] = x[i] %s a" % operator,
-            "scalarop_kernel")
+            "scalarop_kernel", use_strides=use_strides)
diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index 21b7bde2..d944271b 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -14,6 +14,7 @@ from pycuda.characterize import has_double_support
 import six
 from six.moves import range, zip, reduce
 import numbers
+import struct
 
 
 def _get_common_dtype(obj1, obj2):
@@ -129,31 +130,48 @@ def _make_binary_op(operator):
             #     raise RuntimeError("only contiguous arrays may "
             #             "be used as arguments to this operation")
 
+            noncontig = is_noncontig(self, other)
+
             result = self._new_like_me()
             func = elementwise.get_binary_op_kernel(
                     self.dtype, other.dtype, result.dtype,
-                    operator)
-            func.prepared_async_call(self._grid, self._block, None,
-                    self.gpudata, other.gpudata, result.gpudata,
-                    self.mem_size,
-                    elementwise.DimArrayCuda(self, 0).gpudata,
-                    elementwise.DimArrayCuda(other, 1).gpudata,
-                    elementwise.DimArrayCuda(result, 2).gpudata)
+                    operator, use_strides=noncontig)
+            if noncontig:
+                func.prepared_async_call(self._grid, self._block, None,
+                        self.gpudata, other.gpudata, result.gpudata,
+                        self.mem_size,
+                        self.device_shape_and_strides,
+                        other.device_shape_and_strides,
+                        result.device_shape_and_strides)
+            else:
+                func.prepared_async_call(self._grid, self._block, None,
+                        self.gpudata, other.gpudata, result.gpudata,
+                        self.mem_size)
 
             return result
         else:  # scalar operator
             result = self._new_like_me()
+            noncontig = is_noncontig(self)
             func = elementwise.get_scalar_op_kernel(
-                    self.dtype, result.dtype, operator)
-            func.prepared_async_call(self._grid, self._block, None,
-                    self.gpudata, other, result.gpudata,
-                    self.mem_size,
-                    elementwise.DimArrayCuda(self, 0).gpudata,
-                    elementwise.DimArrayCuda(result, 1).gpudata)
+                    self.dtype, result.dtype, operator, use_strides=noncontig)
+            if noncontig:
+                func.prepared_async_call(self._grid, self._block, None,
+                        self.gpudata, other, result.gpudata,
+                        self.mem_size,
+                        self.device_shape_and_strides,
+                        result.device_shape_and_strides)
+            else:
+                func.prepared_async_call(self._grid, self._block, None,
+                        self.gpudata, other, result.gpudata,
+                        self.mem_size)
             return result
 
     return func
 
+def is_noncontig(*vecs):
+    r = False
+    for v in vecs: r = r or not v.flags.forc
+    return r
 
 class GPUArray(object):
     """A GPUArray is used to do array-based calculation on the GPU.
@@ -163,6 +181,7 @@ class GPUArray(object):
     """
 
     __array_priority__ = 100
+    __max_dims__ = 2
 
     def __init__(self, shape, dtype, allocator=drv.mem_alloc,
             base=None, gpudata=None, strides=None, order="C"):
@@ -232,6 +251,22 @@ class GPUArray(object):
     def flags(self):
         return _ArrayFlags(self)
 
+    @property
+    def device_shape_and_strides(self):
+        itemsize = 8 # unsigned int and long int
+
+        sar = drv.mem_alloc(self.__max_dims__*itemsize)
+        newstrides = tuple(v // self.dtype.itemsize for v in self.strides)
+        shape = self.shape[::-1]
+        newstrides = newstrides[::-1]
+        if len(shape) < self.__max_dims__:
+            shape = shape + (1,)*(self.__max_dims__-len(shape))
+            newstrides = newstrides + (0,)*(self.__max_dims__-len(newstrides))
+        data = shape + newstrides
+        drv.memcpy_htod(int(sar), struct.pack('I'*self.__max_dims__ + 'l'*self.__max_dims__, *data))
+        self.__sar = sar # don't garbage collect yet
+        return sar
+
     def set(self, ary, async=False, stream=None):
         if ary.size != self.size:
             raise ValueError("ary and self must be the same size")
@@ -303,51 +338,73 @@ class GPUArray(object):
         where `other` is a vector.."""
         assert self.shape == other.shape
 
-        func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype)
+        noncontig = is_noncontig(self, other, out)
 
-        if add_timer is not None:
-            add_timer(3*self.size, func.prepared_timed_call(self._grid,
-                selffac, self.gpudata, otherfac, other.gpudata,
-                out.gpudata, self.mem_size,
-                elementwise.DimArrayCuda(self, 0).gpudata,
-                elementwise.DimArrayCuda(other, 1).gpudata,
-                elementwise.DimArrayCuda(out, 2).gpudata))
-        else:
-            func.prepared_async_call(self._grid, self._block, stream,
+        func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype, use_strides=noncontig)
+
+        if noncontig:
+            if add_timer is not None:
+                add_timer(3*self.size, func.prepared_timed_call(self._grid,
                     selffac, self.gpudata, otherfac, other.gpudata,
                     out.gpudata, self.mem_size,
-                    elementwise.DimArrayCuda(self, 0).gpudata,
-                    elementwise.DimArrayCuda(other, 1).gpudata,
-                    elementwise.DimArrayCuda(out, 2).gpudata)
+                    self.device_shape_and_strides,
+                    other.device_shape_and_strides,
+                    out.device_shape_and_strides))
+            else:
+                func.prepared_async_call(self._grid, self._block, stream,
+                        selffac, self.gpudata, otherfac, other.gpudata,
+                        out.gpudata, self.mem_size,
+                        self.device_shape_and_strides,
+                        other.device_shape_and_strides,
+                        out.device_shape_and_strides)
+        else:
+            if add_timer is not None:
+                add_timer(3*self.size, func.prepared_timed_call(self._grid,
+                    selffac, self.gpudata, otherfac, other.gpudata,
+                    out.gpudata, self.mem_size))
+            else:
+                func.prepared_async_call(self._grid, self._block, stream,
+                        selffac, self.gpudata, otherfac, other.gpudata,
+                        out.gpudata, self.mem_size)
+
 
         return out
 
     def _axpbz(self, selffac, other, out, stream=None):
         """Compute ``out = selffac * self + other``, where `other` is a scalar."""
 
-
-        func = elementwise.get_axpbz_kernel(self.dtype, out.dtype)
-        func.prepared_async_call(self._grid, self._block, stream,
-                selffac, self.gpudata,
-                other, out.gpudata, self.mem_size,
-                elementwise.DimArrayCuda(self).gpudata,
-                elementwise.DimArrayCuda(out).gpudata)
+        noncontig = is_noncontig(self, out)
+        func = elementwise.get_axpbz_kernel(self.dtype, out.dtype, use_strides=noncontig)
+        if noncontig:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    selffac, self.gpudata,
+                    other, out.gpudata, self.mem_size,
+                    self.device_shape_and_strides,
+                    out.device_shape_and_strides)
+        else:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    selffac, self.gpudata,
+                    other, out.gpudata, self.mem_size)
 
         return out
 
     def _elwise_multiply(self, other, out, stream=None):
-        # if not self.flags.forc:
-            # raise RuntimeError("only contiguous arrays may "
-            #         "be used as arguments to this operation")
+
+        noncontig = is_noncontig(self, other, out)
 
         func = elementwise.get_binary_op_kernel(self.dtype, other.dtype,
-                out.dtype, "*")
-        func.prepared_async_call(self._grid, self._block, stream,
-                self.gpudata, other.gpudata,
-                out.gpudata, self.mem_size,
-                elementwise.DimArrayCuda(self).gpudata,
-                elementwise.DimArrayCuda(other).gpudata,
-                elementwise.DimArrayCuda(out).gpudata)
+                out.dtype, "*", use_strides=noncontig)
+        if noncontig:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    self.gpudata, other.gpudata,
+                    out.gpudata, self.mem_size,
+                    self.device_shape_and_strides,
+                    other.device_shape_and_strides,
+                    out.device_shape_and_strides)
+        else:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    self.gpudata, other.gpudata,
+                    out.gpudata, self.mem_size)
 
         return out
 
@@ -357,16 +414,20 @@ class GPUArray(object):
            y = n / self
         """
 
-        if not self.flags.forc:
-            raise RuntimeError("only contiguous arrays may "
-                    "be used as arguments to this operation")
+        noncontig = is_noncontig(self, out)
 
-        func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype)
-        func.prepared_async_call(self._grid, self._block, stream,
-                self.gpudata, other,
-                out.gpudata, self.mem_size,
-                elementwise.DimArrayCuda(self).gpudata,
-                elementwise.DimArrayCuda(out).gpudata)
+        func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype, use_strides=noncontig)
+
+        if noncontig:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    self.gpudata, other,
+                    out.gpudata, self.mem_size,
+                    self.device_shape_and_strides,
+                    out.device_shape_and_strides)
+        else:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    self.gpudata, other,
+                    out.gpudata, self.mem_size)
 
         return out
 
@@ -375,14 +436,21 @@ class GPUArray(object):
 
         assert self.shape == other.shape
 
+        noncontig = is_noncontig(self, other, out)
+
         func = elementwise.get_binary_op_kernel(self.dtype, other.dtype,
-                out.dtype, "/")
-        func.prepared_async_call(self._grid, self._block, stream,
-                self.gpudata, other.gpudata,
-                out.gpudata, self.mem_size,
-                elementwise.DimArrayCuda(self, 0).gpudata,
-                elementwise.DimArrayCuda(other, 1).gpudata,
-                elementwise.DimArrayCuda(out, 2).gpudata)
+                out.dtype, "/", use_strides=noncontig)
+        if noncontig:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    self.gpudata, other.gpudata,
+                    out.gpudata, self.mem_size,
+                    self.device_shape_and_strides,
+                    other.device_shape_and_strides,
+                    out.device_shape_and_strides)
+        else:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    self.gpudata, other.gpudata,
+                    out.gpudata, self.mem_size)
 
         return out
 
@@ -525,10 +593,17 @@ class GPUArray(object):
 
     def fill(self, value, stream=None):
         """fills the array with the specified value"""
-        func = elementwise.get_fill_kernel(self.dtype)
-        func.prepared_async_call(self._grid, self._block, stream,
-                value, self.gpudata, self.mem_size,
-                elementwise.DimArrayCuda(self, 0).gpudata)
+        noncontig = is_noncontig(self)
+
+        func = elementwise.get_fill_kernel(self.dtype, use_strides=noncontig)
+
+        if noncontig:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    value, self.gpudata, self.mem_size,
+                    self.device_shape_and_strides)
+        else:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    value, self.gpudata, self.mem_size)
 
         return self
 
@@ -610,13 +685,19 @@ class GPUArray(object):
         else:
             out_dtype = self.dtype
 
+        noncontig = is_noncontig(self, result)
+
         func = elementwise.get_unary_func_kernel(fname, self.dtype,
-                out_dtype=out_dtype)
+                out_dtype=out_dtype, use_strides=noncontig)
 
-        func.prepared_async_call(self._grid, self._block, None,
-                self.gpudata, result.gpudata, self.mem_size,
-                    elementwise.DimArrayCuda(self, 0).gpudata,
-                    elementwise.DimArrayCuda(result, 1).gpudata)
+        if noncontig:
+            func.prepared_async_call(self._grid, self._block, None,
+                    self.gpudata, result.gpudata, self.mem_size,
+                        self.device_shape_and_strides,
+                        result.device_shape_and_strides)
+        else:
+            func.prepared_async_call(self._grid, self._block, None,
+                    self.gpudata, result.gpudata, self.mem_size)
 
         return result
 
@@ -631,23 +712,26 @@ class GPUArray(object):
         """
 
         if isinstance(other, GPUArray):
-            # if not self.flags.forc or not other.flags.forc:
-            #     raise RuntimeError("only contiguous arrays may "
-            #             "be used as arguments to this operation")
-
             assert self.shape == other.shape
 
             result = self._new_like_me(_get_common_dtype(self, other))
 
-            func = elementwise.get_pow_array_kernel(
-                    self.dtype, other.dtype, result.dtype)
+            non_contig = is_noncontig(self, other, result)
 
-            func.prepared_async_call(self._grid, self._block, None,
-                    self.gpudata, other.gpudata, result.gpudata,
-                    self.mem_size,
-                    elementwise.DimArrayCuda(self, 0).gpudata,
-                    elementwise.DimArrayCuda(other, 1).gpudata,
-                    elementwise.DimArrayCuda(result, 2).gpudata)
+            func = elementwise.get_pow_array_kernel(
+                    self.dtype, other.dtype, result.dtype, use_strides=non_contig)
+
+            if non_contig:
+                func.prepared_async_call(self._grid, self._block, None,
+                        self.gpudata, other.gpudata, result.gpudata,
+                        self.mem_size,
+                        self.device_shape_and_strides,
+                        other.device_shape_and_strides,
+                        result.device_shape_and_strides)
+            else:
+                func.prepared_async_call(self._grid, self._block, None,
+                        self.gpudata, other.gpudata, result.gpudata,
+                        self.mem_size)
 
             return result
         else:
@@ -656,10 +740,18 @@ class GPUArray(object):
             #             "be used as arguments to this operation")
 
             result = self._new_like_me()
-            func = elementwise.get_pow_kernel(self.dtype)
-            func.prepared_async_call(self._grid, self._block, None,
-                    other, self.gpudata, result.gpudata,
-                    self.mem_size, elementwise.DimArrayCuda(self, 0).gpudata, elementwise.DimArrayCuda(result, 1).gpudata)
+
+            noncontig = is_noncontig(self, result)
+            func = elementwise.get_pow_kernel(self.dtype, use_strides=noncontig)
+
+            if noncontig:
+                func.prepared_async_call(self._grid, self._block, None,
+                        other, self.gpudata, result.gpudata,
+                        self.mem_size, self.device_shape_and_strides, result.device_shape_and_strides)
+            else:
+                func.prepared_async_call(self._grid, self._block, None,
+                        other, self.gpudata, result.gpudata,
+                        self.mem_size)
 
             return result
 
@@ -668,37 +760,43 @@ class GPUArray(object):
         as one-dimensional.
         """
 
-        if not self.flags.forc:
-            raise RuntimeError("only contiguous arrays may "
-                    "be used as arguments to this operation")
-
         result = self._new_like_me()
 
-        func = elementwise.get_reverse_kernel(self.dtype)
-        func.prepared_async_call(self._grid, self._block, stream,
-                self.gpudata, result.gpudata,
-                self.mem_size,
-                elementwise.DimArrayCuda(self).gpudata,
-                elementwise.DimArrayCuda(out).gpudata)
+        noncontig = is_noncontig(self, result)
+
+        func = elementwise.get_reverse_kernel(self.dtype, use_strides=noncontig)
+        if noncontig:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    self.gpudata, result.gpudata,
+                    self.mem_size,
+                    self.device_shape_and_strides,
+                    result.device_shape_and_strides)
+        else:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    self.gpudata, result.gpudata,
+                    self.mem_size)
 
         return result
 
     def astype(self, dtype, stream=None):
-        if not self.flags.forc:
-            raise RuntimeError("only contiguous arrays may "
-                    "be used as arguments to this operation")
-
         if dtype == self.dtype:
             return self.copy()
 
+        noncontig = is_noncontig(self, result)
+
         result = self._new_like_me(dtype=dtype)
 
-        func = elementwise.get_copy_kernel(dtype, self.dtype)
-        func.prepared_async_call(self._grid, self._block, stream,
-                result.gpudata, self.gpudata,
-                self.mem_size,
-                elementwise.DimArrayCuda(result).gpudata,
-                elementwise.DimArrayCuda(self).gpudata)
+        func = elementwise.get_copy_kernel(dtype, self.dtype, use_strides=noncontig)
+        if noncontig:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    result.gpudata, self.gpudata,
+                    self.mem_size,
+                    result.device_shape_and_strides,
+                    self.device_shape_and_strides)
+        else:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    result.gpudata, self.gpudata,
+                    self.mem_size)
 
         return result
 
@@ -925,12 +1023,18 @@ class GPUArray(object):
 
             result = self._new_like_me(dtype=real_dtype)
 
-            func = elementwise.get_real_kernel(dtype, real_dtype)
-            func.prepared_async_call(self._grid, self._block, None,
-                    self.gpudata, result.gpudata,
-                    self.mem_size,
-                    elementwise.DimArrayCuda(self).gpudata,
-                    elementwise.DimArrayCuda(result).gpudata)
+            noncontig = is_noncontig(self)
+
+            func = elementwise.get_real_kernel(dtype, real_dtype, use_strides=noncontig)
+            if noncontig:
+                func.prepared_async_call(self._grid, self._block, None,
+                        self.gpudata, result.gpudata,
+                        self.mem_size,
+                        self.device_shape_and_strides,
+                        result.device_shape_and_strides)
+                func.prepared_async_call(self._grid, self._block, None,
+                        self.gpudata, result.gpudata,
+                        self.mem_size)
 
             return result
         else:
@@ -946,12 +1050,19 @@ class GPUArray(object):
 
             result = self._new_like_me(dtype=real_dtype)
 
-            func = elementwise.get_imag_kernel(dtype, real_dtype)
-            func.prepared_async_call(self._grid, self._block, None,
-                    self.gpudata, result.gpudata,
-                    self.mem_size,
-                    elementwise.DimArrayCuda(self).gpudata,
-                    elementwise.DimArrayCuda(result).gpudata)
+            noncontig = is_noncontig(self)
+
+            func = elementwise.get_imag_kernel(dtype, real_dtype, use_strides=noncontig)
+            if noncontig:
+                func.prepared_async_call(self._grid, self._block, None,
+                        self.gpudata, result.gpudata,
+                        self.mem_size,
+                        self.device_shape_and_strides,
+                        result.device_shape_and_strides)
+            else:
+                func.prepared_async_call(self._grid, self._block, None,
+                        self.gpudata, result.gpudata,
+                        self.mem_size)
 
             return result
         else:
@@ -960,18 +1071,21 @@ class GPUArray(object):
     def conj(self):
         dtype = self.dtype
         if issubclass(self.dtype.type, np.complexfloating):
-            if not self.flags.forc:
-                raise RuntimeError("only contiguous arrays may "
-                        "be used as arguments to this operation")
 
             result = self._new_like_me()
-
-            func = elementwise.get_conj_kernel(dtype)
-            func.prepared_async_call(self._grid, self._block, None,
-                    self.gpudata, result.gpudata,
-                    self.mem_size,
-                    elementwise.DimArrayCuda(self).gpudata,
-                    elementwise.DimArrayCuda(result).gpudata)
+            noncontig = is_noncontig(self)
+
+            func = elementwise.get_conj_kernel(dtype, use_strides=noncontig)
+            if noncontig:
+                func.prepared_async_call(self._grid, self._block, None,
+                        self.gpudata, result.gpudata,
+                        self.mem_size,
+                        self.device_shape_and_strides,
+                        result.device_shape_and_strides)
+            else:
+                func.prepared_async_call(self._grid, self._block, None,
+                        self.gpudata, result.gpudata,
+                        self.mem_size)
 
             return result
         else:
@@ -1117,8 +1231,7 @@ def arange(*args, **kwargs):
 
     func = elementwise.get_arange_kernel(dtype)
     func.prepared_async_call(result._grid, result._block, kwargs.get("stream"),
-            result.gpudata, start, step, size,
-            elementwise.DimArrayCuda(result).gpudata)
+            result.gpudata, start, step, size)
 
     return result
 
-- 
GitLab


From 692ad423499c85c99e552c805ef8f8402169235e Mon Sep 17 00:00:00 2001
From: Keegan Owsley <keegano@gmail.com>
Date: Thu, 1 Dec 2016 14:35:47 -0800
Subject: [PATCH 03/10] Allow noncontig arrays in ElementwiseKernel.__call__

---
 pycuda/elementwise.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index 12b588d2..8371028e 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -263,6 +263,10 @@ def get_elwise_kernel(arguments, operation,
 
     return func
 
+def is_noncontig(*vecs):
+    r = False
+    for v in vecs: r = r or not v.flags.forc
+    return r
 
 class ElementwiseKernel:
     def __init__(self, arguments, operation,
@@ -273,8 +277,8 @@ class ElementwiseKernel:
             operation=operation, arguments=arguments))
 
     @memoize_method
-    def generate_stride_kernel_and_types(self, use_range):
-        knl, arguments = get_elwise_kernel_and_types(use_range=use_range,
+    def generate_stride_kernel_and_types(self, use_range, use_slices=False):
+        knl, arguments = get_elwise_kernel_and_types(use_range=use_range, use_slices=use_slices,
                 **self.gen_kwargs)
 
         assert [i for i, arg in enumerate(arguments)
@@ -295,9 +299,6 @@ class ElementwiseKernel:
             raise TypeError("invalid keyword arguments specified: "
                     + ", ".join(six.iterkeys(kwargs)))
 
-        invocation_args = []
-        func, arguments = self.generate_stride_kernel_and_types(
-                range_ is not None or slice_ is not None)
 
         for arg, arg_descr in zip(args, arguments):
             if isinstance(arg_descr, VectorArg):
@@ -312,8 +313,15 @@ class ElementwiseKernel:
             else:
                 invocation_args.append(arg)
 
-        for v in vectors:
-            invocations_args.append(v.device_shape_and_strides)
+        noncontig = is_noncontig(*vectors)
+
+        invocation_args = []
+        func, arguments = self.generate_stride_kernel_and_types(
+                range_ is not None or slice_ is not None, use_slices=noncontig)
+
+        if noncontig:
+            for v in vectors:
+                invocations_args.append(v.device_shape_and_strides)
 
         repr_vec = vectors[0]
 
-- 
GitLab


From 0117ad599104c412a4de6217f1bf8063ac3b85bb Mon Sep 17 00:00:00 2001
From: Keegan Owsley <keegano@gmail.com>
Date: Thu, 1 Dec 2016 15:17:45 -0800
Subject: [PATCH 04/10] Fix bugs in astype and ElementwiseKernel.__call__

---
 pycuda/elementwise.py | 13 ++++++-------
 pycuda/gpuarray.py    |  2 +-
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index 8371028e..f11ecbbe 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -277,8 +277,8 @@ class ElementwiseKernel:
             operation=operation, arguments=arguments))
 
     @memoize_method
-    def generate_stride_kernel_and_types(self, use_range, use_slices=False):
-        knl, arguments = get_elwise_kernel_and_types(use_range=use_range, use_slices=use_slices,
+    def generate_stride_kernel_and_types(self, use_range, use_strides=False):
+        knl, arguments = get_elwise_kernel_and_types(use_range=use_range, use_strides=use_strides,
                 **self.gen_kwargs)
 
         assert [i for i, arg in enumerate(arguments)
@@ -299,9 +299,9 @@ class ElementwiseKernel:
             raise TypeError("invalid keyword arguments specified: "
                     + ", ".join(six.iterkeys(kwargs)))
 
-
-        for arg, arg_descr in zip(args, arguments):
-            if isinstance(arg_descr, VectorArg):
+        invocation_args = []
+        for arg in args:
+            if hasattr(arg, 'flags'):
                 # if not arg.flags.forc:
                 #     raise RuntimeError("elementwise kernel cannot "
                 #             "deal with non-contiguous arrays")
@@ -315,9 +315,8 @@ class ElementwiseKernel:
 
         noncontig = is_noncontig(*vectors)
 
-        invocation_args = []
         func, arguments = self.generate_stride_kernel_and_types(
-                range_ is not None or slice_ is not None, use_slices=noncontig)
+                range_ is not None or slice_ is not None, use_strides=noncontig)
 
         if noncontig:
             for v in vectors:
diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index d944271b..a54ccab3 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -782,7 +782,7 @@ class GPUArray(object):
         if dtype == self.dtype:
             return self.copy()
 
-        noncontig = is_noncontig(self, result)
+        noncontig = is_noncontig(self)
 
         result = self._new_like_me(dtype=dtype)
 
-- 
GitLab


From 4644b8abb2ec55b18fe1a6740012eab6b4f7ca39 Mon Sep 17 00:00:00 2001
From: Keegan Owsley <keegano@gmail.com>
Date: Fri, 2 Dec 2016 11:13:16 -0800
Subject: [PATCH 05/10] Allow noncontiguous arrays for cumath functions

---
 pycuda/cumath.py | 100 +++++++++++++++++++++++++++++++----------------
 1 file changed, 67 insertions(+), 33 deletions(-)

diff --git a/pycuda/cumath.py b/pycuda/cumath.py
index dbae5bd6..1de0824e 100644
--- a/pycuda/cumath.py
+++ b/pycuda/cumath.py
@@ -6,6 +6,11 @@ import warnings
 from pycuda.driver import Stream
 
 
+def is_noncontig(*vecs):
+    r = False
+    for v in vecs: r = r or not v.flags.forc
+    return r
+
 def _make_unary_array_func(name):
     def f(array, stream_or_out=None, **kwargs):
 
@@ -29,10 +34,6 @@ def _make_unary_array_func(name):
         else:
             func_name = name
 
-        if not array.flags.forc:
-            raise RuntimeError("only contiguous arrays may "
-                    "be used as arguments to this operation")
-
         if out is None:
             out = array._new_like_me()
         else:
@@ -40,9 +41,18 @@ def _make_unary_array_func(name):
             assert out.strides == array.strides
             assert out.shape == array.shape
 
-        func = elementwise.get_unary_func_kernel(func_name, array.dtype)
-        func.prepared_async_call(array._grid, array._block, stream,
-                array.gpudata, out.gpudata, array.mem_size)
+        noncontig = is_noncontig(array, out)
+
+        func = elementwise.get_unary_func_kernel(func_name, array.dtype, use_strides=noncontig)
+
+        if noncontig:
+            func.prepared_async_call(array._grid, array._block, stream,
+                    array.gpudata, out.gpudata, array.mem_size,
+                    array.device_shape_and_strides,
+                    out.device_shape_and_strides)
+        else:
+            func.prepared_async_call(array._grid, array._block, stream,
+                    array.gpudata, out.gpudata, array.mem_size)
 
         return out
     return f
@@ -71,13 +81,18 @@ def fmod(arg, mod, stream=None):
     for each element in `arg` and `mod`."""
     result = gpuarray.GPUArray(arg.shape, arg.dtype)
 
-    if not arg.flags.forc or not mod.flags.forc:
-        raise RuntimeError("only contiguous arrays may "
-                "be used as arguments to this operation")
+    noncontig = is_noncontig(arg, mod, result)
 
-    func = elementwise.get_fmod_kernel()
-    func.prepared_async_call(arg._grid, arg._block, stream,
-            arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size)
+    func = elementwise.get_fmod_kernel(use_strides=noncontig)
+    if noncontig:
+        func.prepared_async_call(arg._grid, arg._block, stream,
+                arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size,
+                arg.device_shape_and_strides,
+                mod.device_shape_and_strides,
+                result.device_shape_and_strides)
+    else:
+        func.prepared_async_call(arg._grid, arg._block, stream,
+                arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size)
 
     return result
 
@@ -85,16 +100,22 @@ def frexp(arg, stream=None):
     """Return a tuple `(significands, exponents)` such that
     `arg == significand * 2**exponent`.
     """
-    if not arg.flags.forc:
-        raise RuntimeError("only contiguous arrays may "
-                "be used as arguments to this operation")
 
     sig = gpuarray.GPUArray(arg.shape, arg.dtype)
     expt = gpuarray.GPUArray(arg.shape, arg.dtype)
 
-    func = elementwise.get_frexp_kernel()
-    func.prepared_async_call(arg._grid, arg._block, stream,
-            arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size)
+    noncontig = is_noncontig(arg)
+
+    func = elementwise.get_frexp_kernel(use_strides=noncontig)
+    if noncontig:
+        func.prepared_async_call(arg._grid, arg._block, stream,
+                arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size,
+                arg.device_shape_and_strides,
+                sig.device_shape_and_strides,
+                expt.device_shape_and_strides)
+    else:
+        func.prepared_async_call(arg._grid, arg._block, stream,
+                arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size)
 
     return sig, expt
 
@@ -103,16 +124,23 @@ def ldexp(significand, exponent, stream=None):
     entries of `significand` and `exponent`, paired together as
     `result = significand * 2**exponent`.
     """
-    if not significand.flags.forc or not exponent.flags.forc:
-        raise RuntimeError("only contiguous arrays may "
-                "be used as arguments to this operation")
+
+    noncontig = is_noncontig(significand, exponent)
 
     result = gpuarray.GPUArray(significand.shape, significand.dtype)
 
-    func = elementwise.get_ldexp_kernel()
-    func.prepared_async_call(significand._grid, significand._block, stream,
-            significand.gpudata, exponent.gpudata, result.gpudata,
-            significand.mem_size)
+    func = elementwise.get_ldexp_kernel(use_strides=noncontig)
+    if noncontig:
+        func.prepared_async_call(significand._grid, significand._block, stream,
+                significand.gpudata, exponent.gpudata, result.gpudata,
+                significand.mem_size,
+                significand.device_shape_and_strides,
+                exponent.device_shape_and_strides,
+                result.device_shape_and_strides)
+    else:
+        func.prepared_async_call(significand._grid, significand._block, stream,
+                significand.gpudata, exponent.gpudata, result.gpudata,
+                significand.mem_size)
 
     return result
 
@@ -120,16 +148,22 @@ def modf(arg, stream=None):
     """Return a tuple `(fracpart, intpart)` of arrays containing the
     integer and fractional parts of `arg`.
     """
-    if not arg.flags.forc:
-        raise RuntimeError("only contiguous arrays may "
-                "be used as arguments to this operation")
+    noncontig = is_noncontig(arg)
 
     intpart = gpuarray.GPUArray(arg.shape, arg.dtype)
     fracpart = gpuarray.GPUArray(arg.shape, arg.dtype)
 
-    func = elementwise.get_modf_kernel()
-    func.prepared_async_call(arg._grid, arg._block, stream,
-            arg.gpudata, intpart.gpudata, fracpart.gpudata,
-            arg.mem_size)
+    func = elementwise.get_modf_kernel(use_strides=noncontig)
+    if noncontig:
+        func.prepared_async_call(arg._grid, arg._block, stream,
+                arg.gpudata, intpart.gpudata, fracpart.gpudata,
+                arg.mem_size,
+                arg.device_shape_and_strides,
+                intpart.device_shape_and_strides,
+                fracpart.device_shape_and_strides)
+    else:
+        func.prepared_async_call(arg._grid, arg._block, stream,
+                arg.gpudata, intpart.gpudata, fracpart.gpudata,
+                arg.mem_size)
 
     return fracpart, intpart
-- 
GitLab


From d4e4ef1102390ca6b6ca9011a9fd9c71e7f1d61b Mon Sep 17 00:00:00 2001
From: Keegan Owsley <keegano@gmail.com>
Date: Mon, 5 Dec 2016 10:55:03 -0800
Subject: [PATCH 06/10] Add flag for more robust regex

---
 pycuda/elementwise.py | 46 ++++++++++++++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index f11ecbbe..12637056 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -39,7 +39,13 @@ from pytools import memoize_method
 
 
 import struct
-import re
+
+recursive_match_outer_brackets = False
+
+if recursive_match_outer_brackets:
+    import regex as re
+else:
+    import re
 
 
 class _DimHolder:
@@ -53,18 +59,28 @@ def get_elwise_module_noncontig(arguments, operation,
     vecargs = tuple([a for a in arguments if isinstance(a, VectorArg)])
     names = [V.name for V in vecargs]
 
-    def fix_indexing(t):
-        try:
-            n = names.index(t.group(1))
-        except NameError:
-            return t.group(0)
-        return '{}[i2m({},*__dim{})]'.format(t.group(1),t.group(2),n)
+    if recursive_match_outer_brackets:
+        def fix_indexing(t):
+            try:
+                n = names.index(t.group(1))
+            except NameError:
+                return t.group(0)
+            second_grp = regex.sub(r, fix_indexing, t.group(2))
+
+            return '{}[i2m({},*__dim{})]'.format(t.group(1),second_grp,n)
+
+        regex = "([a-zA-Z_][a-zA-Z0-9_]*)\[((?:[^\[\](?:)]|(?R))*)\]"
+        operation = re.sub(regex, fix_indexing, operation)
+    else:
+        def fix_indexing(t):
+            try:
+                n = names.index(t.group(1))
+            except NameError:
+                return t.group(0)
+            return '{}[i2m({},*__dim{})]'.format(t.group(1),t.group(2),n)
 
-    # regex = "([a-zA-Z_][a-zA-Z0-9_]*)\[((?:[^\[\](?:)]|(?R))*)\]"
-    # Above works recursively to match outer brackets, we can switch if we deem it necessary
-    # Requires regex module instead of re
-    regex = "([a-zA-Z_][a-zA-Z0-9_]*)\[(.+?)\]"
-    operation = re.sub(regex, fix_indexing, operation)
+        regex = "([a-zA-Z_][a-zA-Z0-9_]*)\[(.+?)\]"
+        operation = re.sub(regex, fix_indexing, operation)
 
     dimdef = '''
     typedef struct
@@ -77,10 +93,12 @@ def get_elwise_module_noncontig(arguments, operation,
     {
         unsigned m = 0;
         unsigned j = i;
+        unsigned l = 0;
         for(int k = 0; k < %(max_dims)i; k++)
         {
-            m += d.stride[k] * (j%%d.n[k]);
-            j = j / d.n[k];
+            l = j / d.n[k];
+            m += d.stride[k] * (j-l*d.n[k]);
+            j = l;
         }
         return m;
     }
-- 
GitLab


From af33cddbd83e6af789cfe6670193cdc75d6cece5 Mon Sep 17 00:00:00 2001
From: Keegan Owsley <keegano@gmail.com>
Date: Mon, 5 Dec 2016 11:20:51 -0800
Subject: [PATCH 07/10] Use python2-friendly list copying

---
 pycuda/elementwise.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index 12637056..8d11ef79 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -258,7 +258,7 @@ def get_elwise_kernel_and_types(arguments, operation,
     mod = module_builder(arguments, operation, name,
             keep, options, **kwargs)
 
-    oldargs = arguments.copy()
+    oldargs = list(arguments)
     if use_strides:
         vectors = [a for a in arguments if isinstance(a, VectorArg)]
 
-- 
GitLab


From 3c89c30136b47f1a4c6e2ac7dd89663265d68dd5 Mon Sep 17 00:00:00 2001
From: Keegan Owsley <keegano@gmail.com>
Date: Mon, 5 Dec 2016 11:48:31 -0800
Subject: [PATCH 08/10] Fix missing else block in gpuarray.real

---
 pycuda/gpuarray.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index a54ccab3..8d364005 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -1032,6 +1032,7 @@ class GPUArray(object):
                         self.mem_size,
                         self.device_shape_and_strides,
                         result.device_shape_and_strides)
+            else:
                 func.prepared_async_call(self._grid, self._block, None,
                         self.gpudata, result.gpudata,
                         self.mem_size)
-- 
GitLab


From 910c7ecfb5bdc2309ea1183fc3734a650ffbc0ae Mon Sep 17 00:00:00 2001
From: Keegan Owsley <keegano@gmail.com>
Date: Mon, 5 Dec 2016 12:01:07 -0800
Subject: [PATCH 09/10] Outer bracket matching by default

Fix typos in outer bracket matching code and use it by default
---
 pycuda/elementwise.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index 8d11ef79..37945b3c 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -40,7 +40,7 @@ from pytools import memoize_method
 
 import struct
 
-recursive_match_outer_brackets = False
+recursive_match_outer_brackets = True
 
 if recursive_match_outer_brackets:
     import regex as re
@@ -60,16 +60,16 @@ def get_elwise_module_noncontig(arguments, operation,
     names = [V.name for V in vecargs]
 
     if recursive_match_outer_brackets:
+        regex = "([a-zA-Z_][a-zA-Z0-9_]*)\[((?:[^\[\](?:)]|(?R))*)\]"
         def fix_indexing(t):
             try:
                 n = names.index(t.group(1))
             except NameError:
                 return t.group(0)
-            second_grp = regex.sub(r, fix_indexing, t.group(2))
+            second_grp = re.sub(regex, fix_indexing, t.group(2))
 
             return '{}[i2m({},*__dim{})]'.format(t.group(1),second_grp,n)
 
-        regex = "([a-zA-Z_][a-zA-Z0-9_]*)\[((?:[^\[\](?:)]|(?R))*)\]"
         operation = re.sub(regex, fix_indexing, operation)
     else:
         def fix_indexing(t):
-- 
GitLab


From 4a098e85817ff0a28b2f724d1931026354cf6166 Mon Sep 17 00:00:00 2001
From: Keegan Owsley <keegano@gmail.com>
Date: Mon, 5 Dec 2016 12:10:49 -0800
Subject: [PATCH 10/10] Use simpler regex by default (passes the tests)

---
 pycuda/elementwise.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index 37945b3c..2da202b8 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -40,7 +40,7 @@ from pytools import memoize_method
 
 import struct
 
-recursive_match_outer_brackets = True
+recursive_match_outer_brackets = False
 
 if recursive_match_outer_brackets:
     import regex as re
-- 
GitLab