diff --git a/pycuda/cumath.py b/pycuda/cumath.py index dbae5bd62a8fdc2b4b6899b5c2f2a3267e5a9dd8..1de0824effbbdd4a6e47465a0c1704365aaa9d88 100644 --- a/pycuda/cumath.py +++ b/pycuda/cumath.py @@ -6,6 +6,11 @@ import warnings from pycuda.driver import Stream +def is_noncontig(*vecs): + r = False + for v in vecs: r = r or not v.flags.forc + return r + def _make_unary_array_func(name): def f(array, stream_or_out=None, **kwargs): @@ -29,10 +34,6 @@ def _make_unary_array_func(name): else: func_name = name - if not array.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - if out is None: out = array._new_like_me() else: @@ -40,9 +41,18 @@ def _make_unary_array_func(name): assert out.strides == array.strides assert out.shape == array.shape - func = elementwise.get_unary_func_kernel(func_name, array.dtype) - func.prepared_async_call(array._grid, array._block, stream, - array.gpudata, out.gpudata, array.mem_size) + noncontig = is_noncontig(array, out) + + func = elementwise.get_unary_func_kernel(func_name, array.dtype, use_strides=noncontig) + + if noncontig: + func.prepared_async_call(array._grid, array._block, stream, + array.gpudata, out.gpudata, array.mem_size, + array.device_shape_and_strides, + out.device_shape_and_strides) + else: + func.prepared_async_call(array._grid, array._block, stream, + array.gpudata, out.gpudata, array.mem_size) return out return f @@ -71,13 +81,18 @@ def fmod(arg, mod, stream=None): for each element in `arg` and `mod`.""" result = gpuarray.GPUArray(arg.shape, arg.dtype) - if not arg.flags.forc or not mod.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + noncontig = is_noncontig(arg, mod, result) - func = elementwise.get_fmod_kernel() - func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size) + func = elementwise.get_fmod_kernel(use_strides=noncontig) + if noncontig: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size, + arg.device_shape_and_strides, + mod.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size) return result @@ -85,16 +100,22 @@ def frexp(arg, stream=None): """Return a tuple `(significands, exponents)` such that `arg == significand * 2**exponent`. """ - if not arg.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") sig = gpuarray.GPUArray(arg.shape, arg.dtype) expt = gpuarray.GPUArray(arg.shape, arg.dtype) - func = elementwise.get_frexp_kernel() - func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size) + noncontig = is_noncontig(arg) + + func = elementwise.get_frexp_kernel(use_strides=noncontig) + if noncontig: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size, + arg.device_shape_and_strides, + sig.device_shape_and_strides, + expt.device_shape_and_strides) + else: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size) return sig, expt @@ -103,16 +124,23 @@ def ldexp(significand, exponent, stream=None): entries of `significand` and `exponent`, paired together as `result = significand * 2**exponent`. 
""" - if not significand.flags.forc or not exponent.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + + noncontig = is_noncontig(significand, exponent) result = gpuarray.GPUArray(significand.shape, significand.dtype) - func = elementwise.get_ldexp_kernel() - func.prepared_async_call(significand._grid, significand._block, stream, - significand.gpudata, exponent.gpudata, result.gpudata, - significand.mem_size) + func = elementwise.get_ldexp_kernel(use_strides=noncontig) + if noncontig: + func.prepared_async_call(significand._grid, significand._block, stream, + significand.gpudata, exponent.gpudata, result.gpudata, + significand.mem_size, + significand.device_shape_and_strides, + exponent.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(significand._grid, significand._block, stream, + significand.gpudata, exponent.gpudata, result.gpudata, + significand.mem_size) return result @@ -120,16 +148,22 @@ def modf(arg, stream=None): """Return a tuple `(fracpart, intpart)` of arrays containing the integer and fractional parts of `arg`. """ - if not arg.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + noncontig = is_noncontig(arg) intpart = gpuarray.GPUArray(arg.shape, arg.dtype) fracpart = gpuarray.GPUArray(arg.shape, arg.dtype) - func = elementwise.get_modf_kernel() - func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, intpart.gpudata, fracpart.gpudata, - arg.mem_size) + func = elementwise.get_modf_kernel(use_strides=noncontig) + if noncontig: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, intpart.gpudata, fracpart.gpudata, + arg.mem_size, + arg.device_shape_and_strides, + intpart.device_shape_and_strides, + fracpart.device_shape_and_strides) + else: + func.prepared_async_call(arg._grid, arg._block, stream, + arg.gpudata, intpart.gpudata, fracpart.gpudata, + arg.mem_size) return fracpart, intpart diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index feab0a6be89d54d72e1b6ec8759838910063a13d..2da202b8f51ee1e2d1ca84cd33f4d96b2f8c2ba2 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -34,10 +34,119 @@ OTHER DEALINGS IN THE SOFTWARE. 
 from pycuda.tools import context_dependent_memoize
 import numpy as np
-from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg
+from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg, Argument
 from pytools import memoize_method
+import struct
+
+recursive_match_outer_brackets = False
+
+if recursive_match_outer_brackets:
+    # the third-party "regex" module is needed for recursive patterns ((?R))
+    import regex as re
+else:
+    import re
+
+
+class _DimHolder:
+    # stand-in argument descriptor for the trailing "dim *" kernel parameters
+    struct_char = 'P'
+
+
+def get_elwise_module_noncontig(arguments, operation,
+        name="kernel", keep=False, options=None,
+        preamble="", loop_prep="", after_loop="", max_dims=2):
+    from pycuda.compiler import SourceModule
+
+    vecargs = tuple(a for a in arguments if isinstance(a, VectorArg))
+    names = [v.name for v in vecargs]
+
+    if recursive_match_outer_brackets:
+        regex = r"([a-zA-Z_][a-zA-Z0-9_]*)\[((?:[^\[\](?:)]|(?R))*)\]"
+
+        def fix_indexing(t):
+            try:
+                n = names.index(t.group(1))
+            except ValueError:
+                # not one of our vector arguments: leave the subscript alone
+                return t.group(0)
+            second_grp = re.sub(regex, fix_indexing, t.group(2))
+            return '{}[i2m({},*__dim{})]'.format(t.group(1), second_grp, n)
+
+        operation = re.sub(regex, fix_indexing, operation)
+    else:
+        def fix_indexing(t):
+            try:
+                n = names.index(t.group(1))
+            except ValueError:
+                # not one of our vector arguments: leave the subscript alone
+                return t.group(0)
+            return '{}[i2m({},*__dim{})]'.format(t.group(1), t.group(2), n)
+
+        regex = r"([a-zA-Z_][a-zA-Z0-9_]*)\[(.+?)\]"
+        operation = re.sub(regex, fix_indexing, operation)
+
+    dimdef = '''
+        typedef struct
+        {
+            unsigned n[%(max_dims)i];
+            long stride[%(max_dims)i];
+        } dim;
+
+        __device__ unsigned i2m(unsigned i, dim d)
+        {
+            unsigned m = 0;
+            unsigned j = i;
+            unsigned l = 0;
+            for(int k = 0; k < %(max_dims)i; k++)
+            {
+                l = j / d.n[k];
+                m += d.stride[k] * (j-l*d.n[k]);
+                j = l;
+            }
+            return m;
+        }
+        ''' % {"max_dims": max_dims}
+
+    args = [arg.declarator() for arg in arguments]
+    args.extend('dim *__dim{}'.format(i) for i in range(len(vecargs)))
+
+    src = """
+        #include <pycuda-complex.hpp>
+
+        %(dimdef)s
+
+        %(preamble)s
+
+        __global__ void %(name)s(%(arguments)s)
+        {
+          unsigned tid = threadIdx.x;
+          unsigned total_threads = gridDim.x*blockDim.x;
+          unsigned cta_start = blockDim.x*blockIdx.x;
+          unsigned i;
+
+          %(loop_prep)s;
+
+          for (i = cta_start + tid; i < n; i += total_threads)
+          {
+            %(operation)s;
+          }
+
+          %(after_loop)s;
+        }
+        """ % {
+                "arguments": ", ".join(args),
+                "operation": operation,
+                "name": name,
+                "preamble": preamble,
+                "loop_prep": loop_prep,
+                "after_loop": after_loop,
+                "dimdef": dimdef,
+                }
+
+    return SourceModule(src,
+            options=options, keep=keep)
+

 def get_elwise_module(arguments, operation,
         name="kernel", keep=False, options=None,
         preamble="", loop_prep="", after_loop=""):
@@ -124,7 +233,7 @@ def get_elwise_range_module(arguments, operation,


 def get_elwise_kernel_and_types(arguments, operation,
-        name="kernel", keep=False, options=None, use_range=False, **kwargs):
+        name="kernel", keep=False, options=None, use_range=False, use_strides=False, **kwargs):
     if isinstance(arguments, str):
         from pycuda.tools import parse_c_arg
         arguments = [parse_c_arg(arg) for arg in arguments.split(",")]
@@ -138,7 +247,10 @@ def get_elwise_kernel_and_types(arguments, operation,
     else:
         arguments.append(ScalarArg(np.uintp, "n"))

-    if use_range:
+    if use_strides:
+        module_builder = get_elwise_module_noncontig
+    elif use_range:
         module_builder = get_elwise_range_module
     else:
         module_builder = get_elwise_module
@@ -146,10 +258,17 @@

     mod = module_builder(arguments, operation, name,
             keep, options, **kwargs)

+    oldargs = list(arguments)
+    if use_strides:
+        # one trailing "dim *" argument per vector, in declaration order
+        vectors = [a for a in arguments if isinstance(a, VectorArg)]
+        for i, v in enumerate(vectors):
+            arguments.append(_DimHolder())
     func = mod.get_function(name)
     func.prepare("".join(arg.struct_char for arg in arguments))
+    func.stride_support = use_strides

-    return func, arguments
+    return func, oldargs


 def get_elwise_kernel(arguments, operation,
@@ -162,6 +281,10 @@

     return func

+
+def is_noncontig(*vecs):
+    r = False
+    for v in vecs: r = r or not v.flags.forc
+    return r

 class ElementwiseKernel:
     def __init__(self, arguments, operation,
@@ -172,8 +295,8 @@
                 operation=operation, arguments=arguments))

     @memoize_method
-    def generate_stride_kernel_and_types(self, use_range):
-        knl, arguments = get_elwise_kernel_and_types(use_range=use_range,
+    def generate_stride_kernel_and_types(self, use_range, use_strides=False):
+        knl, arguments = get_elwise_kernel_and_types(use_range=use_range, use_strides=use_strides,
                 **self.gen_kwargs)

         assert [i for i, arg in enumerate(arguments)
@@ -195,20 +318,28 @@
                     + ", ".join(six.iterkeys(kwargs)))

         invocation_args = []

-        func, arguments = self.generate_stride_kernel_and_types(
-                range_ is not None or slice_ is not None)
-
-        for arg, arg_descr in zip(args, arguments):
-            if isinstance(arg_descr, VectorArg):
-                if not arg.flags.forc:
-                    raise RuntimeError("elementwise kernel cannot "
-                            "deal with non-contiguous arrays")
-
+        for arg in args:
+            # GPU arrays are recognized by their device pointer; numpy
+            # scalars also carry a .flags attribute, so testing for
+            # .gpudata is the safer discriminator here
+            if hasattr(arg, 'gpudata'):
                 vectors.append(arg)
                 invocation_args.append(arg.gpudata)
             else:
                 invocation_args.append(arg)

+        noncontig = is_noncontig(*vectors)
+
+        func, arguments = self.generate_stride_kernel_and_types(
+                range_ is not None or slice_ is not None, use_strides=noncontig)
+
+        if noncontig:
+            for v in vectors:
+                invocation_args.append(v.device_shape_and_strides)
+
         repr_vec = vectors[0]

         if slice_ is not None:
@@ -341,14 +472,14 @@ def get_put_kernel(dtype, idx_dtype, vec_count=1):


 @context_dependent_memoize
-def get_copy_kernel(dtype_dest, dtype_src):
+def get_copy_kernel(dtype_dest, dtype_src, use_strides=False):
     return get_elwise_kernel(
             "%(tp_dest)s *dest, %(tp_src)s *src" % {
                 "tp_dest": dtype_to_ctype(dtype_dest),
                 "tp_src": dtype_to_ctype(dtype_src),
                 },
             "dest[i] = src[i]",
-            "copy")
+            "copy", use_strides=use_strides)


 @context_dependent_memoize
@@ -399,7 +530,7 @@ def get_linear_combination_kernel(summand_descriptors,


 @context_dependent_memoize
-def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z):
+def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z, use_strides=False):
     return get_elwise_kernel(
             "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z" % {
                 "tp_x": dtype_to_ctype(dtype_x),
@@ -407,22 +538,22 @@
                 "tp_z": dtype_to_ctype(dtype_z),
                 },
             "z[i] = a*x[i] + b*y[i]",
-            "axpbyz")
+            "axpbyz", use_strides=use_strides)


 @context_dependent_memoize
-def get_axpbz_kernel(dtype_x, dtype_z):
+def get_axpbz_kernel(dtype_x, dtype_z, use_strides=False):
     return get_elwise_kernel(
             "%(tp_z)s a, %(tp_x)s *x,%(tp_z)s b, %(tp_z)s *z" % {
                 "tp_x": dtype_to_ctype(dtype_x),
                 "tp_z": dtype_to_ctype(dtype_z)
                 },
             "z[i] = a * x[i] + b",
-            "axpb")
+            "axpb", use_strides=use_strides)


 @context_dependent_memoize
-def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator):
+def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator, use_strides=False):
     return get_elwise_kernel(
             "%(tp_x)s *x, %(tp_y)s
*y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), @@ -430,22 +561,22 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = x[i] %s y[i]" % operator, - "multiply") + "multiply", use_strides=use_strides) @context_dependent_memoize -def get_rdivide_elwise_kernel(dtype_x, dtype_z): +def get_rdivide_elwise_kernel(dtype_x, dtype_z, use_strides=False): return get_elwise_kernel( "%(tp_x)s *x, %(tp_z)s y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = y / x[i]", - "divide_r") + "divide_r", use_strides=use_strides) @context_dependent_memoize -def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z): +def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z, use_strides=False): return get_elwise_kernel( "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), @@ -453,11 +584,11 @@ def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s(x[i], y[i])" % func, - func+"_kernel") + func+"_kernel", use_strides=use_strides) @context_dependent_memoize -def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z): +def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z, use_strides=False): return get_elwise_kernel( "%(tp_x)s *x, %(tp_y)s y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), @@ -465,10 +596,10 @@ def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s(x[i], y)" % func, - func+"_kernel") + func+"_kernel", use_strides=use_strides) -def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar): +def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar, use_strides=False): if np.float64 not in [dtype_x, dtype_y]: func = func + "f" @@ -477,75 +608,75 @@ def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar): func = "f"+func if use_scalar: - return get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z) + return get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z, use_strides=use_strides) else: - return get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z) + return get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z, use_strides=use_strides) @context_dependent_memoize -def get_fill_kernel(dtype): +def get_fill_kernel(dtype, use_strides=False): return get_elwise_kernel( "%(tp)s a, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, "z[i] = a", - "fill") + "fill", use_strides=use_strides) @context_dependent_memoize -def get_reverse_kernel(dtype): +def get_reverse_kernel(dtype, use_strides=False): return get_elwise_kernel( "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, "z[i] = y[n-1-i]", - "reverse") + "reverse", use_strides=use_strides) @context_dependent_memoize -def get_real_kernel(dtype, real_dtype): +def get_real_kernel(dtype, real_dtype, use_strides=False): return get_elwise_kernel( "%(tp)s *y, %(real_tp)s *z" % { "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, "z[i] = real(y[i])", - "real") + "real", use_strides=use_strides) @context_dependent_memoize -def get_imag_kernel(dtype, real_dtype): +def get_imag_kernel(dtype, real_dtype, use_strides=False): return get_elwise_kernel( "%(tp)s *y, %(real_tp)s *z" % { "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, "z[i] = imag(y[i])", - "imag") + "imag", use_strides=use_strides) @context_dependent_memoize -def get_conj_kernel(dtype): +def get_conj_kernel(dtype, use_strides=False): 
return get_elwise_kernel( "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, "z[i] = pycuda::conj(y[i])", - "conj") + "conj", use_strides=use_strides) @context_dependent_memoize -def get_arange_kernel(dtype): +def get_arange_kernel(dtype, use_strides=False): return get_elwise_kernel( "%(tp)s *z, %(tp)s start, %(tp)s step" % { "tp": dtype_to_ctype(dtype), }, "z[i] = start + i*step", - "arange") + "arange", use_strides=use_strides) @context_dependent_memoize -def get_pow_kernel(dtype): +def get_pow_kernel(dtype, use_strides=False): if dtype == np.float32: func = "powf" else: @@ -556,11 +687,11 @@ def get_pow_kernel(dtype): "tp": dtype_to_ctype(dtype), }, "z[i] = %s(y[i], value)" % func, - "pow_method") + "pow_method", use_strides=use_strides) @context_dependent_memoize -def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): +def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, use_strides=False): if np.float64 in [dtype_x, dtype_y]: func = "pow" else: @@ -573,27 +704,27 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s(x[i], y[i])" % func, - "pow_method") + "pow_method", use_strides=use_strides) @context_dependent_memoize -def get_fmod_kernel(): +def get_fmod_kernel(use_strides=False): return get_elwise_kernel( "float *arg, float *mod, float *z", "z[i] = fmod(arg[i], mod[i])", - "fmod_kernel") + "fmod_kernel", use_strides=use_strides) @context_dependent_memoize -def get_modf_kernel(): +def get_modf_kernel(use_strides=False): return get_elwise_kernel( "float *x, float *intpart ,float *fracpart", "fracpart[i] = modf(x[i], &intpart[i])", - "modf_kernel") + "modf_kernel", use_strides=use_strides) @context_dependent_memoize -def get_frexp_kernel(): +def get_frexp_kernel(use_strides=False): return get_elwise_kernel( "float *x, float *significand, float *exponent", """ @@ -601,19 +732,19 @@ def get_frexp_kernel(): significand[i] = frexp(x[i], &expt); exponent[i] = expt; """, - "frexp_kernel") + "frexp_kernel", use_strides=use_strides) @context_dependent_memoize -def get_ldexp_kernel(): +def get_ldexp_kernel(use_strides=False): return get_elwise_kernel( "float *sig, float *expt, float *z", "z[i] = ldexp(sig[i], int(expt[i]))", - "ldexp_kernel") + "ldexp_kernel", use_strides=use_strides) @context_dependent_memoize -def get_unary_func_kernel(func_name, in_dtype, out_dtype=None): +def get_unary_func_kernel(func_name, in_dtype, out_dtype=None, use_strides=False): if out_dtype is None: out_dtype = in_dtype @@ -623,11 +754,11 @@ def get_unary_func_kernel(func_name, in_dtype, out_dtype=None): "tp_out": dtype_to_ctype(out_dtype), }, "z[i] = %s(y[i])" % func_name, - "%s_kernel" % func_name) + "%s_kernel" % func_name, use_strides=use_strides) @context_dependent_memoize -def get_if_positive_kernel(crit_dtype, dtype): +def get_if_positive_kernel(crit_dtype, dtype, use_strides=False): return get_elwise_kernel([ VectorArg(crit_dtype, "crit"), VectorArg(dtype, "then_"), @@ -635,11 +766,11 @@ def get_if_positive_kernel(crit_dtype, dtype): VectorArg(dtype, "result"), ], "result[i] = crit[i] > 0 ? 
then_[i] : else_[i]", - "if_positive") + "if_positive", use_strides=use_strides) @context_dependent_memoize -def get_scalar_op_kernel(dtype_x, dtype_y, operator): +def get_scalar_op_kernel(dtype_x, dtype_y, operator, use_strides=False): return get_elwise_kernel( "%(tp_x)s *x, %(tp_a)s a, %(tp_y)s *y" % { "tp_x": dtype_to_ctype(dtype_x), @@ -647,4 +778,4 @@ def get_scalar_op_kernel(dtype_x, dtype_y, operator): "tp_a": dtype_to_ctype(dtype_x), }, "y[i] = x[i] %s a" % operator, - "scalarop_kernel") + "scalarop_kernel", use_strides=use_strides) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 896b2fcab6ce9ff89bd01155292a39f61d374bf0..8d36400557608d990bdae778a8a03c09500dd3be 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -14,6 +14,7 @@ from pycuda.characterize import has_double_support import six from six.moves import range, zip, reduce import numbers +import struct def _get_common_dtype(obj1, obj2): @@ -118,37 +119,59 @@ def splay(n, dev=None): def _make_binary_op(operator): def func(self, other): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + # if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") if isinstance(other, GPUArray): assert self.shape == other.shape - if not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + # if not other.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") + + noncontig = is_noncontig(self, other) result = self._new_like_me() func = elementwise.get_binary_op_kernel( self.dtype, other.dtype, result.dtype, - operator) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, - self.mem_size) + operator, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + other.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other.gpudata, result.gpudata, + self.mem_size) return result else: # scalar operator result = self._new_like_me() + noncontig = is_noncontig(self) func = elementwise.get_scalar_op_kernel( - self.dtype, result.dtype, operator) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other, result.gpudata, - self.mem_size) + self.dtype, result.dtype, operator, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other, result.gpudata, + self.mem_size) return result return func +def is_noncontig(*vecs): + r = False + for v in vecs: r = r or not v.flags.forc + return r class GPUArray(object): """A GPUArray is used to do array-based calculation on the GPU. 
@@ -158,6 +181,7 @@ class GPUArray(object):
     """
     __array_priority__ = 100
+    __max_dims__ = 2

     def __init__(self, shape, dtype, allocator=drv.mem_alloc,
             base=None, gpudata=None, strides=None, order="C"):
@@ -227,6 +251,22 @@ class GPUArray(object):
     def flags(self):
         return _ArrayFlags(self)

+    @property
+    def device_shape_and_strides(self):
+        # one unsigned (shape) and one long (stride) per dimension; size the
+        # allocation with calcsize so alignment padding is accounted for
+        fmt = 'I'*self.__max_dims__ + 'l'*self.__max_dims__
+        sar = drv.mem_alloc(struct.calcsize(fmt))
+        newstrides = tuple(v // self.dtype.itemsize for v in self.strides)
+        shape = self.shape[::-1]
+        newstrides = newstrides[::-1]
+        if len(shape) < self.__max_dims__:
+            shape = shape + (1,)*(self.__max_dims__-len(shape))
+            newstrides = newstrides + (0,)*(self.__max_dims__-len(newstrides))
+        data = shape + newstrides
+        drv.memcpy_htod(int(sar), struct.pack(fmt, *data))
+        self.__sar = sar  # keep a reference so the buffer outlives the launch
+        return sar
+
     def set(self, ary, async=False, stream=None):
         if ary.size != self.size:
             raise ValueError("ary and self must be the same size")
@@ -297,47 +337,74 @@ class GPUArray(object):
         """Compute ``out = selffac * self + otherfac*other``,
         where `other` is a vector.."""
         assert self.shape == other.shape
-        if not self.flags.forc or not other.flags.forc:
-            raise RuntimeError("only contiguous arrays may "
-                    "be used as arguments to this operation")

-        func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype)
+        noncontig = is_noncontig(self, other, out)

-        if add_timer is not None:
-            add_timer(3*self.size, func.prepared_timed_call(self._grid,
-                selffac, self.gpudata, otherfac, other.gpudata,
-                out.gpudata, self.mem_size))
+        func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype, use_strides=noncontig)
+
+        if noncontig:
+            if add_timer is not None:
+                add_timer(3*self.size, func.prepared_timed_call(
+                    self._grid, self._block,
+                    selffac, self.gpudata, otherfac, other.gpudata,
+                    out.gpudata, self.mem_size,
+                    self.device_shape_and_strides,
+                    other.device_shape_and_strides,
+                    out.device_shape_and_strides))
+            else:
+                func.prepared_async_call(self._grid, self._block, stream,
+                    selffac, self.gpudata, otherfac, other.gpudata,
+                    out.gpudata, self.mem_size,
+                    self.device_shape_and_strides,
+                    other.device_shape_and_strides,
+                    out.device_shape_and_strides)
         else:
-            func.prepared_async_call(self._grid, self._block, stream,
+            if add_timer is not None:
+                add_timer(3*self.size, func.prepared_timed_call(
+                    self._grid, self._block,
                     selffac, self.gpudata, otherfac, other.gpudata,
-                out.gpudata, self.mem_size)
+                    out.gpudata, self.mem_size))
+            else:
+                func.prepared_async_call(self._grid, self._block, stream,
+                    selffac, self.gpudata, otherfac, other.gpudata,
+                    out.gpudata, self.mem_size)
+
         return out

     def _axpbz(self, selffac, other, out, stream=None):
         """Compute ``out = selffac * self + other``, where
         `other` is a scalar."""
-        if not self.flags.forc:
-            raise RuntimeError("only contiguous arrays may "
-                    "be used as arguments to this operation")
-
-        func = elementwise.get_axpbz_kernel(self.dtype, out.dtype)
-        func.prepared_async_call(self._grid, self._block, stream,
-                selffac, self.gpudata,
-                other, out.gpudata, self.mem_size)
+        noncontig = is_noncontig(self, out)
+        func = elementwise.get_axpbz_kernel(self.dtype, out.dtype, use_strides=noncontig)
+        if noncontig:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    selffac, self.gpudata,
+                    other, out.gpudata, self.mem_size,
+                    self.device_shape_and_strides,
+                    out.device_shape_and_strides)
+        else:
+            func.prepared_async_call(self._grid, self._block, stream,
+                    selffac, self.gpudata,
+                    other,
out.gpudata, self.mem_size) return out def _elwise_multiply(self, other, out, stream=None): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + + noncontig = is_noncontig(self, other, out) func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, - out.dtype, "*") - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + out.dtype, "*", use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other.gpudata, + out.gpudata, self.mem_size, + self.device_shape_and_strides, + other.device_shape_and_strides, + out.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other.gpudata, + out.gpudata, self.mem_size) return out @@ -347,31 +414,43 @@ class GPUArray(object): y = n / self """ - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + noncontig = is_noncontig(self, out) - func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype) - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other, - out.gpudata, self.mem_size) + func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype, use_strides=noncontig) + + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other, + out.gpudata, self.mem_size, + self.device_shape_and_strides, + out.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other, + out.gpudata, self.mem_size) return out def _div(self, other, out, stream=None): """Divides an array by another array.""" - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - assert self.shape == other.shape + noncontig = is_noncontig(self, other, out) + func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, - out.dtype, "/") - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + out.dtype, "/", use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other.gpudata, + out.gpudata, self.mem_size, + self.device_shape_and_strides, + other.device_shape_and_strides, + out.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, other.gpudata, + out.gpudata, self.mem_size) return out @@ -381,7 +460,8 @@ class GPUArray(object): dtype = self.dtype else: if dtype == self.dtype: - strides = self.strides + # strides = self.strides + strides = None return self.__class__(self.shape, dtype, allocator=self.allocator, strides=strides) @@ -513,9 +593,17 @@ class GPUArray(object): def fill(self, value, stream=None): """fills the array with the specified value""" - func = elementwise.get_fill_kernel(self.dtype) - func.prepared_async_call(self._grid, self._block, stream, - value, self.gpudata, self.mem_size) + noncontig = is_noncontig(self) + + func = elementwise.get_fill_kernel(self.dtype, use_strides=noncontig) + + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + value, self.gpudata, self.mem_size, + self.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + value, self.gpudata, self.mem_size) return self @@ -597,11 +685,19 @@ 
class GPUArray(object): else: out_dtype = self.dtype + noncontig = is_noncontig(self, result) + func = elementwise.get_unary_func_kernel(fname, self.dtype, - out_dtype=out_dtype) + out_dtype=out_dtype, use_strides=noncontig) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, self.mem_size) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, self.mem_size) return result @@ -616,32 +712,46 @@ class GPUArray(object): """ if isinstance(other, GPUArray): - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - assert self.shape == other.shape result = self._new_like_me(_get_common_dtype(self, other)) - func = elementwise.get_pow_array_kernel( - self.dtype, other.dtype, result.dtype) + non_contig = is_noncontig(self, other, result) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, - self.mem_size) + func = elementwise.get_pow_array_kernel( + self.dtype, other.dtype, result.dtype, use_strides=non_contig) + + if non_contig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + other.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, other.gpudata, result.gpudata, + self.mem_size) return result else: - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + # if not self.flags.forc: + # raise RuntimeError("only contiguous arrays may " + # "be used as arguments to this operation") result = self._new_like_me() - func = elementwise.get_pow_kernel(self.dtype) - func.prepared_async_call(self._grid, self._block, None, - other, self.gpudata, result.gpudata, - self.mem_size) + + noncontig = is_noncontig(self, result) + func = elementwise.get_pow_kernel(self.dtype, use_strides=noncontig) + + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + other, self.gpudata, result.gpudata, + self.mem_size, self.device_shape_and_strides, result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + other, self.gpudata, result.gpudata, + self.mem_size) return result @@ -650,33 +760,43 @@ class GPUArray(object): as one-dimensional. 
""" - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - result = self._new_like_me() - func = elementwise.get_reverse_kernel(self.dtype) - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, result.gpudata, - self.mem_size) + noncontig = is_noncontig(self, result) + + func = elementwise.get_reverse_kernel(self.dtype, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + self.gpudata, result.gpudata, + self.mem_size) return result def astype(self, dtype, stream=None): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - if dtype == self.dtype: return self.copy() + noncontig = is_noncontig(self) + result = self._new_like_me(dtype=dtype) - func = elementwise.get_copy_kernel(dtype, self.dtype) - func.prepared_async_call(self._grid, self._block, stream, - result.gpudata, self.gpudata, - self.mem_size) + func = elementwise.get_copy_kernel(dtype, self.dtype, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, stream, + result.gpudata, self.gpudata, + self.mem_size, + result.device_shape_and_strides, + self.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, stream, + result.gpudata, self.gpudata, + self.mem_size) return result @@ -903,10 +1023,19 @@ class GPUArray(object): result = self._new_like_me(dtype=real_dtype) - func = elementwise.get_real_kernel(dtype, real_dtype) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, - self.mem_size) + noncontig = is_noncontig(self) + + func = elementwise.get_real_kernel(dtype, real_dtype, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size) return result else: @@ -916,19 +1045,25 @@ class GPUArray(object): def imag(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) result = self._new_like_me(dtype=real_dtype) - func = elementwise.get_imag_kernel(dtype, real_dtype) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, - self.mem_size) + noncontig = is_noncontig(self) + + func = elementwise.get_imag_kernel(dtype, real_dtype, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size) return result else: @@ -937,16 +1072,21 @@ class GPUArray(object): def conj(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") 
result = self._new_like_me() - - func = elementwise.get_conj_kernel(dtype) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, - self.mem_size) + noncontig = is_noncontig(self) + + func = elementwise.get_conj_kernel(dtype, use_strides=noncontig) + if noncontig: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size, + self.device_shape_and_strides, + result.device_shape_and_strides) + else: + func.prepared_async_call(self._grid, self._block, None, + self.gpudata, result.gpudata, + self.mem_size) return result else:
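
Reviewer notes (illustrative sketches, not part of the patch):

The i2m device function converts the kernel's linear iteration index into a memory offset by peeling off one coordinate per dimension, fastest-varying axis first; this is why device_shape_and_strides reverses shape and strides before packing. A host-side Python mirror, cross-checked against numpy (all names here are illustrative):

    import numpy as np

    def i2m(i, n, stride):
        # peel coordinates off the linear index, fastest axis first,
        # and accumulate per-axis element-stride offsets
        m, j = 0, i
        for nk, sk in zip(n, stride):
            j, r = divmod(j, nk)
            m += sk * r
        return m

    base = np.arange(24)
    a = base.reshape(4, 6)[::2, ::3]                  # non-contiguous 2x2 view
    n = a.shape[::-1]                                 # reversed: fastest first
    stride = tuple(s // a.itemsize for s in a.strides)[::-1]
    ref = a.reshape(-1)                               # C-order copy as reference
    assert all(base[i2m(i, n, stride)] == ref[i] for i in range(a.size))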
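The dim struct pairs unsigned n[max_dims] with long stride[max_dims]. With native alignment the packed blob is larger than max_dims*8 on common LP64 platforms (4 bytes of padding before the first long), which is why the allocation in device_shape_and_strides is sized with struct.calcsize rather than a hard-coded per-dimension itemsize. A quick check, assuming a 64-bit Linux build:

    import struct

    max_dims = 2
    fmt = 'I' * max_dims + 'l' * max_dims      # unsigned n[2], long stride[2]
    print(struct.calcsize(fmt))                # 24 on LP64, not max_dims*8 == 16

    shape, strides = (5, 1), (1, 0)            # reversed and padded, in elements
    blob = struct.pack(fmt, *(shape + strides))
    assert len(blob) == struct.calcsize(fmt)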
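The source-rewriting step can be checked in isolation: every subscript of a vector argument should be routed through i2m, while other names are left alone. A standalone sketch of the non-recursive branch (the operation string and names are illustrative):

    import re

    names = ['x', 'z']                         # vector arguments, in order

    def fix_indexing(t):
        try:
            n = names.index(t.group(1))
        except ValueError:                     # not a vector argument
            return t.group(0)
        return '{}[i2m({},*__dim{})]'.format(t.group(1), t.group(2), n)

    regex = r"([a-zA-Z_][a-zA-Z0-9_]*)\[(.+?)\]"
    print(re.sub(regex, fix_indexing, "z[i] = a * x[i] + b"))
    # -> z[i2m(i,*__dim1)] = a * x[i2m(i,*__dim0)] + b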
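If the patch behaves as intended, elementwise operations that previously raised "only contiguous arrays may be used as arguments to this operation" should now accept strided views. A hedged end-to-end smoke test; it assumes this branch is installed and that GPUArray slicing yields a non-contiguous view, as in recent PyCUDA releases:

    import numpy as np
    import pycuda.autoinit                     # noqa: F401 -- creates a context
    import pycuda.gpuarray as gpuarray
    import pycuda.cumath as cumath

    a = np.random.rand(8, 8).astype(np.float32)
    d = gpuarray.to_gpu(a)
    view = d[::2, ::2]                         # strided device view
    assert not view.flags.forc

    # previously a RuntimeError; now dispatched with use_strides=True
    np.testing.assert_allclose(cumath.exp(view).get(),
            np.exp(a[::2, ::2]), rtol=1e-5)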