diff --git a/pyopencl/array.py b/pyopencl/array.py
index c0c74ef6367223f1351f6d761605319090880137..450aec8e14255dd2f5d08226c0265b3ae2265a4c 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -33,19 +33,19 @@ OTHER DEALINGS IN THE SOFTWARE.
 import numpy
 import pyopencl.elementwise as elementwise
 import pyopencl as cl
+#from pytools import memoize_method
+from decorator import decorator
 
 
 
-def splay(ctx, n):
-    max_work_items = max(dev.max_work_group_size for dev in ctx.devices)
-    max_work_items = min(128, max_work_items)
+def splay(queue, n):
+    dev = queue.device
+    max_work_items = min(128, dev.max_work_group_size)
     min_work_items = min(32, max_work_items)
-    max_groups = max(
-            4 * dev.max_compute_units * 8
-            # 4 to overfill the device
-            # 8 is an Nvidia constant--that's how many
-            # groups fit onto one compute device
-            for dev in ctx.devices)
+    max_groups = dev.max_compute_units * 4 * 8
+    # 4 to overfill the device
+    # 8 is an Nvidia constant--that's how many
+    # groups fit onto one compute device
 
     if n < min_work_items:
         group_count = 1
@@ -74,6 +74,39 @@ def _get_common_dtype(obj1, obj2):
 
 
 
+@decorator
+def elwise_kernel_runner(kernel_getter, *args, **kwargs):
+    """Take a kernel getter of the same signature as the kernel 
+    and return a function that invokes that kernel.
+
+    Assumes that the zeroth entry in *args* is an :class:`Array`.
+    """
+
+    knl = kernel_getter(*args)
+
+    repr_ary = args[0]
+    assert isinstance(repr_ary, Array)
+
+    queue = kwargs.pop("queue", None) or repr_ary.queue
+    gs, ls = repr_ary.get_sizes(queue)
+
+    if kwargs:
+        raise TypeError("only the 'queue' keyword argument is supported")
+
+    actual_args = []
+    for arg in args:
+        if isinstance(arg, Array):
+            actual_args.append(arg.data)
+        else:
+            actual_args.append(arg)
+    actual_args.append(repr_ary.mem_size)
+
+    return knl(queue, gs, ls, *actual_args)
+
+
+
+
+
 class DefaultAllocator:
     def __init__(self, context, flags=cl.mem_flags.READ_WRITE):
         self.context = context
@@ -132,7 +165,9 @@ class Array(object):
 
         self.base = base
 
-        self._global_size, self._local_size = splay(context, self.mem_size)
+    #@memoize_method FIXME: reenable
+    def get_sizes(self, queue):
+        return splay(self.queue, self.mem_size)
 
     def set(self, ary, queue=None, async=False):
         assert ary.size == self.size
@@ -170,38 +205,28 @@ class Array(object):
         raise TypeError("pyopencl arrays are not hashable.")
 
     # kernel invocation wrappers ----------------------------------------------
-    def _axpbyz(self, selffac, other, otherfac, out, queue=None):
+    @elwise_kernel_runner
+    @staticmethod
+    def _axpbyz(out, afac, a, bfac, b, queue=None):
         """Compute ``out = selffac * self + otherfac*other``,
         where `other` is a vector.."""
         assert self.shape == other.shape
 
-        knl = elementwise.get_axpbyz_kernel(
+        return elementwise.get_axpbyz_kernel(
                 self.context, self.dtype, other.dtype, out.dtype)
 
-        knl(queue or self.queue, self._global_size, self._local_size,
-                selffac, self.data, otherfac, other.data,
-                out.data, self.mem_size)
-
-        return out
-
-    def _axpbz(self, selffac, other, out, queue=None):
+    @elwise_kernel_runner
+    def _axpbz(self, out, selffac, other, queue=None):
         """Compute ``out = selffac * self + other``, where `other` is a scalar."""
-        knl = elementwise.get_axpbz_kernel(self.context, self.dtype)
-        knl(queue or self.queue, self._global_size, self._local_size,
-                selffac, self.data, other, out.data, self.mem_size)
+        return elementwise.get_axpbz_kernel(self.context, self.dtype)
 
-        return out
-
-    def _elwise_multiply(self, other, out, queue=None):
-        knl = elementwise.get_multiply_kernel(
+    @elwise_kernel_runner
+    def _elwise_multiply(self, out, other, queue=None):
+        return elementwise.get_multiply_kernel(
                 self.context, self.dtype, other.dtype, out.dtype)
-        knl(queue or self.queue, self._global_size, self._local_size,
-                self.data, other.data,
-                out.data, self.mem_size)
-
-        return out
 
-    def _rdiv_scalar(self, other, out, queue=None):
+    @elwise_kernel_runner
+    def _rdiv_scalar(self, out, other, queue=None):
         """Divides an array by a scalar::
 
            y = n / self
@@ -209,24 +234,29 @@ class Array(object):
 
         assert self.dtype == numpy.float32
 
-        knl = elementwise.get_rdivide_elwise_kernel(self.context, self.dtype)
-        knl(queue or self.queue, self._global_size, self._local_size,
-                self.data, other,
-                out.data, self.mem_size)
-
-        return out
+        return elementwise.get_rdivide_elwise_kernel(self.context, self.dtype)
 
-    def _div(self, other, out, queue=None):
+    @elwise_kernel_runner
+    def _div(self, out, other, queue=None):
         """Divides an array by another array."""
 
         assert self.shape == other.shape
 
-        knl = elementwise.get_divide_kernel(self.context,
+        return elementwise.get_divide_kernel(self.context,
                 self.dtype, other.dtype, out.dtype)
-        knl(queue or self.queue, self._global_size, self._local_size,
-                self.data, other.data, out.data, self.mem_size)
 
-        return out
+    @elwise_kernel_runner
+    def _fill(result, scalar):
+        return elementwise.get_fill_kernel(result.context, result.dtype)
+
+    @elwise_kernel_runner
+    def _abs(result, arg):
+        return elementwise.get_unary_func_kernel(
+                arg.context, fname, arg.dtype)
+
+    @elwise_kernel_runner
+    def _pow_scalar(self, ):
+        return elementwise.get_pow_kernel(self.context, self.dtype)
 
     def _new_like_me(self, dtype=None):
         if dtype is None:
@@ -236,11 +266,12 @@ class Array(object):
                 queue=self.queue)
 
     # operators ---------------------------------------------------------------
-    def mul_add(self, selffac, other, otherfac, add_timer=None, queue=None):
+    def mul_add(self, selffac, other, otherfac, queue=None):
         """Return `selffac * self + otherfac*other`.
         """
         result = self._new_like_me(_get_common_dtype(self, other))
-        return self._axpbyz(selffac, other, otherfac, result, add_timer)
+        self._axpbyz(result, selffac, self, other, otherfac)
+        return result
 
     def __add__(self, other):
         """Add an array with an array or an array with a scalar."""
@@ -248,14 +279,16 @@ class Array(object):
         if isinstance(other, Array):
             # add another vector
             result = self._new_like_me(_get_common_dtype(self, other))
-            return self._axpbyz(1, other, 1, result)
+            self._axpbyz(result, 1, self, 1, other)
+            return result
         else:
             # add a scalar
             if other == 0:
                 return self
             else:
                 result = self._new_like_me()
-                return self._axpbz(1, other, result)
+                self._axpbz(result, 1, other)
+                return result
 
     __radd__ = __add__
 
@@ -264,14 +297,15 @@ class Array(object):
 
         if isinstance(other, Array):
             result = self._new_like_me(_get_common_dtype(self, other))
-            return self._axpbyz(1, other, -1, result)
+            self._axpbyz(result, 1, other, -1)
+            return result
         else:
             if other == 0:
                 return self
             else:
                 # create a new array for the result
                 result = self._new_like_me()
-                return self._axpbz(1, -other, result)
+                return self._axpbz(result, 1, -other)
 
     def __rsub__(self,other):
         """Substracts an array by a scalar or an array::
@@ -280,32 +314,37 @@ class Array(object):
         """
         # other must be a scalar
         result = self._new_like_me()
-        return self._axpbz(-1, other, result)
+        return self._axpbz(result, -1, other)
 
     def __iadd__(self, other):
-        return self._axpbyz(1, other, 1, self)
+        self._axpbyz(self, 1, self, 1, other)
+        return self
 
     def __isub__(self, other):
-        return self._axpbyz(1, other, -1, self)
+        return self._axpbyz(self, 1, self, -1, other)
 
     def __neg__(self):
         result = self._new_like_me()
-        return self._axpbz(-1, 0, result)
+        return self._axpbz(result, -1, self)
 
     def __mul__(self, other):
         if isinstance(other, Array):
             result = self._new_like_me(_get_common_dtype(self, other))
-            return self._elwise_multiply(other, result)
+            self._elwise_multiply(result, other)
+            return result
         else:
             result = self._new_like_me()
-            return self._axpbz(other, 0, result)
+            self._axpbz(result, other, self, 0)
+            return result
 
     def __rmul__(self, scalar):
         result = self._new_like_me()
-        return self._axpbz(scalar, 0, result)
+        self._axpbz(result, scalar, self, 0)
+        return result
 
     def __imul__(self, scalar):
-        return self._axpbz(scalar, 0, self)
+        self._axpbz(self, scalar, 0)
+        return self
 
     def __div__(self, other):
         """Divides an array by an array or a scalar::
@@ -314,14 +353,15 @@ class Array(object):
         """
         if isinstance(other, Array):
             result = self._new_like_me(_get_common_dtype(self, other))
-            return self._div(other, result)
+            return self._div(result, other)
         else:
             if other == 1:
                 return self
             else:
                 # create a new array for the result
                 result = self._new_like_me()
-                return self._axpbz(1/other, 0, result)
+                self._axpbz(result, 1/other, 0)
+                return result
 
     __truediv__ = __div__
 
@@ -333,29 +373,22 @@ class Array(object):
 
         if isinstance(other, Array):
             result = self._new_like_me(_get_common_dtype(self, other))
-
-            knl = elementwise.get_divide_kernel(self.context)
-            knl(self.queue, self._global_size, self._local_size,
-                    other.data, self.data, result.data,
-                    self.mem_size)
-
-            return result
+            other._div(result, self)
         else:
             if other == 1:
                 return self
             else:
                 # create a new array for the result
                 result = self._new_like_me()
-                return self._rdiv_scalar(other, result)
+                self._rdiv_scalar(result, other)
+
+        return result
 
     __rtruediv__ = __div__
 
     def fill(self, value, queue=None):
         """fills the array with the specified value"""
-        knl = elementwise.get_fill_kernel(self.context, self.dtype)
-        knl(queue or self.queue, self._global_size, self._local_size,
-                value, self.data, self.mem_size)
-
+        self._fill(self, value, queue=queue)
         return self
 
     def __len__(self):
@@ -370,8 +403,6 @@ class Array(object):
         of `self`.
         """
 
-        result = self._new_like_me()
-
         if self.dtype.kind == "f":
             fname = "fabs"
         elif self.dtype.kind in ["u", "i"]:
@@ -379,11 +410,8 @@ class Array(object):
         else:
             raise TypeError("unsupported dtype in __abs__()")
 
-        knl = elementwise.get_unary_func_kernel(
-                self.context, fname, self.dtype)
-        knl(self.queue, self._global_size, self._local_size,
-                self.data,result.data, self.mem_size)
-
+        result = self._new_like_me()
+        self._abs(result, self)
         return result
 
     def __pow__(self, other):
@@ -406,7 +434,6 @@ class Array(object):
             return result
         else:
             result = self._new_like_me()
-            knl = elementwise.get_pow_kernel(self.context, self.dtype)
             knl(self.queue, self._global_size, self._local_size,
                     other, self.data, result.data,
                     self.mem_size)
diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index 0fc9aaedf2226fffb03c21745846033c3ee28bca..4ec64b73d9e5cff11b784db6553a76fda5886b2f 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -152,9 +152,9 @@ class ElementwiseKernel:
             queue = repr_vec.queue
         invocation_args.append(repr_vec.mem_size)
 
+        gs, ls = repr_vec.get_sizes(queue)
         self.kernel.set_args(*invocation_args)
-        return cl.enqueue_nd_range_kernel(queue, self.kernel,
-                repr_vec._global_size, repr_vec._local_size)
+        return cl.enqueue_nd_range_kernel(queue, self.kernel, gs, ls)
 
 
 
@@ -166,9 +166,9 @@ def get_take_kernel(context, dtype, idx_dtype, vec_count=1):
             "tp": dtype_to_ctype(dtype),
             }
 
-    args = ([VectorArg(idx_dtype, "idx")]
-            + [VectorArg(dtype, "src"+str(i))for i in range(vec_count)] 
-            + [VectorArg(dtype, "dest"+str(i))for i in range(vec_count)])
+    args = ([VectorArg(dtype, "dest"+str(i))for i in range(vec_count)]
+            + [VectorArg(idx_dtype, "idx")]
+            + [VectorArg(dtype, "src"+str(i))for i in range(vec_count)] )
     body = (
             ("%(idx_tp)s src_idx = idx[i];\n" % ctx)
             + "\n".join(
@@ -188,15 +188,15 @@ def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, vec_count=1):
             }
 
     args = [
+            VectorArg(dtype, "dest%d" % i)
+                for i in range(vec_count)
+            ] + [
             VectorArg(idx_dtype, "gmem_dest_idx"),
             VectorArg(idx_dtype, "gmem_src_idx"),
             ] + [
             VectorArg(dtype, "src%d" % i)
                 for i in range(vec_count)
             ] + [
-            VectorArg(dtype, "dest%d" % i)
-                for i in range(vec_count)
-            ] + [
             ScalarArg(idx_dtype, "offset%d" % i)
                 for i in range(vec_count) if with_offsets
             ]
@@ -228,11 +228,11 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
             }
 
     args = [
-            VectorArg(idx_dtype, "gmem_dest_idx"),
-            ] + [
             VectorArg(dtype, "dest%d" % i)
                 for i in range(vec_count)
             ] + [
+            VectorArg(idx_dtype, "gmem_dest_idx"),
+            ]  + [
             VectorArg(dtype, "src%d" % i)
                 for i in range(vec_count)
             ]
@@ -262,6 +262,9 @@ def get_copy_kernel(context, dtype_dest, dtype_src):
 @context_dependent_memoize
 def get_linear_combination_kernel(summand_descriptors,
         dtype_z):
+    # TODO: Port this!
+    raise NotImplementedError
+
     from pycuda.tools import dtype_to_ctype
     from pycuda.elementwise import \
             VectorArg, ScalarArg, get_elwise_module
@@ -311,7 +314,7 @@ def get_linear_combination_kernel(summand_descriptors,
 @context_dependent_memoize
 def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z):
     return get_elwise_kernel(context,
-            "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z" % {
+            "%(tp_z)s *z, %(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y" % {
                 "tp_x": dtype_to_ctype(dtype_x),
                 "tp_y": dtype_to_ctype(dtype_y),
                 "tp_z": dtype_to_ctype(dtype_z),
@@ -322,7 +325,7 @@ def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z):
 @context_dependent_memoize
 def get_axpbz_kernel(context, dtype):
     return get_elwise_kernel(context,
-            "%(tp)s a, %(tp)s *x,%(tp)s b, %(tp)s *z" % {
+            "%(tp)s *z, %(tp)s a, %(tp)s *x,%(tp)s b" % {
                 "tp": dtype_to_ctype(dtype)},
             "z[i] = a * x[i] + b",
             "axpb")
@@ -330,7 +333,7 @@ def get_axpbz_kernel(context, dtype):
 @context_dependent_memoize
 def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z):
     return get_elwise_kernel(context,
-            "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % {
+            "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % {
                 "tp_x": dtype_to_ctype(dtype_x),
                 "tp_y": dtype_to_ctype(dtype_y),
                 "tp_z": dtype_to_ctype(dtype_z),
@@ -341,7 +344,7 @@ def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z):
 @context_dependent_memoize
 def get_divide_kernel(context, dtype_x, dtype_y, dtype_z):
     return get_elwise_kernel(context,
-            "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % {
+            "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y, " % {
                 "tp_x": dtype_to_ctype(dtype_x),
                 "tp_y": dtype_to_ctype(dtype_y),
                 "tp_z": dtype_to_ctype(dtype_z),
@@ -352,7 +355,7 @@ def get_divide_kernel(context, dtype_x, dtype_y, dtype_z):
 @context_dependent_memoize
 def get_rdivide_elwise_kernel(context, dtype):
     return get_elwise_kernel(context,
-            "%(tp)s *x, %(tp)s y, %(tp)s *z" % {
+            "%(tp)s *z, %(tp)s *x, %(tp)s y" % {
                 "tp": dtype_to_ctype(dtype),
                 },
             "z[i] = y / x[i]",
@@ -361,7 +364,7 @@ def get_rdivide_elwise_kernel(context, dtype):
 @context_dependent_memoize
 def get_fill_kernel(context, dtype):
     return get_elwise_kernel(context,
-            "%(tp)s a, %(tp)s *z" % {
+            "%(tp)s *z, %(tp)s a" % {
                 "tp": dtype_to_ctype(dtype),
                 },
             "z[i] = a",
@@ -370,7 +373,7 @@ def get_fill_kernel(context, dtype):
 @context_dependent_memoize
 def get_reverse_kernel(context, dtype):
     return get_elwise_kernel(context,
-            "%(tp)s *y, %(tp)s *z" % {
+            "%(tp)s *z, %(tp)s *y" % {
                 "tp": dtype_to_ctype(dtype),
                 },
             "z[i] = y[n-1-i]",
@@ -389,7 +392,7 @@ def get_arange_kernel(context, dtype):
 @context_dependent_memoize
 def get_pow_kernel(context, dtype):
     return get_elwise_kernel(context,
-            "%(tp)s value, %(tp)s *y, %(tp)s *z" % {
+            "%(tp)s *z, %(tp)s value, %(tp)s *y, " % {
                 "tp": dtype_to_ctype(dtype),
                 },
             "z[i] = pow(y[i], value)",
@@ -398,7 +401,7 @@ def get_pow_kernel(context, dtype):
 @context_dependent_memoize
 def get_pow_array_kernel(context, dtype_x, dtype_y, dtype_z):
     return get_elwise_kernel(context,
-            "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % {
+            "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % {
                 "tp_x": dtype_to_ctype(dtype_x),
                 "tp_y": dtype_to_ctype(dtype_y),
                 "tp_z": dtype_to_ctype(dtype_z),
@@ -409,21 +412,21 @@ def get_pow_array_kernel(context, dtype_x, dtype_y, dtype_z):
 @context_dependent_memoize
 def get_fmod_kernel(context):
     return get_elwise_kernel(context,
-            "float *arg, float *mod, float *z",
+            "float *z, float *arg, float *mod",
             "z[i] = fmod(arg[i], mod[i])",
             "fmod_kernel")
 
 @context_dependent_memoize
 def get_modf_kernel(context):
     return get_elwise_kernel(context,
-            "float *x, float *intpart ,float *fracpart",
+            "float *intpart ,float *fracpart, float *x",
             "fracpart[i] = modf(x[i], &intpart[i])",
             "modf_kernel")
 
 @context_dependent_memoize
 def get_frexp_kernel(context):
     return get_elwise_kernel(context,
-            "float *x, float *significand, float *exponent",
+            "float *significand, float *exponent, float *x",
             """
                 int expt = 0;
                 significand[i] = frexp(x[i], &expt);
@@ -434,7 +437,7 @@ def get_frexp_kernel(context):
 @context_dependent_memoize
 def get_ldexp_kernel(context):
     return get_elwise_kernel(context,
-            "float *sig, float *expt, float *z",
+            "float *z, float *sig, float *expt",
             "z[i] = ldexp(sig[i], (int) expt[i])",
             "ldexp_kernel")
 
@@ -444,7 +447,7 @@ def get_unary_func_kernel(context, func_name, in_dtype, out_dtype=None):
         out_dtype = in_dtype
 
     return get_elwise_kernel(context,
-            "%(tp_in)s *y, %(tp_out)s *z" % {
+            "%(tp_out)s *z, %(tp_in)s *y" % {
                 "tp_in": dtype_to_ctype(in_dtype),
                 "tp_out": dtype_to_ctype(out_dtype),
                 },
@@ -457,10 +460,10 @@ def get_unary_func_kernel(context, func_name, in_dtype, out_dtype=None):
 @context_dependent_memoize
 def get_if_positive_kernel(context, crit_dtype, dtype):
     return get_elwise_kernel(context, [
+            VectorArg(dtype, "result"),
             VectorArg(crit_dtype, "crit"),
             VectorArg(dtype, "then_"),
             VectorArg(dtype, "else_"),
-            VectorArg(dtype, "result"),
             ],
             "result[i] = crit[i] > 0 ? then_[i] : else_[i]",
             "if_positive")