diff --git a/doc/source/array.rst b/doc/source/array.rst
index 0e435a7c69b8954c53356d1d0d4dd54a4522611e..c6586eb0cc2c8f4e007df83381591fdeb7ef4242 100644
--- a/doc/source/array.rst
+++ b/doc/source/array.rst
@@ -1,8 +1,24 @@
-The :class:`Array` Class
+Multi-dimensional arrays on the Compute Device
 .. module:: pyopencl.array
+Vector Types
+.. class :: vec
+    All of OpenCL's supported vector types, such as ``float3`` and ``long4``, are
+    available as :mod:`numpy` data types within this class. These
+    :class:`numpy.dtype` instances have field names of ``x``, ``y``, ``z``, and ``w``
+    just like their OpenCL counterparts. They will work both for parameter passing
+    to kernels and for passing data back and forth between kernels and
+    Python code. For each type, a ``make_type`` function is also provided (e.g.
+    ``make_float3(x, y, z)``).
+The :class:`Array` Class
 .. class:: DefaultAllocator(context, flags=pyopencl.mem_flags.READ_WRITE)
     An alias for :class:`pyopencl.tools.CLAllocator`.
diff --git a/doc/source/misc.rst b/doc/source/misc.rst
index c96af3d1c1edde33aac64cc308ef3ae73ae14bd9..e18d293e9860ddbc19e61f33a834807d7428b256 100644
--- a/doc/source/misc.rst
+++ b/doc/source/misc.rst
@@ -93,6 +93,7 @@ Version 2011.1
 * Make construction of :class:`pyopencl.array.Array` more flexible (*cqa* argument.)
 * Add :ref:`memory-pools`.
+* Add vector types, see :class:`pyopencl.array.vec`.
 Version 0.92
diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index c05cf74bc4fcc51448b6aee27648870e9cccb505..83f9d3a79fcdd25d3ac6d901ae5ab9b5729c3a4e 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -13,6 +13,7 @@ except ImportError:
+import numpy as np
 from pyopencl._cl import *
 import inspect as _inspect
@@ -225,8 +226,7 @@ def _add_functionality():
             if arg_dtype is None:
-                import numpy
-                arg_type_chars.append(numpy.dtype(arg_dtype).char)
+                arg_type_chars.append(np.dtype(arg_dtype).char)
         self._arg_type_chars = arg_type_chars
diff --git a/pyopencl/array.py b/pyopencl/array.py
index d068dfd5f457c7f6aa765016bef0162f600a015b..71a2206f1b1cf46ec9c0b28bb2f0d64256d0aeb6 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
-import numpy
+import numpy as np
 import pyopencl.elementwise as elementwise
 import pyopencl as cl
 #from pytools import memoize_method
+# {{{ vector types
+class vec:
+    """Namespace for OpenCL vector types.
+
+    Attributes are attached by :func:`_create_vector_types`: one
+    :class:`numpy.dtype` per vector type name (e.g. ``float4``), a
+    ``make_<name>`` constructor for each of them, and the lookup tables
+    ``_dtype_to_c_name`` and ``_c_name_to_dtype``.
+    """
+    pass
+def _create_vector_types():
+    """Attach OpenCL vector dtypes and their constructors to :class:`vec`.
+
+    For each base type and each component count in ``counts`` this creates a
+    :class:`numpy.dtype` whose fields are named ``s0``, ``s1``, ... and whose
+    first (up to four) fields are additionally titled ``x``, ``y``, ``z``,
+    ``w``.  The dtype is stored as ``vec.<name>`` (e.g. ``vec.float4``) and a
+    ``vec.make_<name>`` constructor is stored next to it.  Finally the
+    name/dtype lookup tables are stored as ``vec._c_name_to_dtype`` and
+    ``vec._dtype_to_c_name``.
+    """
+    # NOTE(review): nothing happens until this function is called; no call is
+    # visible in this patch -- confirm that the module invokes it at import.
+    field_names = ["x", "y", "z", "w"]
+
+    name_to_dtype = {}
+    dtype_to_name = {}
+
+    counts = [2, 3, 4, 8, 16]
+    for base_name, base_type in [
+        ('char', np.int8),
+        ('uchar', np.uint8),
+        ('short', np.int16),
+        ('ushort', np.uint16),
+        # 'int' must be signed; mapping it to uint32 would also make the
+        # intN and uintN dtypes compare equal and collide in dtype_to_name.
+        ('int', np.int32),
+        ('uint', np.uint32),
+        ('long', np.int64),
+        ('ulong', np.uint64),
+        ('float', np.float32),
+        ('double', np.float64),
+        ]:
+        for count in counts:
+            name = "%s%d" % (base_name, count)
+            component_names = ["s%d" % i for i in range(count)]
+
+            # Only the first four components have x/y/z/w titles; the
+            # remaining ones get no title.
+            titles = field_names[:count]
+            if len(titles) < count:
+                titles.extend((count-len(titles))*[None])
+
+            # NOTE(review): OpenCL sizes 3-component vectors like 4-component
+            # ones; this dtype has no padding field -- confirm the layout
+            # before passing *3 types to kernels.
+            dtype = np.dtype(dict(
+                names=component_names,
+                formats=[base_type]*count,
+                titles=titles))
+
+            name_to_dtype[name] = dtype
+            dtype_to_name[dtype] = name
+
+            setattr(vec, name, dtype)
+
+            # The constructor needs one argument per component.  Vectors
+            # with up to four components keep the x/y/z/w argument names;
+            # wider ones (8, 16) use s0, s1, ... because there are not
+            # enough letter names to cover every field.
+            if count <= len(field_names):
+                arg_names = field_names[:count]
+            else:
+                arg_names = component_names
+
+            my_field_names = ",".join(arg_names)
+            my_field_names_defaulted = ",".join(
+                    "%s=0" % fn for fn in arg_names)
+            # eval() is used only to give the lambda a real keyword
+            # signature; the source string is built from the fixed names
+            # above, never from external input.
+            setattr(vec, "make_"+name,
+                    staticmethod(eval(
+                        "lambda %s: array((%s), dtype=my_dtype)"
+                        % (my_field_names_defaulted, my_field_names),
+                        dict(array=np.array, my_dtype=dtype))))
+
+    vec._dtype_to_c_name = dtype_to_name
+    vec._c_name_to_dtype = name_to_dtype
+# }}}
+# {{{ helper functionality
 def splay(queue, n):
     dev = queue.device
@@ -126,6 +184,10 @@ def _should_be_cqa(what):
             "versions 2011.x of PyOpenCL." % (what, what),
             DeprecationWarning, 3)
+# }}}
+# {{{ array class
 class Array(object):
     """A :mod:`pyopencl` Array is used to do array-based calculation on
     a compute device.
@@ -183,7 +245,7 @@ class Array(object):
         self.queue = queue
         self.shape = shape
-        self.dtype = numpy.dtype(dtype)
+        self.dtype = np.dtype(dtype)
         if order not in ["C", "F"]:
             raise ValueError("order must be either 'C' or 'F'")
         self.order = order
@@ -222,7 +284,7 @@ class Array(object):
     def get(self, queue=None, ary=None, async=False):
         if ary is None:
-            ary = numpy.empty(self.shape, self.dtype, order=self.order)
+            ary = np.empty(self.shape, self.dtype, order=self.order)
             if ary.size != self.size:
                 raise TypeError("'ary' has non-matching type")
@@ -543,7 +605,9 @@ class Array(object):
     def __gt__(self, other):
         raise NotImplementedError
+# }}}
+# {{{ creation helpers
 def _to_device(queue, ary, allocator=None, async=False):
     if ary.flags.f_contiguous:
@@ -630,7 +694,7 @@ def _arange(queue, *args, **kwargs):
     inf.step = None
     inf.dtype = None
-    if isinstance(args[-1], numpy.dtype):
+    if isinstance(args[-1], np.dtype):
         dtype = args[-1]
         args = args[:-1]
         explicit_dtype = True
@@ -667,10 +731,10 @@ def _arange(queue, *args, **kwargs):
     if inf.step is None:
         inf.step = 1
     if inf.dtype is None:
-        inf.dtype = numpy.array([inf.start, inf.stop, inf.step]).dtype
+        inf.dtype = np.array([inf.start, inf.stop, inf.step]).dtype
     # actual functionality ----------------------------------------------------
-    dtype = numpy.dtype(inf.dtype)
+    dtype = np.dtype(inf.dtype)
     start = dtype.type(inf.start)
     step = dtype.type(inf.step)
     stop = dtype.type(inf.stop)
@@ -707,9 +771,9 @@ def arange(*args, **kwargs):
     return _arange(*args, **kwargs)
+# }}}
+# {{{ take/put
 def _take(result, ary, indices):
@@ -900,8 +964,9 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None):
     return out
+# }}}
+# {{{ conditionals
 def _if_positive(result, criterion, then_, else_):
@@ -943,10 +1008,9 @@ def minimum(a, b, out=None, queue=None):
     return if_positive(a.mul_add(1, b, -1, queue=queue), b, a,
             queue=queue, out=out)
+# }}}
-# reductions ------------------------------------------------------------------
+# {{{ reductions
 _builtin_min = min
 _builtin_max = max
@@ -988,6 +1052,8 @@ def _make_subset_minmax_kernel(what):
 subset_min = _make_subset_minmax_kernel("min")
 subset_max = _make_subset_minmax_kernel("max")
+# }}}
diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py
index cdd53163e88745f8c8d858f357e78a3bef7c1310..c2cea99cfe41dff89009d538a17fdafbba9954ce 100644
--- a/pyopencl/clrandom.py
+++ b/pyopencl/clrandom.py
@@ -178,7 +178,7 @@ md5_code = """
   d += 0x10325476;
-import numpy
+import numpy as np
@@ -186,7 +186,7 @@ import numpy
 def get_rand_kernel(context, dtype):
     from pyopencl.elementwise import get_elwise_kernel
-    if dtype == numpy.float32:
+    if dtype == np.float32:
         return get_elwise_kernel(context,
             "float *dest, unsigned int seed",
             md5_code + """
@@ -200,7 +200,7 @@ def get_rand_kernel(context, dtype):
                 dest[i] = d*POW_2_M32;
-    elif dtype == numpy.float64:
+    elif dtype == np.float64:
         return get_elwise_kernel(context,
             "double *dest, unsigned int seed",
             md5_code + """
@@ -215,7 +215,7 @@ def get_rand_kernel(context, dtype):
-    elif dtype in [numpy.int32, numpy.uint32]:
+    elif dtype in [np.int32, np.uint32]:
         return get_elwise_kernel(context,
             "unsigned int *dest, unsigned int seed",
             md5_code + """
@@ -239,13 +239,13 @@ def _rand(output, seed):
     return get_rand_kernel(output.context, output.dtype)
 def fill_rand(result):
-    _rand(result, numpy.random.randint(2**31-1))
+    _rand(result, np.random.randint(2**31-1))
 def rand(context, queue, shape, dtype):
     from pyopencl.array import Array
     result = Array(queue, shape, dtype)
-    _rand(result, numpy.random.randint(2**31-1))
+    _rand(result, np.random.randint(2**31-1))
     return result
@@ -257,7 +257,7 @@ if __name__ == "__main__":
     if "generate" in sys.argv[1:]:
         N = 256
         print N, "MB"
-        r = rand(ctx, queue, (N*2**18,), numpy.uint32)
+        r = rand(ctx, queue, (N*2**18,), np.uint32)
         print "generated"
         print "written"
@@ -265,9 +265,9 @@ if __name__ == "__main__":
         from pylab import plot, show, subplot
         N = 250
-        r1 = rand(ctx, queue, (N,), numpy.uint32)
-        r2 = rand(ctx, queue, (N,), numpy.int32)
-        r3 = rand(ctx, queue, (N,), numpy.float32)
+        r1 = rand(ctx, queue, (N,), np.uint32)
+        r2 = rand(ctx, queue, (N,), np.int32)
+        r3 = rand(ctx, queue, (N,), np.float32)
         subplot(131); plot( r1.get(),"x-")
         subplot(132); plot( r2.get(),"x-")
diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index 70ae34cd885d3974605d0e5dceae09ad56160ce7..f3994ed621d2fe282269a1ce16c026943f110097 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
 from pyopencl.tools import context_dependent_memoize
-import numpy
+import numpy as np
 import pyopencl as cl
 from pyopencl.tools import dtype_to_ctype, VectorArg, ScalarArg
@@ -84,13 +84,13 @@ def get_elwise_kernel_and_types(context, arguments, operation,
         parsed_args = arguments
     for arg in parsed_args:
-        if numpy.float64  == arg.dtype:
+        if np.float64  == arg.dtype:
             preamble = (
                     "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n\n\n"
                     + preamble)
-    parsed_args.append(ScalarArg(numpy.uintp, "n"))
+    parsed_args.append(ScalarArg(np.uintp, "n"))
     prg = get_elwise_program(context, parsed_args, operation, name,
             keep, options, preamble, **kwargs)
@@ -297,7 +297,7 @@ def get_linear_combination_kernel(summand_descriptors,
         summands.append("a%d*x%d[i]" % (i, i))
     args.append(VectorArg(dtype_z, "z"))
-    args.append(ScalarArg(numpy.uintp, "n"))
+    args.append(ScalarArg(np.uintp, "n"))
     mod = get_elwise_module(args,
             "z[i] = " + " + ".join(summands),
diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py
index db894764f2a89372b8d5825326ffabe67ed06b68..9348253d7fefe1b657bf69c8ef9872ce075c555d 100644
--- a/pyopencl/reduction.py
+++ b/pyopencl/reduction.py
@@ -194,7 +194,7 @@ def  get_reduction_source(
     from mako.template import Template
     from pytools import all
     from pyopencl.tools import has_double_support
-    src = Template(KERNEL).render(
+    src = str(Template(KERNEL).render(
@@ -206,7 +206,7 @@ def  get_reduction_source(
             has_double_support(dev) for dev in devices)
-        )
+        ))
     from pytools import Record
     class ReductionInfo(Record):
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 0bdfa292600a6f7fb6f824a293316d585ba8f309..76470a6190ec20b254d530467e5c97d099b251ef 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
-import numpy
+import numpy as np
 from decorator import decorator
 import pyopencl as cl
+import pyopencl.array as cl_array
@@ -109,35 +110,38 @@ def dtype_to_ctype(dtype):
     if dtype is None:
         raise ValueError("dtype may not be None")
-    dtype = numpy.dtype(dtype)
-    if dtype == numpy.int64:
+    dtype = np.dtype(dtype)
+    if dtype == np.int64:
         return "long"
-    elif dtype == numpy.uint64:
+    elif dtype == np.uint64:
         return "unsigned long"
-    elif dtype == numpy.int32:
+    elif dtype == np.int32:
         return "int"
-    elif dtype == numpy.uint32:
+    elif dtype == np.uint32:
         return "unsigned int"
-    elif dtype == numpy.int16:
+    elif dtype == np.int16:
         return "short int"
-    elif dtype == numpy.uint16:
+    elif dtype == np.uint16:
         return "short unsigned int"
-    elif dtype == numpy.int8:
+    elif dtype == np.int8:
         return "signed char"
-    elif dtype == numpy.uint8:
+    elif dtype == np.uint8:
         return "unsigned char"
-    elif dtype == numpy.bool:
+    elif dtype == np.bool:
         return "bool"
-    elif dtype == numpy.float32:
+    elif dtype == np.float32:
         return "float"
-    elif dtype == numpy.float64:
+    elif dtype == np.float64:
         return "double"
-    elif dtype == numpy.complex64:
+    elif dtype == np.complex64:
         return "complex float"
-    elif dtype == numpy.complex128:
+    elif dtype == np.complex128:
         return "complex double"
-        raise ValueError, "unable to map dtype '%s'" % dtype
+        try:
+            return cl_array.vec._dtype_to_c_name[dtype]
+        except KeyError:
+            raise ValueError, "unable to map dtype '%s'" % dtype
 # }}}
@@ -147,7 +151,7 @@ def dtype_to_ctype(dtype):
 # {{{ C argument lists --------------------------------------------------------
 class Argument:
     def __init__(self, dtype, name, vector_len=1):
-        self.dtype = numpy.dtype(dtype)
+        self.dtype = np.dtype(dtype)
         self.name = name
         self.vector_len = vector_len
@@ -219,19 +223,23 @@ def parse_c_arg(c_arg):
         vector_len = 1
-    if tp == "float": dtype = numpy.float32
-    elif tp == "double": dtype = numpy.float64
-    elif tp in ["int", "signed int"]: dtype = numpy.int32
-    elif tp in ["unsigned", "unsigned int"]: dtype = numpy.uint32
-    elif tp in ["long", "long int"]: dtype = numpy.int64
+    if tp == "float": dtype = np.float32
+    elif tp == "double": dtype = np.float64
+    elif tp in ["int", "signed int"]: dtype = np.int32
+    elif tp in ["unsigned", "unsigned int"]: dtype = np.uint32
+    elif tp in ["long", "long int"]: dtype = np.int64
     elif tp in ["unsigned long", "unsigned long int"]:
-        dtype = numpy.uint64
-    elif tp in ["short", "short int"]: dtype = numpy.int16
-    elif tp in ["unsigned short", "unsigned short int"]: dtype = numpy.uint16
-    elif tp in ["char"]: dtype = numpy.int8
-    elif tp in ["unsigned char"]: dtype = numpy.uint8
-    elif tp in ["bool"]: dtype = numpy.bool
-    else: raise ValueError, "unknown type '%s'" % tp
+        dtype = np.uint64
+    elif tp in ["short", "short int"]: dtype = np.int16
+    elif tp in ["unsigned short", "unsigned short int"]: dtype = np.uint16
+    elif tp in ["char"]: dtype = np.int8
+    elif tp in ["unsigned char"]: dtype = np.uint8
+    elif tp in ["bool"]: dtype = np.bool
+    else:
+        try:  # fall back to OpenCL vector type names such as "float4"
+            dtype = cl_array.vec._c_name_to_dtype[tp]  # assign, do not return: the argument object is built below
+        except KeyError:
+            raise ValueError("unknown type '%s'" % tp)
     return arg_class(dtype, name, vector_len)
diff --git a/test/test_array.py b/test/test_array.py
index 863530c2bf213e9c585e084b2b295a3bd9abe87a..adabdcfc8780e655b83faafa14b233b409e673c4 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1,5 +1,5 @@
 #! /usr/bin/env python
-import numpy
+import numpy as np
 import numpy.linalg as la
 import sys
 import pytools.test
@@ -29,14 +29,14 @@ def test_pow_array(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
-    a = numpy.array([1,2,3,4,5]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5]).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
     result = pow(a_gpu,a_gpu).get()
-    assert (numpy.abs(a**a - result) < 1e-3).all()
+    assert (np.abs(a**a - result) < 1e-3).all()
     result = (a_gpu**a_gpu).get()
-    assert (numpy.abs(pow(a, a) - result) < 1e-3).all()
+    assert (np.abs(pow(a, a) - result) < 1e-3).all()
@@ -46,11 +46,11 @@ def test_pow_number(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
-    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
     result = pow(a_gpu, 2).get()
-    assert (numpy.abs(a**2 - result) < 1e-3).all()
+    assert (np.abs(a**2 - result) < 1e-3).all()
@@ -59,7 +59,7 @@ def test_abs(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
-    a = -cl_array.arange(queue, 111, dtype=numpy.float32)
+    a = -cl_array.arange(queue, 111, dtype=np.float32)
     res = a.get()
     for i in range(111):
@@ -79,7 +79,7 @@ def test_len(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
-    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
     a_cpu = cl_array.to_device(queue, a)
     assert len(a_cpu) == 10
@@ -96,11 +96,11 @@ def test_multiply(ctx_getter):
     for sz in [10, 50000]:
         for dtype, scalars in [
-            (numpy.float32, [2]),
-            #(numpy.complex64, [2, 2j])
+            (np.float32, [2]),
+            #(np.complex64, [2, 2j])
             for scalar in scalars:
-                a = numpy.arange(sz).astype(dtype)
+                a = np.arange(sz).astype(dtype)
                 a_gpu = cl_array.to_device(queue, a)
                 a_doubled = (scalar * a_gpu).get()
@@ -113,7 +113,7 @@ def test_multiply_array(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
-    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
     b_gpu = cl_array.to_device(queue, a)
@@ -132,7 +132,7 @@ def test_addition_array(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
-    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
     a_added = (a_gpu+a_gpu).get()
@@ -148,7 +148,7 @@ def test_addition_scalar(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
-    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
     a_added = (7+a_gpu).get()
@@ -161,8 +161,8 @@ def test_addition_scalar(ctx_getter):
 def test_substract_array(ctx_getter):
     """Test the substraction of two arrays."""
     #test data
-    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
-    b = numpy.array([10,20,30,40,50,60,70,80,90,100]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
+    b = np.array([10,20,30,40,50,60,70,80,90,100]).astype(np.float32)
     context = ctx_getter()
     queue = cl.CommandQueue(context)
@@ -187,7 +187,7 @@ def test_substract_scalar(ctx_getter):
     queue = cl.CommandQueue(context)
     #test data
-    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
     #convert a to a gpu object
     a_gpu = cl_array.to_device(queue, a)
@@ -208,7 +208,7 @@ def test_divide_scalar(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
-    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
     result = (a_gpu/2).get()
@@ -228,17 +228,17 @@ def test_divide_array(ctx_getter):
     queue = cl.CommandQueue(context)
     #test data
-    a = numpy.array([10,20,30,40,50,60,70,80,90,100]).astype(numpy.float32)
-    b = numpy.array([10,10,10,10,10,10,10,10,10,10]).astype(numpy.float32)
+    a = np.array([10,20,30,40,50,60,70,80,90,100]).astype(np.float32)
+    b = np.array([10,10,10,10,10,10,10,10,10,10]).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
     b_gpu = cl_array.to_device(queue, b)
     a_divide = (a_gpu/b_gpu).get()
-    assert (numpy.abs(a/b - a_divide) < 1e-3).all()
+    assert (np.abs(a/b - a_divide) < 1e-3).all()
     a_divide = (b_gpu/a_gpu).get()
-    assert (numpy.abs(b/a - a_divide) < 1e-3).all()
+    assert (np.abs(b/a - a_divide) < 1e-3).all()
@@ -251,9 +251,9 @@ def test_random(ctx_getter):
     from pyopencl.clrandom import rand as clrand
     if has_double_support(context.devices[0]):
-        dtypes = [numpy.float32, numpy.float64]
+        dtypes = [np.float32, np.float64]
-        dtypes = [numpy.float32]
+        dtypes = [np.float32]
     for dtype in dtypes:
         a = clrand(context, queue, (10, 100), dtype=dtype).get()
@@ -271,7 +271,7 @@ def test_nan_arithmetic(ctx_getter):
     def make_nan_contaminated_vector(size):
         shape = (size,)
-        a = numpy.random.randn(*shape).astype(numpy.float32)
+        a = np.random.randn(*shape).astype(np.float32)
         #for i in range(0, shape[0], 3):
             #a[i] = float('nan')
         from random import randrange
@@ -290,7 +290,7 @@ def test_nan_arithmetic(ctx_getter):
     ab_gpu = (a_gpu*b_gpu).get()
     for i in range(size):
-        assert numpy.isnan(ab[i]) == numpy.isnan(ab_gpu[i])
+        assert np.isnan(ab[i]) == np.isnan(ab_gpu[i])
@@ -302,8 +302,8 @@ def test_elwise_kernel(ctx_getter):
     from pyopencl.clrandom import rand as clrand
-    a_gpu = clrand(context, queue, (50,), numpy.float32)
-    b_gpu = clrand(context, queue, (50,), numpy.float32)
+    a_gpu = clrand(context, queue, (50,), np.float32)
+    b_gpu = clrand(context, queue, (50,), np.float32)
     from pyopencl.elementwise import ElementwiseKernel
     lin_comb = ElementwiseKernel(context,
@@ -324,8 +324,8 @@ def test_take(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
-    idx = cl_array.arange(queue, 0, 200000, 2, dtype=numpy.uint32)
-    a = cl_array.arange(queue, 0, 600000, 3, dtype=numpy.float32)
+    idx = cl_array.arange(queue, 0, 200000, 2, dtype=np.uint32)
+    a = cl_array.arange(queue, 0, 600000, 3, dtype=np.float32)
     result = cl_array.take(a, idx)
     assert ((3*idx).get() == result.get()).all()
@@ -338,8 +338,8 @@ def test_arange(ctx_getter):
     queue = cl.CommandQueue(context)
     n = 5000
-    a = cl_array.arange(queue, n, dtype=numpy.float32)
-    assert (numpy.arange(n, dtype=numpy.float32) == a.get()).all()
+    a = cl_array.arange(queue, n, dtype=np.float32)
+    assert (np.arange(n, dtype=np.float32) == a.get()).all()
@@ -350,7 +350,7 @@ def test_reverse(ctx_getter):
     queue = cl.CommandQueue(context)
     n = 5000
-    a = numpy.arange(n).astype(numpy.float32)
+    a = np.arange(n).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
     a_gpu = a_gpu.reverse()
@@ -367,10 +367,10 @@ def test_sum(ctx_getter):
     from pyopencl.clrandom import rand as clrand
-    a_gpu = clrand(context, queue, (200000,), numpy.float32)
+    a_gpu = clrand(context, queue, (200000,), np.float32)
     a = a_gpu.get()
-    sum_a = numpy.sum(a)
+    sum_a = np.sum(a)
     sum_a_gpu = cl_array.sum(a_gpu).get()
     assert abs(sum_a_gpu-sum_a)/abs(sum_a) < 1e-4
@@ -386,16 +386,16 @@ def test_minmax(ctx_getter):
     from pyopencl.clrandom import rand as clrand
     if has_double_support(context.devices[0]):
-        dtypes = [numpy.float64, numpy.float32, numpy.int32]
+        dtypes = [np.float64, np.float32, np.int32]
-        dtypes = [numpy.float32, numpy.int32]
+        dtypes = [np.float32, np.int32]
     for what in ["min", "max"]:
         for dtype in dtypes:
             a_gpu = clrand(context, queue, (200000,), dtype)
             a = a_gpu.get()
-            op_a = getattr(numpy, what)(a)
+            op_a = getattr(np, what)(a)
             op_a_gpu = getattr(cl_array, what)(a_gpu).get()
             assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)
@@ -415,16 +415,16 @@ def test_subset_minmax(ctx_getter):
     l_m = l_a - l_a // gran + 1
     if has_double_support(context.devices[0]):
-        dtypes = [numpy.float64, numpy.float32, numpy.int32]
+        dtypes = [np.float64, np.float32, np.int32]
-        dtypes = [numpy.float32, numpy.int32]
+        dtypes = [np.float32, np.int32]
     for dtype in dtypes:
         a_gpu = clrand(context, queue, (l_a,), dtype)
         a = a_gpu.get()
         meaningful_indices_gpu = cl_array.zeros(
-                queue, l_m, dtype=numpy.int32)
+                queue, l_m, dtype=np.int32)
         meaningful_indices = meaningful_indices_gpu.get()
         j = 0
         for i in range(len(meaningful_indices)):
@@ -437,7 +437,7 @@ def test_subset_minmax(ctx_getter):
                 queue, meaningful_indices)
         b = a[meaningful_indices]
-        min_a = numpy.min(b)
+        min_a = np.min(b)
         min_a_gpu = cl_array.subset_min(meaningful_indices_gpu, a_gpu).get()
         assert min_a_gpu == min_a
@@ -451,12 +451,12 @@ def test_dot(ctx_getter):
     queue = cl.CommandQueue(context)
     from pyopencl.clrandom import rand as clrand
-    a_gpu = clrand(context, queue, (200000,), numpy.float32)
+    a_gpu = clrand(context, queue, (200000,), np.float32)
     a = a_gpu.get()
-    b_gpu = clrand(context, queue, (200000,), numpy.float32)
+    b_gpu = clrand(context, queue, (200000,), np.float32)
     b = b_gpu.get()
-    dot_ab = numpy.dot(a, b)
+    dot_ab = np.dot(a, b)
     dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()
@@ -492,8 +492,8 @@ def test_if_positive(ctx_getter):
     from pyopencl.clrandom import rand as clrand
     l = 20000
-    a_gpu = clrand(context, queue, (l,), numpy.float32)
-    b_gpu = clrand(context, queue, (l,), numpy.float32)
+    a_gpu = clrand(context, queue, (l,), np.float32)
+    b_gpu = clrand(context, queue, (l,), np.float32)
     a = a_gpu.get()
     b = b_gpu.get()
@@ -501,10 +501,10 @@ def test_if_positive(ctx_getter):
     min_a_b_gpu = cl_array.minimum(a_gpu, b_gpu)
-    print(numpy.maximum(a, b))
+    print(np.maximum(a, b))
-    assert la.norm(max_a_b_gpu.get()- numpy.maximum(a, b)) == 0
-    assert la.norm(min_a_b_gpu.get()- numpy.minimum(a, b)) == 0
+    assert la.norm(max_a_b_gpu.get()- np.maximum(a, b)) == 0
+    assert la.norm(min_a_b_gpu.get()- np.minimum(a, b)) == 0
 def test_take_put(ctx_getter):
@@ -514,11 +514,11 @@ def test_take_put(ctx_getter):
     for n in [5, 17, 333]:
         one_field_size = 8
         buf_gpu = cl_array.zeros(queue,
-                n*one_field_size, dtype=numpy.float32)
+                n*one_field_size, dtype=np.float32)
         dest_indices = cl_array.to_device(queue,
-                numpy.array([ 0,  1,  2,  3, 32, 33, 34, 35], dtype=numpy.uint32))
+                np.array([ 0,  1,  2,  3, 32, 33, 34, 35], dtype=np.uint32))
         read_map = cl_array.to_device(queue,
-                numpy.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=numpy.uint32))
+                np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32))
                 arrays=[buf_gpu for i in range(n)],
@@ -537,20 +537,20 @@ def test_astype(ctx_getter):
     if not has_double_support(context.devices[0]):
-    a_gpu = clrand(context, queue, (2000,), dtype=numpy.float32)
+    a_gpu = clrand(context, queue, (2000,), dtype=np.float32)
-    a = a_gpu.get().astype(numpy.float64)
-    a2 = a_gpu.astype(numpy.float64).get()
+    a = a_gpu.get().astype(np.float64)
+    a2 = a_gpu.astype(np.float64).get()
-    assert a2.dtype == numpy.float64
+    assert a2.dtype == np.float64
     assert la.norm(a - a2) == 0, (a, a2)
-    a_gpu = clrand(context, queue, (2000,), dtype=numpy.float64)
+    a_gpu = clrand(context, queue, (2000,), dtype=np.float64)
-    a = a_gpu.get().astype(numpy.float32)
-    a2 = a_gpu.astype(numpy.float32).get()
+    a = a_gpu.get().astype(np.float32)
+    a2 = a_gpu.astype(np.float32).get()
-    assert a2.dtype == numpy.float32
+    assert a2.dtype == np.float32
     assert la.norm(a - a2)/la.norm(a) < 1e-7
diff --git a/test/test_clmath.py b/test/test_clmath.py
index 77ae77fb6372d239a752283245357d160b40dc92..b8f648de08e83a5711ed372f7ba6250a92176ec9 100644
--- a/test/test_clmath.py
+++ b/test/test_clmath.py
@@ -1,6 +1,6 @@
 from __future__ import division
 import math
-import numpy
+import numpy as np
 import pytools.test
 def have_cl():
@@ -54,21 +54,21 @@ def make_unary_function_test(name, limits=(0, 1), threshold=0):
         queue = cl.CommandQueue(context)
         gpu_func = getattr(clmath, name)
-        cpu_func = getattr(numpy, numpy_func_names.get(name, name))
+        cpu_func = getattr(np, numpy_func_names.get(name, name))
         if has_double_support(context.devices[0]):
-            dtypes = [numpy.float32, numpy.float64]
+            dtypes = [np.float32, np.float64]
-            dtypes = [numpy.float32]
+            dtypes = [np.float32]
         for s in sizes:
             for dtype in dtypes:
                 args = cl_array.arange(queue, a, b, (b-a)/s, 
-                        dtype=numpy.float32)
+                        dtype=np.float32)
                 gpu_results = gpu_func(args).get()
                 cpu_results = cpu_func(args.get())
-                max_err = numpy.max(numpy.abs(cpu_results - gpu_results))
+                max_err = np.max(np.abs(cpu_results - gpu_results))
                 assert (max_err <= threshold).all(), \
                         (max_err, name, dtype)
@@ -107,8 +107,8 @@ def test_fmod(ctx_getter):
     queue = cl.CommandQueue(context)
     for s in sizes:
-        a = cl_array.arange(queue, s, dtype=numpy.float32)/10
-        a2 = cl_array.arange(queue, s, dtype=numpy.float32)/45.2 + 0.1
+        a = cl_array.arange(queue, s, dtype=np.float32)/10
+        a2 = cl_array.arange(queue, s, dtype=np.float32)/45.2 + 0.1
         b = clmath.fmod(a, a2)
         a = a.get()
@@ -124,8 +124,8 @@ def test_ldexp(ctx_getter):
     queue = cl.CommandQueue(context)
     for s in sizes:
-        a = cl_array.arange(queue, s, dtype=numpy.float32)
-        a2 = cl_array.arange(queue, s, dtype=numpy.float32)*1e-3
+        a = cl_array.arange(queue, s, dtype=np.float32)
+        a2 = cl_array.arange(queue, s, dtype=np.float32)*1e-3
         b = clmath.ldexp(a,a2)
         a = a.get()
@@ -141,7 +141,7 @@ def test_modf(ctx_getter):
     queue = cl.CommandQueue(context)
     for s in sizes:
-        a = cl_array.arange(queue, s, dtype=numpy.float32)/10
+        a = cl_array.arange(queue, s, dtype=np.float32)/10
         fracpart, intpart = clmath.modf(a)
         a = a.get()
@@ -160,7 +160,7 @@ def test_frexp(ctx_getter):
     queue = cl.CommandQueue(context)
     for s in sizes:
-        a = cl_array.arange(queue, s, dtype=numpy.float32)/10
+        a = cl_array.arange(queue, s, dtype=np.float32)/10
         significands, exponents = clmath.frexp(a)
         a = a.get()
diff --git a/test/test_wrapper.py b/test/test_wrapper.py
index d312260a52049f5c2ca3e1f111d14807878a6ef9..3b334cb9f0ddfe927acfd695bb6309823c68649d 100644
--- a/test/test_wrapper.py
+++ b/test/test_wrapper.py
@@ -1,5 +1,5 @@
 from __future__ import division
-import numpy
+import numpy as np
 import numpy.linalg as la
 import pytools.test
@@ -17,6 +17,7 @@ def have_cl():
 if have_cl():
     import pyopencl as cl
+    import pyopencl.array as cl_array
     from pyopencl.tools import pytest_generate_tests_for_pyopencl \
             as pytest_generate_tests
@@ -203,7 +204,7 @@ class TestCL:
             { a[get_global_id(0)] *= (b+c); }
-        a = numpy.random.rand(50000)
+        a = np.random.rand(50000)
         queue = cl.CommandQueue(context)
         mf = cl.mem_flags
         a_buf = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)
@@ -220,9 +221,9 @@ class TestCL:
         except cl.LogicError:
-        prg.mult(queue, a.shape, None, a_buf, numpy.float32(2), numpy.int32(3))
+        prg.mult(queue, a.shape, None, a_buf, np.float32(2), np.int32(3))
-        a_result = numpy.empty_like(a)
+        a_result = np.empty_like(a)
         cl.enqueue_read_buffer(queue, a_buf, a_result).wait()
@@ -253,7 +254,7 @@ class TestCL:
-        a = numpy.random.rand(1024, 1024, 4).astype(numpy.float32)
+        a = np.random.rand(1024, 1024, 4).astype(np.float32)
         queue = cl.CommandQueue(context)
         mf = cl.mem_flags
         a_img = cl.Image(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
@@ -264,9 +265,9 @@ class TestCL:
         samp = cl.Sampler(context, False,
-        prg.copy_image(queue, a.shape, None, a_dest, a_img, samp, numpy.int32(a.shape[0]))
+        prg.copy_image(queue, a.shape, None, a_dest, a_img, samp, np.int32(a.shape[0]))
-        a_result = numpy.empty_like(a)
+        a_result = np.empty_like(a)
         cl.enqueue_read_buffer(queue, a_dest, a_result, is_blocking=True)
@@ -279,8 +280,8 @@ class TestCL:
         queue = cl.CommandQueue(context)
         mf = cl.mem_flags
-        a = numpy.random.rand(50000).astype(numpy.float32)
-        b = numpy.empty_like(a)
+        a = np.random.rand(50000).astype(np.float32)
+        b = np.empty_like(a)
         buf1 = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
         buf2 = cl.Buffer(context, mf.WRITE_ONLY, b.nbytes)
@@ -324,9 +325,26 @@ class TestCL:
             assert MemoryPool.bin_number(asize) == bin_nr, s
             assert asize < asize*(1+1/8)
+    @pytools.test.mark_test.opencl
+    def test_vector_args(self, ctx_getter):
+        """Pass a float4 by value to a kernel that copies it into every slot."""
+        ctx = ctx_getter()
+        cmd_queue = cl.CommandQueue(ctx)
+        program = cl.Program(ctx, """
+            __kernel void set_vec(float4 x, __global float4 *dest)
+            { dest[get_global_id(0)] = x; }
+            """).build()
+        vec_arg = cl_array.vec.make_float4(1,2,3,4)
+        out = np.empty(50000, cl_array.vec.float4)
+        buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=out)
+        program.set_vec(cmd_queue, out.shape, None, vec_arg, buf)
+        cl.enqueue_read_buffer(cmd_queue, buf, out).wait()
+        assert (out == vec_arg).all()