diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index c05cf74bc4fcc51448b6aee27648870e9cccb505..83f9d3a79fcdd25d3ac6d901ae5ab9b5729c3a4e 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -13,6 +13,7 @@ except ImportError:
 
 
 
+import numpy as np
 from pyopencl._cl import *
 import inspect as _inspect
 
@@ -225,8 +226,7 @@ def _add_functionality():
             if arg_dtype is None:
                 arg_type_chars.append(None)
             else:
-                import numpy
-                arg_type_chars.append(numpy.dtype(arg_dtype).char)
+                arg_type_chars.append(np.dtype(arg_dtype).char)
 
         self._arg_type_chars = arg_type_chars
 
diff --git a/pyopencl/array.py b/pyopencl/array.py
index 043c53c4f4259928c55780e94f08e93d930ab284..a88b8cce07cf35a9b51babcfec8781a438742331 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -30,7 +30,7 @@ OTHER DEALINGS IN THE SOFTWARE.
 
 
 
-import numpy
+import numpy as np
 import pyopencl.elementwise as elementwise
 import pyopencl as cl
 #from pytools import memoize_method
@@ -190,7 +190,7 @@ class Array(object):
         self.queue = queue
 
         self.shape = shape
-        self.dtype = numpy.dtype(dtype)
+        self.dtype = np.dtype(dtype)
         if order not in ["C", "F"]:
             raise ValueError("order must be either 'C' or 'F'")
         self.order = order
@@ -229,7 +229,7 @@ class Array(object):
 
     def get(self, queue=None, ary=None, async=False):
         if ary is None:
-            ary = numpy.empty(self.shape, self.dtype, order=self.order)
+            ary = np.empty(self.shape, self.dtype, order=self.order)
         else:
             if ary.size != self.size:
                 raise TypeError("'ary' has non-matching type")
@@ -639,7 +639,7 @@ def _arange(queue, *args, **kwargs):
     inf.step = None
     inf.dtype = None
 
-    if isinstance(args[-1], numpy.dtype):
+    if isinstance(args[-1], np.dtype):
         dtype = args[-1]
         args = args[:-1]
         explicit_dtype = True
@@ -676,10 +676,10 @@ def _arange(queue, *args, **kwargs):
     if inf.step is None:
         inf.step = 1
     if inf.dtype is None:
-        inf.dtype = numpy.array([inf.start, inf.stop, inf.step]).dtype
+        inf.dtype = np.array([inf.start, inf.stop, inf.step]).dtype
 
     # actual functionality ----------------------------------------------------
-    dtype = numpy.dtype(inf.dtype)
+    dtype = np.dtype(inf.dtype)
     start = dtype.type(inf.start)
     step = dtype.type(inf.step)
     stop = dtype.type(inf.stop)
diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py
index cdd53163e88745f8c8d858f357e78a3bef7c1310..c2cea99cfe41dff89009d538a17fdafbba9954ce 100644
--- a/pyopencl/clrandom.py
+++ b/pyopencl/clrandom.py
@@ -178,7 +178,7 @@ md5_code = """
   d += 0x10325476;
 """
 
-import numpy
+import numpy as np
 
 
 
@@ -186,7 +186,7 @@ import numpy
 @context_dependent_memoize
 def get_rand_kernel(context, dtype):
     from pyopencl.elementwise import get_elwise_kernel
-    if dtype == numpy.float32:
+    if dtype == np.float32:
         return get_elwise_kernel(context,
             "float *dest, unsigned int seed",
             md5_code + """
@@ -200,7 +200,7 @@ def get_rand_kernel(context, dtype):
                 dest[i] = d*POW_2_M32;
             """,
             "md5_rng_float")
-    elif dtype == numpy.float64:
+    elif dtype == np.float64:
         return get_elwise_kernel(context,
             "double *dest, unsigned int seed",
             md5_code + """
@@ -215,7 +215,7 @@ def get_rand_kernel(context, dtype):
             }
             """,
             "md5_rng_float")
-    elif dtype in [numpy.int32, numpy.uint32]:
+    elif dtype in [np.int32, np.uint32]:
         return get_elwise_kernel(context,
             "unsigned int *dest, unsigned int seed",
             md5_code + """
@@ -239,13 +239,13 @@ def _rand(output, seed):
     return get_rand_kernel(output.context, output.dtype)
 
 def fill_rand(result):
-    _rand(result, numpy.random.randint(2**31-1))
+    _rand(result, np.random.randint(2**31-1))
 
 def rand(context, queue, shape, dtype):
     from pyopencl.array import Array
 
     result = Array(queue, shape, dtype)
-    _rand(result, numpy.random.randint(2**31-1))
+    _rand(result, np.random.randint(2**31-1))
     return result
 
 
@@ -257,7 +257,7 @@ if __name__ == "__main__":
     if "generate" in sys.argv[1:]:
         N = 256
         print N, "MB"
-        r = rand(ctx, queue, (N*2**18,), numpy.uint32)
+        r = rand(ctx, queue, (N*2**18,), np.uint32)
         print "generated"
         r.get().tofile("random.dat")
         print "written"
@@ -265,9 +265,9 @@ if __name__ == "__main__":
     else:
         from pylab import plot, show, subplot
         N = 250
-        r1 = rand(ctx, queue, (N,), numpy.uint32)
-        r2 = rand(ctx, queue, (N,), numpy.int32)
-        r3 = rand(ctx, queue, (N,), numpy.float32)
+        r1 = rand(ctx, queue, (N,), np.uint32)
+        r2 = rand(ctx, queue, (N,), np.int32)
+        r3 = rand(ctx, queue, (N,), np.float32)
 
         subplot(131); plot( r1.get(),"x-")
         subplot(132); plot( r2.get(),"x-")
diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index 70ae34cd885d3974605d0e5dceae09ad56160ce7..f3994ed621d2fe282269a1ce16c026943f110097 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -31,7 +31,7 @@ OTHER DEALINGS IN THE SOFTWARE.
 
 
 from pyopencl.tools import context_dependent_memoize
-import numpy
+import numpy as np
 import pyopencl as cl
 from pyopencl.tools import dtype_to_ctype, VectorArg, ScalarArg
 
@@ -84,13 +84,13 @@ def get_elwise_kernel_and_types(context, arguments, operation,
         parsed_args = arguments
 
     for arg in parsed_args:
-        if numpy.float64  == arg.dtype:
+        if np.float64  == arg.dtype:
             preamble = (
                     "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n\n\n"
                     + preamble)
             break
 
-    parsed_args.append(ScalarArg(numpy.uintp, "n"))
+    parsed_args.append(ScalarArg(np.uintp, "n"))
 
     prg = get_elwise_program(context, parsed_args, operation, name,
             keep, options, preamble, **kwargs)
@@ -297,7 +297,7 @@ def get_linear_combination_kernel(summand_descriptors,
         summands.append("a%d*x%d[i]" % (i, i))
 
     args.append(VectorArg(dtype_z, "z"))
-    args.append(ScalarArg(numpy.uintp, "n"))
+    args.append(ScalarArg(np.uintp, "n"))
 
     mod = get_elwise_module(args,
             "z[i] = " + " + ".join(summands),
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 0bdfa292600a6f7fb6f824a293316d585ba8f309..eaf1ac1127e2eb63b3e26cf91405474cc0c84626 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -29,7 +29,7 @@ OTHER DEALINGS IN THE SOFTWARE.
 
 
 
-import numpy
+import numpy as np
 from decorator import decorator
 import pyopencl as cl
 
@@ -109,32 +109,32 @@ def dtype_to_ctype(dtype):
     if dtype is None:
         raise ValueError("dtype may not be None")
 
-    dtype = numpy.dtype(dtype)
-    if dtype == numpy.int64:
+    dtype = np.dtype(dtype)
+    if dtype == np.int64:
         return "long"
-    elif dtype == numpy.uint64:
+    elif dtype == np.uint64:
         return "unsigned long"
-    elif dtype == numpy.int32:
+    elif dtype == np.int32:
         return "int"
-    elif dtype == numpy.uint32:
+    elif dtype == np.uint32:
         return "unsigned int"
-    elif dtype == numpy.int16:
+    elif dtype == np.int16:
         return "short int"
-    elif dtype == numpy.uint16:
+    elif dtype == np.uint16:
         return "short unsigned int"
-    elif dtype == numpy.int8:
+    elif dtype == np.int8:
         return "signed char"
-    elif dtype == numpy.uint8:
+    elif dtype == np.uint8:
         return "unsigned char"
-    elif dtype == numpy.bool:
+    elif dtype == np.bool:
         return "bool"
-    elif dtype == numpy.float32:
+    elif dtype == np.float32:
         return "float"
-    elif dtype == numpy.float64:
+    elif dtype == np.float64:
         return "double"
-    elif dtype == numpy.complex64:
+    elif dtype == np.complex64:
         return "complex float"
-    elif dtype == numpy.complex128:
+    elif dtype == np.complex128:
         return "complex double"
     else:
         raise ValueError, "unable to map dtype '%s'" % dtype
@@ -147,7 +147,7 @@ def dtype_to_ctype(dtype):
 # {{{ C argument lists --------------------------------------------------------
 class Argument:
     def __init__(self, dtype, name, vector_len=1):
-        self.dtype = numpy.dtype(dtype)
+        self.dtype = np.dtype(dtype)
         self.name = name
         self.vector_len = vector_len
 
@@ -219,18 +219,18 @@ def parse_c_arg(c_arg):
     else:
         vector_len = 1
 
-    if tp == "float": dtype = numpy.float32
-    elif tp == "double": dtype = numpy.float64
-    elif tp in ["int", "signed int"]: dtype = numpy.int32
-    elif tp in ["unsigned", "unsigned int"]: dtype = numpy.uint32
-    elif tp in ["long", "long int"]: dtype = numpy.int64
+    if tp == "float": dtype = np.float32
+    elif tp == "double": dtype = np.float64
+    elif tp in ["int", "signed int"]: dtype = np.int32
+    elif tp in ["unsigned", "unsigned int"]: dtype = np.uint32
+    elif tp in ["long", "long int"]: dtype = np.int64
     elif tp in ["unsigned long", "unsigned long int"]:
-        dtype = numpy.uint64
-    elif tp in ["short", "short int"]: dtype = numpy.int16
-    elif tp in ["unsigned short", "unsigned short int"]: dtype = numpy.uint16
-    elif tp in ["char"]: dtype = numpy.int8
-    elif tp in ["unsigned char"]: dtype = numpy.uint8
-    elif tp in ["bool"]: dtype = numpy.bool
+        dtype = np.uint64
+    elif tp in ["short", "short int"]: dtype = np.int16
+    elif tp in ["unsigned short", "unsigned short int"]: dtype = np.uint16
+    elif tp in ["char"]: dtype = np.int8
+    elif tp in ["unsigned char"]: dtype = np.uint8
+    elif tp in ["bool"]: dtype = np.bool
     else: raise ValueError, "unknown type '%s'" % tp
 
     return arg_class(dtype, name, vector_len)
diff --git a/test/test_array.py b/test/test_array.py
index 863530c2bf213e9c585e084b2b295a3bd9abe87a..adabdcfc8780e655b83faafa14b233b409e673c4 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1,5 +1,5 @@
 #! /usr/bin/env python
-import numpy
+import numpy as np
 import numpy.linalg as la
 import sys
 import pytools.test
@@ -29,14 +29,14 @@ def test_pow_array(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-    a = numpy.array([1,2,3,4,5]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5]).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
 
     result = pow(a_gpu,a_gpu).get()
-    assert (numpy.abs(a**a - result) < 1e-3).all()
+    assert (np.abs(a**a - result) < 1e-3).all()
 
     result = (a_gpu**a_gpu).get()
-    assert (numpy.abs(pow(a, a) - result) < 1e-3).all()
+    assert (np.abs(pow(a, a) - result) < 1e-3).all()
 
 
 
@@ -46,11 +46,11 @@ def test_pow_number(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
 
     result = pow(a_gpu, 2).get()
-    assert (numpy.abs(a**2 - result) < 1e-3).all()
+    assert (np.abs(a**2 - result) < 1e-3).all()
 
 
 
@@ -59,7 +59,7 @@ def test_abs(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-    a = -cl_array.arange(queue, 111, dtype=numpy.float32)
+    a = -cl_array.arange(queue, 111, dtype=np.float32)
     res = a.get()
 
     for i in range(111):
@@ -79,7 +79,7 @@ def test_len(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
     a_cpu = cl_array.to_device(queue, a)
     assert len(a_cpu) == 10
 
@@ -96,11 +96,11 @@ def test_multiply(ctx_getter):
 
     for sz in [10, 50000]:
         for dtype, scalars in [
-            (numpy.float32, [2]),
-            #(numpy.complex64, [2, 2j])
+            (np.float32, [2]),
+            #(np.complex64, [2, 2j])
             ]:
             for scalar in scalars:
-                a = numpy.arange(sz).astype(dtype)
+                a = np.arange(sz).astype(dtype)
                 a_gpu = cl_array.to_device(queue, a)
                 a_doubled = (scalar * a_gpu).get()
 
@@ -113,7 +113,7 @@ def test_multiply_array(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
 
     a_gpu = cl_array.to_device(queue, a)
     b_gpu = cl_array.to_device(queue, a)
@@ -132,7 +132,7 @@ def test_addition_array(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
     a_added = (a_gpu+a_gpu).get()
 
@@ -148,7 +148,7 @@ def test_addition_scalar(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
     a_added = (7+a_gpu).get()
 
@@ -161,8 +161,8 @@ def test_addition_scalar(ctx_getter):
 def test_substract_array(ctx_getter):
     """Test the substraction of two arrays."""
     #test data
-    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
-    b = numpy.array([10,20,30,40,50,60,70,80,90,100]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
+    b = np.array([10,20,30,40,50,60,70,80,90,100]).astype(np.float32)
 
     context = ctx_getter()
     queue = cl.CommandQueue(context)
@@ -187,7 +187,7 @@ def test_substract_scalar(ctx_getter):
     queue = cl.CommandQueue(context)
 
     #test data
-    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
 
     #convert a to a gpu object
     a_gpu = cl_array.to_device(queue, a)
@@ -208,7 +208,7 @@ def test_divide_scalar(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-    a = numpy.array([1,2,3,4,5,6,7,8,9,10]).astype(numpy.float32)
+    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
 
     result = (a_gpu/2).get()
@@ -228,17 +228,17 @@ def test_divide_array(ctx_getter):
     queue = cl.CommandQueue(context)
 
     #test data
-    a = numpy.array([10,20,30,40,50,60,70,80,90,100]).astype(numpy.float32)
-    b = numpy.array([10,10,10,10,10,10,10,10,10,10]).astype(numpy.float32)
+    a = np.array([10,20,30,40,50,60,70,80,90,100]).astype(np.float32)
+    b = np.array([10,10,10,10,10,10,10,10,10,10]).astype(np.float32)
 
     a_gpu = cl_array.to_device(queue, a)
     b_gpu = cl_array.to_device(queue, b)
 
     a_divide = (a_gpu/b_gpu).get()
-    assert (numpy.abs(a/b - a_divide) < 1e-3).all()
+    assert (np.abs(a/b - a_divide) < 1e-3).all()
 
     a_divide = (b_gpu/a_gpu).get()
-    assert (numpy.abs(b/a - a_divide) < 1e-3).all()
+    assert (np.abs(b/a - a_divide) < 1e-3).all()
 
 
 
@@ -251,9 +251,9 @@ def test_random(ctx_getter):
     from pyopencl.clrandom import rand as clrand
 
     if has_double_support(context.devices[0]):
-        dtypes = [numpy.float32, numpy.float64]
+        dtypes = [np.float32, np.float64]
     else:
-        dtypes = [numpy.float32]
+        dtypes = [np.float32]
 
     for dtype in dtypes:
         a = clrand(context, queue, (10, 100), dtype=dtype).get()
@@ -271,7 +271,7 @@ def test_nan_arithmetic(ctx_getter):
 
     def make_nan_contaminated_vector(size):
         shape = (size,)
-        a = numpy.random.randn(*shape).astype(numpy.float32)
+        a = np.random.randn(*shape).astype(np.float32)
         #for i in range(0, shape[0], 3):
             #a[i] = float('nan')
         from random import randrange
@@ -290,7 +290,7 @@ def test_nan_arithmetic(ctx_getter):
     ab_gpu = (a_gpu*b_gpu).get()
 
     for i in range(size):
-        assert numpy.isnan(ab[i]) == numpy.isnan(ab_gpu[i])
+        assert np.isnan(ab[i]) == np.isnan(ab_gpu[i])
 
 
 
@@ -302,8 +302,8 @@ def test_elwise_kernel(ctx_getter):
 
     from pyopencl.clrandom import rand as clrand
 
-    a_gpu = clrand(context, queue, (50,), numpy.float32)
-    b_gpu = clrand(context, queue, (50,), numpy.float32)
+    a_gpu = clrand(context, queue, (50,), np.float32)
+    b_gpu = clrand(context, queue, (50,), np.float32)
 
     from pyopencl.elementwise import ElementwiseKernel
     lin_comb = ElementwiseKernel(context,
@@ -324,8 +324,8 @@ def test_take(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-    idx = cl_array.arange(queue, 0, 200000, 2, dtype=numpy.uint32)
-    a = cl_array.arange(queue, 0, 600000, 3, dtype=numpy.float32)
+    idx = cl_array.arange(queue, 0, 200000, 2, dtype=np.uint32)
+    a = cl_array.arange(queue, 0, 600000, 3, dtype=np.float32)
     result = cl_array.take(a, idx)
     assert ((3*idx).get() == result.get()).all()
 
@@ -338,8 +338,8 @@ def test_arange(ctx_getter):
     queue = cl.CommandQueue(context)
 
     n = 5000
-    a = cl_array.arange(queue, n, dtype=numpy.float32)
-    assert (numpy.arange(n, dtype=numpy.float32) == a.get()).all()
+    a = cl_array.arange(queue, n, dtype=np.float32)
+    assert (np.arange(n, dtype=np.float32) == a.get()).all()
 
 
 
@@ -350,7 +350,7 @@ def test_reverse(ctx_getter):
     queue = cl.CommandQueue(context)
 
     n = 5000
-    a = numpy.arange(n).astype(numpy.float32)
+    a = np.arange(n).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
 
     a_gpu = a_gpu.reverse()
@@ -367,10 +367,10 @@ def test_sum(ctx_getter):
 
     from pyopencl.clrandom import rand as clrand
 
-    a_gpu = clrand(context, queue, (200000,), numpy.float32)
+    a_gpu = clrand(context, queue, (200000,), np.float32)
     a = a_gpu.get()
 
-    sum_a = numpy.sum(a)
+    sum_a = np.sum(a)
     sum_a_gpu = cl_array.sum(a_gpu).get()
 
     assert abs(sum_a_gpu-sum_a)/abs(sum_a) < 1e-4
@@ -386,16 +386,16 @@ def test_minmax(ctx_getter):
     from pyopencl.clrandom import rand as clrand
 
     if has_double_support(context.devices[0]):
-        dtypes = [numpy.float64, numpy.float32, numpy.int32]
+        dtypes = [np.float64, np.float32, np.int32]
     else:
-        dtypes = [numpy.float32, numpy.int32]
+        dtypes = [np.float32, np.int32]
 
     for what in ["min", "max"]:
         for dtype in dtypes:
             a_gpu = clrand(context, queue, (200000,), dtype)
             a = a_gpu.get()
 
-            op_a = getattr(numpy, what)(a)
+            op_a = getattr(np, what)(a)
             op_a_gpu = getattr(cl_array, what)(a_gpu).get()
 
             assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)
@@ -415,16 +415,16 @@ def test_subset_minmax(ctx_getter):
     l_m = l_a - l_a // gran + 1
 
     if has_double_support(context.devices[0]):
-        dtypes = [numpy.float64, numpy.float32, numpy.int32]
+        dtypes = [np.float64, np.float32, np.int32]
     else:
-        dtypes = [numpy.float32, numpy.int32]
+        dtypes = [np.float32, np.int32]
 
     for dtype in dtypes:
         a_gpu = clrand(context, queue, (l_a,), dtype)
         a = a_gpu.get()
 
         meaningful_indices_gpu = cl_array.zeros(
-                queue, l_m, dtype=numpy.int32)
+                queue, l_m, dtype=np.int32)
         meaningful_indices = meaningful_indices_gpu.get()
         j = 0
         for i in range(len(meaningful_indices)):
@@ -437,7 +437,7 @@ def test_subset_minmax(ctx_getter):
                 queue, meaningful_indices)
         b = a[meaningful_indices]
 
-        min_a = numpy.min(b)
+        min_a = np.min(b)
         min_a_gpu = cl_array.subset_min(meaningful_indices_gpu, a_gpu).get()
 
         assert min_a_gpu == min_a
@@ -451,12 +451,12 @@ def test_dot(ctx_getter):
     queue = cl.CommandQueue(context)
 
     from pyopencl.clrandom import rand as clrand
-    a_gpu = clrand(context, queue, (200000,), numpy.float32)
+    a_gpu = clrand(context, queue, (200000,), np.float32)
     a = a_gpu.get()
-    b_gpu = clrand(context, queue, (200000,), numpy.float32)
+    b_gpu = clrand(context, queue, (200000,), np.float32)
     b = b_gpu.get()
 
-    dot_ab = numpy.dot(a, b)
+    dot_ab = np.dot(a, b)
 
     dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()
 
@@ -492,8 +492,8 @@ def test_if_positive(ctx_getter):
     from pyopencl.clrandom import rand as clrand
 
     l = 20000
-    a_gpu = clrand(context, queue, (l,), numpy.float32)
-    b_gpu = clrand(context, queue, (l,), numpy.float32)
+    a_gpu = clrand(context, queue, (l,), np.float32)
+    b_gpu = clrand(context, queue, (l,), np.float32)
     a = a_gpu.get()
     b = b_gpu.get()
 
@@ -501,10 +501,10 @@ def test_if_positive(ctx_getter):
     min_a_b_gpu = cl_array.minimum(a_gpu, b_gpu)
 
     print(max_a_b_gpu)
-    print(numpy.maximum(a, b))
+    print(np.maximum(a, b))
 
-    assert la.norm(max_a_b_gpu.get()- numpy.maximum(a, b)) == 0
-    assert la.norm(min_a_b_gpu.get()- numpy.minimum(a, b)) == 0
+    assert la.norm(max_a_b_gpu.get()- np.maximum(a, b)) == 0
+    assert la.norm(min_a_b_gpu.get()- np.minimum(a, b)) == 0
 
 @pytools.test.mark_test.opencl
 def test_take_put(ctx_getter):
@@ -514,11 +514,11 @@ def test_take_put(ctx_getter):
     for n in [5, 17, 333]:
         one_field_size = 8
         buf_gpu = cl_array.zeros(queue,
-                n*one_field_size, dtype=numpy.float32)
+                n*one_field_size, dtype=np.float32)
         dest_indices = cl_array.to_device(queue,
-                numpy.array([ 0,  1,  2,  3, 32, 33, 34, 35], dtype=numpy.uint32))
+                np.array([ 0,  1,  2,  3, 32, 33, 34, 35], dtype=np.uint32))
         read_map = cl_array.to_device(queue,
-                numpy.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=numpy.uint32))
+                np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32))
 
         cl_array.multi_take_put(
                 arrays=[buf_gpu for i in range(n)],
@@ -537,20 +537,20 @@ def test_astype(ctx_getter):
     if not has_double_support(context.devices[0]):
         return
 
-    a_gpu = clrand(context, queue, (2000,), dtype=numpy.float32)
+    a_gpu = clrand(context, queue, (2000,), dtype=np.float32)
 
-    a = a_gpu.get().astype(numpy.float64)
-    a2 = a_gpu.astype(numpy.float64).get()
+    a = a_gpu.get().astype(np.float64)
+    a2 = a_gpu.astype(np.float64).get()
 
-    assert a2.dtype == numpy.float64
+    assert a2.dtype == np.float64
     assert la.norm(a - a2) == 0, (a, a2)
 
-    a_gpu = clrand(context, queue, (2000,), dtype=numpy.float64)
+    a_gpu = clrand(context, queue, (2000,), dtype=np.float64)
 
-    a = a_gpu.get().astype(numpy.float32)
-    a2 = a_gpu.astype(numpy.float32).get()
+    a = a_gpu.get().astype(np.float32)
+    a2 = a_gpu.astype(np.float32).get()
 
-    assert a2.dtype == numpy.float32
+    assert a2.dtype == np.float32
     assert la.norm(a - a2)/la.norm(a) < 1e-7
 
 
diff --git a/test/test_clmath.py b/test/test_clmath.py
index 77ae77fb6372d239a752283245357d160b40dc92..b8f648de08e83a5711ed372f7ba6250a92176ec9 100644
--- a/test/test_clmath.py
+++ b/test/test_clmath.py
@@ -1,6 +1,6 @@
 from __future__ import division
 import math
-import numpy
+import numpy as np
 import pytools.test
 
 def have_cl():
@@ -54,21 +54,21 @@ def make_unary_function_test(name, limits=(0, 1), threshold=0):
         queue = cl.CommandQueue(context)
 
         gpu_func = getattr(clmath, name)
-        cpu_func = getattr(numpy, numpy_func_names.get(name, name))
+        cpu_func = getattr(np, numpy_func_names.get(name, name))
 
         if has_double_support(context.devices[0]):
-            dtypes = [numpy.float32, numpy.float64]
+            dtypes = [np.float32, np.float64]
         else:
-            dtypes = [numpy.float32]
+            dtypes = [np.float32]
 
         for s in sizes:
             for dtype in dtypes:
                 args = cl_array.arange(queue, a, b, (b-a)/s, 
-                        dtype=numpy.float32)
+                        dtype=np.float32)
                 gpu_results = gpu_func(args).get()
                 cpu_results = cpu_func(args.get())
 
-                max_err = numpy.max(numpy.abs(cpu_results - gpu_results))
+                max_err = np.max(np.abs(cpu_results - gpu_results))
                 assert (max_err <= threshold).all(), \
                         (max_err, name, dtype)
 
@@ -107,8 +107,8 @@ def test_fmod(ctx_getter):
     queue = cl.CommandQueue(context)
 
     for s in sizes:
-        a = cl_array.arange(queue, s, dtype=numpy.float32)/10
-        a2 = cl_array.arange(queue, s, dtype=numpy.float32)/45.2 + 0.1
+        a = cl_array.arange(queue, s, dtype=np.float32)/10
+        a2 = cl_array.arange(queue, s, dtype=np.float32)/45.2 + 0.1
         b = clmath.fmod(a, a2)
 
         a = a.get()
@@ -124,8 +124,8 @@ def test_ldexp(ctx_getter):
     queue = cl.CommandQueue(context)
 
     for s in sizes:
-        a = cl_array.arange(queue, s, dtype=numpy.float32)
-        a2 = cl_array.arange(queue, s, dtype=numpy.float32)*1e-3
+        a = cl_array.arange(queue, s, dtype=np.float32)
+        a2 = cl_array.arange(queue, s, dtype=np.float32)*1e-3
         b = clmath.ldexp(a,a2)
 
         a = a.get()
@@ -141,7 +141,7 @@ def test_modf(ctx_getter):
     queue = cl.CommandQueue(context)
 
     for s in sizes:
-        a = cl_array.arange(queue, s, dtype=numpy.float32)/10
+        a = cl_array.arange(queue, s, dtype=np.float32)/10
         fracpart, intpart = clmath.modf(a)
 
         a = a.get()
@@ -160,7 +160,7 @@ def test_frexp(ctx_getter):
     queue = cl.CommandQueue(context)
 
     for s in sizes:
-        a = cl_array.arange(queue, s, dtype=numpy.float32)/10
+        a = cl_array.arange(queue, s, dtype=np.float32)/10
         significands, exponents = clmath.frexp(a)
 
         a = a.get()
diff --git a/test/test_wrapper.py b/test/test_wrapper.py
index d312260a52049f5c2ca3e1f111d14807878a6ef9..99c75b8c9decb9ebe29c1274d2606473e81e6835 100644
--- a/test/test_wrapper.py
+++ b/test/test_wrapper.py
@@ -1,5 +1,5 @@
 from __future__ import division
-import numpy
+import numpy as np
 import numpy.linalg as la
 import pytools.test
 
@@ -203,7 +203,7 @@ class TestCL:
             { a[get_global_id(0)] *= (b+c); }
             """).build()
 
-        a = numpy.random.rand(50000)
+        a = np.random.rand(50000)
         queue = cl.CommandQueue(context)
         mf = cl.mem_flags
         a_buf = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)
@@ -220,9 +220,9 @@ class TestCL:
         except cl.LogicError:
             pass
 
-        prg.mult(queue, a.shape, None, a_buf, numpy.float32(2), numpy.int32(3))
+        prg.mult(queue, a.shape, None, a_buf, np.float32(2), np.int32(3))
 
-        a_result = numpy.empty_like(a)
+        a_result = np.empty_like(a)
         cl.enqueue_read_buffer(queue, a_buf, a_result).wait()
 
     @pytools.test.mark_test.opencl
@@ -253,7 +253,7 @@ class TestCL:
             }
             """).build()
 
-        a = numpy.random.rand(1024, 1024, 4).astype(numpy.float32)
+        a = np.random.rand(1024, 1024, 4).astype(np.float32)
         queue = cl.CommandQueue(context)
         mf = cl.mem_flags
         a_img = cl.Image(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
@@ -264,9 +264,9 @@ class TestCL:
         samp = cl.Sampler(context, False,
                 cl.addressing_mode.CLAMP,
                 cl.filter_mode.NEAREST)
-        prg.copy_image(queue, a.shape, None, a_dest, a_img, samp, numpy.int32(a.shape[0]))
+        prg.copy_image(queue, a.shape, None, a_dest, a_img, samp, np.int32(a.shape[0]))
 
-        a_result = numpy.empty_like(a)
+        a_result = np.empty_like(a)
         cl.enqueue_read_buffer(queue, a_dest, a_result, is_blocking=True)
         print(a_result.dtype)
 
@@ -279,8 +279,8 @@ class TestCL:
         queue = cl.CommandQueue(context)
         mf = cl.mem_flags
 
-        a = numpy.random.rand(50000).astype(numpy.float32)
-        b = numpy.empty_like(a)
+        a = np.random.rand(50000).astype(np.float32)
+        b = np.empty_like(a)
 
         buf1 = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
         buf2 = cl.Buffer(context, mf.WRITE_ONLY, b.nbytes)