diff --git a/.gitignore b/.gitignore
index 2ed0d45a708d9cc002238f4da0be8e79e36b6a02..9594664744138d82b3369eb1ccc5246b956e862c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,3 +45,4 @@ tmp
 temp*
 setuptools.pth
 distribute-*.tar.gz
+core
diff --git a/examples/benchmark-all.py b/examples/benchmark-all.py
index 0d08979b76e89ee1b75e8b3a07d93e86beab23f0..09c423c68dbc3fda1ee871b365969f0cb89af858 100644
--- a/examples/benchmark-all.py
+++ b/examples/benchmark-all.py
@@ -18,22 +18,22 @@ for i in range(1000):
                 c_result[i] = c_result[i] * (a[i] + b[i])
                 c_result[i] = c_result[i] * (a[i] / 2.0)
 time2 = time()
-print "Execution time of test without OpenCL: ", time2 - time1, "s"
+print("Execution time of test without OpenCL: ", time2 - time1, "s")
 
 
 for platform in cl.get_platforms():
     for device in platform.get_devices():
-        print "==============================================================="
-        print "Platform name:", platform.name
-        print "Platform profile:", platform.profile
-        print "Platform vendor:", platform.vendor
-        print "Platform version:", platform.version
-        print "---------------------------------------------------------------"
-        print "Device name:", device.name
-        print "Device type:", cl.device_type.to_string(device.type)
-        print "Device memory: ", device.global_mem_size//1024//1024, 'MB'
-        print "Device max clock speed:", device.max_clock_frequency, 'MHz'
-        print "Device compute units:", device.max_compute_units
+        print("===============================================================")
+        print("Platform name:", platform.name)
+        print("Platform profile:", platform.profile)
+        print("Platform vendor:", platform.vendor)
+        print("Platform version:", platform.version)
+        print("---------------------------------------------------------------")
+        print("Device name:", device.name)
+        print("Device type:", cl.device_type.to_string(device.type))
+        print("Device memory: ", device.global_mem_size//1024//1024, 'MB')
+        print("Device max clock speed:", device.max_clock_frequency, 'MHz')
+        print("Device compute units:", device.max_compute_units)
 
         # Simnple speed test
         ctx = cl.Context([device])
@@ -64,7 +64,7 @@ for platform in cl.get_platforms():
         exec_evt.wait()
         elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start)
 
-        print "Execution time of test: %g s" % elapsed
+        print("Execution time of test: %g s" % elapsed)
 
         c = numpy.empty_like(a)
         cl.enqueue_read_buffer(queue, dest_buf, c).wait()
@@ -73,6 +73,6 @@ for platform in cl.get_platforms():
                 if c[i] != c_result[i]:
                         error = 1
         if error:
-                print "Results doesn't match!!"
+                print("Results doesn't match!!")
         else:
-                print "Results OK"
+                print("Results OK")
diff --git a/examples/demo.py b/examples/demo.py
index 44bc0a58e24142f6263c70c5e77849960d9c4da2..98fb46f8b49e6ebca02c9481e186127045e25b9a 100644
--- a/examples/demo.py
+++ b/examples/demo.py
@@ -27,4 +27,4 @@ prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)
 a_plus_b = numpy.empty_like(a)
 cl.enqueue_read_buffer(queue, dest_buf, a_plus_b).wait()
 
-print la.norm(a_plus_b - (a+b)), la.norm(a_plus_b)
+print(la.norm(a_plus_b - (a+b)), la.norm(a_plus_b))
diff --git a/examples/demo_mandelbrot.py b/examples/demo_mandelbrot.py
index 4919dbb086c8bd2fd1f8ff431b420e90ecf1286e..a9b3d0d88293d8b35cd05f7ea4f035af8a9c53ff 100644
--- a/examples/demo_mandelbrot.py
+++ b/examples/demo_mandelbrot.py
@@ -4,7 +4,7 @@
 # I adapted it for PyOpenCL. Hopefully it is useful to someone.
 # July 2010, HolgerRapp@gmx.net
 #
-# Original readme below these lines. 
+# Original readme below these lines.
 
 # Mandelbrot calculate using GPU, Serial numpy and faster numpy
 # Use to show the speed difference between CPU and GPU calculations
@@ -134,7 +134,7 @@ if __name__ == '__main__':
             end_main = time.time()
 
             secs = end_main - start_main
-            print "Main took", secs
+            print("Main took", secs)
 
             self.mandel = (output.reshape((h,w)) /
                     float(output.max()) * 255.).astype(np.uint8)
diff --git a/examples/dump-properties.py b/examples/dump-properties.py
index 3f3f94dd47d0c3baeada520db85dafb84f897e4a..d8e5636f61d6402da5c1cc45aad054d623078501 100644
--- a/examples/dump-properties.py
+++ b/examples/dump-properties.py
@@ -9,16 +9,16 @@ def print_info(obj, info_cls):
             except:
                 info_value = "<error>"
 
-            print "%s: %s" % (info_name, info_value)
+            print("%s: %s" % (info_name, info_value))
 
 for platform in cl.get_platforms():
-    print 75*"="
-    print platform
-    print 75*"="
+    print(75*"=")
+    print(platform)
+    print(75*"=")
     print_info(platform, cl.platform_info)
 
     for device in platform.get_devices():
-        print 75*"-"
-        print device
-        print 75*"-"
+        print(75*"-")
+        print(device)
+        print(75*"-")
         print_info(device, cl.device_info)
diff --git a/examples/narray.py b/examples/narray.py
index 17450d73c0977f597c59947ccfb70c9f5b3a79ff..40ba945042b8d6337d7d4139deb1991d20532d81 100644
--- a/examples/narray.py
+++ b/examples/narray.py
@@ -24,13 +24,13 @@ __kernel void demo(__global uint *demo)
 try:
     prg.build()
 except:
-    print "Error:"
-    print prg.get_build_info(ctx.devices[0], cl.program_build_info.LOG)
+    print("Error:")
+    print(prg.get_build_info(ctx.devices[0], cl.program_build_info.LOG))
     raise
 
 prg.demo(queue, (500,), None, demo_buf)
 cl.enqueue_read_buffer(queue, demo_buf, demo_r).wait()
 
 for res in demo_r:
-    print res
+    print(res)
 
diff --git a/examples/transpose.py b/examples/transpose.py
index 7c571fdf07d6d6a27cc4d49afdaef81b5b5f2c85..21cc8cd8037bdc5b48ebbb166ddbfc24d52b05ca 100644
--- a/examples/transpose.py
+++ b/examples/transpose.py
@@ -1,7 +1,7 @@
 # Transposition of a matrix
 # originally for PyCUDA by Hendrik Riedmann <riedmann@dam.brown.edu>
 
-from __future__ import division
+
 
 import pyopencl as cl
 import numpy
@@ -117,7 +117,7 @@ def transpose_using_cl(ctx, queue, cpu_src, cls):
 
 def check_transpose():
     for cls in [NaiveTranspose, SillyTranspose, TransposeWithLocal]:
-        print "checking", cls.__name__
+        print("checking", cls.__name__)
         ctx = cl.create_some_context()
 
         for dev in ctx.devices:
@@ -127,7 +127,7 @@ def check_transpose():
 
         for i in numpy.arange(10, 13, 0.125):
             size = int(((2**i) // 32) * 32)
-            print size
+            print(size)
 
             source = numpy.random.rand(size, size).astype(numpy.float32)
             result = transpose_using_cl(ctx, queue, source, NaiveTranspose)
@@ -182,7 +182,7 @@ def benchmark_transpose():
             time = sum(evt.profile.end - evt.profile.start for evt in events)
 
             mem_bw = 2*source.nbytes*count/(time*1e-9)
-            print "benchmarking", name, size, mem_bw/1e9, "GB/s"
+            print("benchmarking", name, size, mem_bw/1e9, "GB/s")
             meth_mem_bws.append(mem_bw)
 
             a_buf.release()
diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 0108257ceb3ab0b94b0f523da0f379bf19f1e3f2..607b8696237460a03566c49463a3042cd2de7fe1 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -183,8 +183,7 @@ def _add_functionality():
                     "size as the third positional argument instead.",
                     DeprecationWarning, stacklevel=2)
 
-        from types import NoneType
-        if isinstance(args[0], (NoneType, tuple)) and not had_local_size:
+        if isinstance(args[0], (type(None), tuple)) and not had_local_size:
             local_size = args[0]
             args = args[1:]
         elif not had_local_size:
diff --git a/pyopencl/array.py b/pyopencl/array.py
index a75430518db373dec74d22a90bd0dc923fd712cb..8dd2a29275483523b695a8a6b4e4aa585ef50dfc 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -425,7 +425,7 @@ class Array(object):
 
         return result
 
-    __rtruediv__ = __div__
+    __rtruediv__ = __rdiv__
 
     def fill(self, value, queue=None):
         """fills the array with the specified value"""
diff --git a/setup.py b/setup.py
index 9d88540dc1c80bb643599d3144187ae27b73492c..9c54f4a19c062940291ec768f19d600f20bed04c 100644
--- a/setup.py
+++ b/setup.py
@@ -70,7 +70,20 @@ def main():
         EXTRA_DEFINES["HAVE_GL"] = 1
 
     ver_dic = {}
-    execfile("pyopencl/version.py", ver_dic)
+    # execfile() does not exist on Python 3: read the version file ourselves
+    # and close the handle explicitly instead of leaking it until GC.
+    version_file = open("pyopencl/version.py")
+    try:
+        version_source = version_file.read()
+    finally:
+        version_file.close()
+    exec(compile(version_source, "pyopencl/version.py", 'exec'), ver_dic)
+
+    try:
+        from distutils.command.build_py import build_py_2to3 as build_py
+    except ImportError:
+        # Python 2.x: build_py_2to3 is unavailable, use the plain build_py
+        from distutils.command.build_py import build_py
 
     setup(name="pyopencl",
             # metadata
@@ -107,7 +120,7 @@ def main():
             * Broad support. PyOpenCL was tested and works with Apple's, AMD's, and Nvidia's 
               CL implementations.
             """,
-            author=u"Andreas Kloeckner",
+            author="Andreas Kloeckner",
             author_email="inform@tiker.net",
             license = "MIT",
             url="http://mathema.tician.de/software/pyopencl",
@@ -121,6 +134,7 @@ def main():
               'Natural Language :: English',
               'Programming Language :: C++',
               'Programming Language :: Python',
+              'Programming Language :: Python :: 3',
               'Topic :: Scientific/Engineering',
               'Topic :: Scientific/Engineering :: Mathematics',
               'Topic :: Scientific/Engineering :: Physics',
@@ -147,11 +161,14 @@ def main():
                     include_dirs=INCLUDE_DIRS + EXTRA_INCLUDE_DIRS,
                     library_dirs=LIBRARY_DIRS + conf["CL_LIB_DIR"],
                     libraries=LIBRARIES + conf["CL_LIBNAME"],
-                    define_macros=list(EXTRA_DEFINES.iteritems()),
+                    define_macros=list(EXTRA_DEFINES.items()),
                     extra_compile_args=conf["CXXFLAGS"],
                     extra_link_args=conf["LDFLAGS"],
                     ),
-                ])
+                ],
+
+            # build_py imported above runs 2to3 over the sources when available
+            cmdclass={'build_py': build_py})
 
 
 
diff --git a/src/wrapper/numpy_init.hpp b/src/wrapper/numpy_init.hpp
index 03e182c493b4c7ad53923e94287678f1ab3a4280..3e8a99828f4c25b378d3a3e45899b1c1b99675e7 100644
--- a/src/wrapper/numpy_init.hpp
+++ b/src/wrapper/numpy_init.hpp
@@ -1,4 +1,4 @@
-#ifndef _FAYHVVAAA_PYCUDA_HEADER_SEEN_NUMPY_INIT_HPP
+#ifndef _FAYHVVAAA_PYOPENCL_HEADER_SEEN_NUMPY_INIT_HPP
 
 
 
@@ -8,12 +8,25 @@
 
 
 
-namespace 
+namespace
 {
-  static struct array_importer
+  static struct pyublas_array_importer
   {
-    array_importer()
-    { import_array(); }
+    static bool do_import_array()
+    {
+#if PY_VERSION_HEX >= 0x03000000
+      import_array1(false);
+#else
+      import_array();
+#endif
+      return true;
+    }
+
+    pyublas_array_importer()
+    {
+      if (!do_import_array())
+        throw std::runtime_error("numpy failed to initialize");
+    }
   } _array_importer;
 }
 
diff --git a/test/test_array.py b/test/test_array.py
index 1fe64d6b7a353d32142399a42eeaff3fc54b4f8a..88863a6e124bcd1ba039a2e7ad87bb0f66bf429d 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -492,8 +492,8 @@ def test_if_positive(ctx_getter):
     max_a_b_gpu = cl_array.maximum(a_gpu, b_gpu)
     min_a_b_gpu = cl_array.minimum(a_gpu, b_gpu)
 
-    print max_a_b_gpu
-    print numpy.maximum(a, b)
+    print(max_a_b_gpu)
+    print(numpy.maximum(a, b))
 
     assert la.norm(max_a_b_gpu.get()- numpy.maximum(a, b)) == 0
     assert la.norm(min_a_b_gpu.get()- numpy.minimum(a, b)) == 0
@@ -554,7 +554,7 @@ if __name__ == "__main__":
 
     import sys
     if len(sys.argv) > 1:
-        exec sys.argv[1]
+        exec(sys.argv[1])
     else:
         from py.test.cmdline import main
         main([__file__])
diff --git a/test/test_clmath.py b/test/test_clmath.py
index 58b6a09bb958f9823a7e75b45198490e1ea7bcaf..085cec9a70cffc83398bc83041886d8fe107bc10 100644
--- a/test/test_clmath.py
+++ b/test/test_clmath.py
@@ -1,4 +1,4 @@
-from __future__ import division
+
 import math
 import numpy
 import pytools.test
@@ -44,7 +44,8 @@ numpy_func_names = {
 
 
 
-def make_unary_function_test(name, (a, b)=(0, 1), threshold=0):
+def make_unary_function_test(name, limits=(0, 1), threshold=0):
+    (a, b) = limits
     def test(ctx_getter):
         context = ctx_getter()
         queue = cl.CommandQueue(context)
@@ -176,7 +177,7 @@ if __name__ == "__main__":
     # make sure that import failures get reported, instead of skipping the tests.
     import sys
     if len(sys.argv) > 1:
-        exec sys.argv[1]
+        exec(sys.argv[1])
     else:
         from py.test.cmdline import main
         main([__file__])
diff --git a/test/test_wrapper.py b/test/test_wrapper.py
index d55c4d1825aaf5e0981560ee1e4a3f3f0215e2bb..ddd7c0e149b54d74887e5a9b86ed18d38d271356 100644
--- a/test/test_wrapper.py
+++ b/test/test_wrapper.py
@@ -64,8 +64,8 @@ class TestCL:
                     info = getattr(info_cls, info_name)
 
                     if find_quirk(CRASH_QUIRKS, cl_obj, info):
-                        print "not executing get_info", type(cl_obj), info_name
-                        print "(known crash quirk for %s)" % platform.name
+                        print("not executing get_info", type(cl_obj), info_name)
+                        print("(known crash quirk for %s)" % platform.name)
                         continue
 
                     try:
@@ -82,10 +82,10 @@ class TestCL:
                         try:
                             getattr(cl_obj, info_name.lower())
                         except:
-                            print "failed attr-based get_info", type(cl_obj), info_name
+                            print("failed attr-based get_info", type(cl_obj), info_name)
 
                             if find_quirk(QUIRKS, cl_obj, info):
-                                print "(known quirk for %s)" % platform.name
+                                print("(known quirk for %s)" % platform.name)
                             else:
                                 failure_count[0] += 1
 
@@ -268,7 +268,7 @@ class TestCL:
 
         a_result = numpy.empty_like(a)
         cl.enqueue_read_buffer(queue, a_dest, a_result, is_blocking=True)
-        print a_result.dtype
+        print(a_result.dtype)
 
         assert la.norm(a_result - a) == 0
 
@@ -301,7 +301,7 @@ if __name__ == "__main__":
 
     import sys
     if len(sys.argv) > 1:
-        exec sys.argv[1]
+        exec(sys.argv[1])
     else:
         from py.test.cmdline import main
         main([__file__])