diff --git a/examples/dump-performance.py b/examples/dump-performance.py
new file mode 100644
index 0000000000000000000000000000000000000000..97fa0544aaf357f0769a5ff6d9aa623c26dda4ef
--- /dev/null
+++ b/examples/dump-performance.py
@@ -0,0 +1,38 @@
+from __future__ import division
+import pyopencl as cl
+import pyopencl.characterize.performance as perf
+
+
+
+def main():
+    ctx = cl.create_some_context()
+
+    prof_overhead, latency = perf.get_profiling_overhead(ctx)
+    print "command latency: %g s" % latency
+    print "profiling overhead: %g s -> %.1f %%" % (
+            prof_overhead, 100*prof_overhead/latency)
+    queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+    print "empty kernel: %g s" % perf.get_empty_kernel_time(queue)
+    print "float32 add: %g GOps/s" % (perf.get_add_rate(queue)/1e9)
+
+    for tx_type in [
+            perf.HostToDeviceTransfer,
+            perf.DeviceToHostTransfer,
+            perf.DeviceToDeviceTransfer]:
+        print "----------------------------------------"
+        print tx_type.__name__
+        print "----------------------------------------"
+
+        print "latency: %g s" % perf.transfer_latency(queue, tx_type)
+        for i in range(6, 28, 2):
+            bs = 1<<i
+            print "bandwidth @ %d bytes: %g GB/s" % (
+                    bs, perf.transfer_bandwidth(queue, tx_type, bs)/1e9)
+
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyopencl/characterize/__init__.py b/pyopencl/characterize/__init__.py
new file mode 100644
--- /dev/null
+++ b/pyopencl/characterize/__init__.py
+from __future__ import division
+import pyopencl as cl
+
+
+
+class CLCharacterizationWarning(UserWarning):
+    pass
+
+
+
+def nv_compute_capability(dev):
+    """If *dev* is an Nvidia device, return a tuple *(major, minor)*
+    indicating its compute capability, otherwise *None*.
+    """
+    try:
+        return (dev.compute_capability_major_nv,
+                dev.compute_capability_minor_nv)
+    except:
+        return None
+
+
+
+def simultaneous_work_items_on_local_access(dev):
+    """Return the number of work items that access local memory
+    simultaneously and thereby may conflict with each other.
+    """
+    nv_compute_cap = nv_compute_capability(dev)
+
+    if nv_compute_cap is not None:
+        if nv_compute_cap < (2,0):
+            return 16
+        else:
+            if nv_compute_cap >= (3,0):
+                from warnings import warn
+                warn("wildly guessing conflicting local access size on '%s'"
+                        % dev,
+                        CLCharacterizationWarning)
+
+            return 32
+
+    if dev.type == cl.device_type.GPU:
+        from warnings import warn
+        warn("wildly guessing conflicting local access size on '%s'"
+                % dev,
+                CLCharacterizationWarning)
+        return 16
+    elif dev.type == cl.device_type.CPU:
+        return 1
+    else:
+        from warnings import warn
+        warn("wildly guessing conflicting local access size on '%s'"
+                % dev,
+                CLCharacterizationWarning)
+        return 16
+
+
+
+def local_memory_access_granularity(dev):
+    """Return the number of bytes per bank in local memory."""
+    return 4
+
+
+
+def local_memory_bank_count(dev):
+    """Return the number of banks present in local memory."""
+    nv_compute_cap = nv_compute_capability(dev)
+
+    if nv_compute_cap is not None:
+        if nv_compute_cap < (2,0):
+            return 16
+        else:
+            if nv_compute_cap >= (3,0):
+                from warnings import warn
+                warn("wildly guessing local memory bank count on '%s'"
+                        % dev,
+                        CLCharacterizationWarning)
+
+            return 32
+
+    if dev.type == cl.device_type.GPU:
+        from warnings import warn
+        warn("wildly guessing local memory bank count on '%s'"
+                % dev,
+                CLCharacterizationWarning)
+        return 16
+    elif dev.type == cl.device_type.CPU:
+        if dev.local_mem_type == cl.device_local_mem_type.GLOBAL:
+            raise RuntimeError("asking for a bank count is "
+                    "meaningless for cache-based lmem")
+
+    from warnings import warn
+    warn("wildly guessing local memory bank count on '%s'"
+            % dev,
+            CLCharacterizationWarning)
+    return 16
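+
+
+# Worked example (illustrative, not part of the API): with 4-byte banks and
+# a bank count of 32, a float32 at byte address a sits in bank (a // 4) % 32.
+# Work items reading down one column of a 32x32 float32 array in local
+# memory touch addresses 32*4 = 128 bytes apart, which is 32 banks apart,
+# i.e. always the same bank: a 32-way conflict.  The function below reports
+# exactly this kind of situation.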
+ """ + # FIXME: Treat 64-bit access on NV CC 2.x + correctly + + if array_stored_shape is None: + array_stored_shape = array_shape + + rank = len(array_shape) + + array_shape = array_shape[::-1] + array_stored_shape = array_stored_shape[::-1] + + gran = local_memory_access_granularity(dev) + if itemsize != gran: + from warnings import warn + warn("local conflict info might be inaccurate " + "for itemsize != %d" % gran, + CLCharacterizationWarning) + + sim_wi = simultaneous_work_items_on_local_access(dev) + bank_count = local_memory_bank_count(dev) + + conflicts = [] + + for work_item_axis in range(rank): + + bank_accesses = {} + for work_item_id in xrange(sim_wi): + addr = 0 + addr_mult = itemsize + + idx = [] + left_over_idx = work_item_id + for axis, (ax_size, ax_stor_size) in enumerate( + zip(array_shape, array_stored_shape)): + + if axis >= work_item_axis: + left_over_idx, ax_idx = divmod(left_over_idx, ax_size) + addr += addr_mult*ax_idx + idx.append(ax_idx) + else: + idx.append(0) + + addr_mult *= ax_stor_size + + if left_over_idx: + # out-of-bounds, assume not taking place + continue + + bank = (addr // gran) % bank_count + bank_accesses.setdefault(bank, []).append( + "w.item %s -> %s" % (work_item_id, idx[::-1])) + + conflict_multiplicity = max( + len(acc) for acc in bank_accesses.itervalues()) + + if conflict_multiplicity > 1: + for bank, acc in bank_accesses.iteritems(): + if len(acc) == conflict_multiplicity: + conflicts.append( + (conflict_multiplicity, + "%dx conflict on axis %d (from right, 0-based): " + "%s access bank %d" % ( + conflict_multiplicity, + work_item_axis, + ", ".join(acc), bank))) + + if conflicts: + return max(conflicts) + else: + return 1, None + + + + +def get_fast_inaccurate_build_options(dev): + """Return a list of flags valid on device *dev* that enable fast, but + potentially inaccurate floating point math. + """ + return ["-cl-mad-enable", "-cl-fast-relaxed-math", + "-cl-no-signed-zeros", "-cl-strict-aliasing"] + + + + +def get_simd_group_size(dev, type_size): + """Return an estimate of how many work items will be executed across SIMD + lanes. This returns the size of what Nvidia calls a warp and what AMD calls + a wavefront. + + Only refers to implicit SIMD. + + :arg type_size: number of bytes in vector entry type. + """ + try: + return dev.warp_size_nv + except: + pass + + lc_vendor = dev.platform.vendor.lower() + if "nvidia" in lc_vendor: + return 32 + + if ("advanced micro" in lc_vendor or "ati" in lc_vendor): + if dev.type == cl.device_type.GPU: + return 64 + elif dev.type == cl.device_type.CPU: + return 1 + else: + raise RuntimeError("unexpected AMD device type") + + if dev.type == cl.device_type.CPU: + # implicit assumption: Impl. 
+
+
+def get_simd_group_size(dev, type_size):
+    """Return an estimate of how many work items will be executed across SIMD
+    lanes. This returns the size of what Nvidia calls a warp and what AMD calls
+    a wavefront.
+
+    Only refers to implicit SIMD.
+
+    :arg type_size: number of bytes in vector entry type.
+    """
+    try:
+        return dev.warp_size_nv
+    except:
+        pass
+
+    lc_vendor = dev.platform.vendor.lower()
+    if "nvidia" in lc_vendor:
+        return 32
+
+    if ("advanced micro" in lc_vendor or "ati" in lc_vendor):
+        if dev.type == cl.device_type.GPU:
+            return 64
+        elif dev.type == cl.device_type.CPU:
+            return 1
+        else:
+            raise RuntimeError("unexpected AMD device type")
+
+    if dev.type == cl.device_type.CPU:
+        # implicit assumption: the implementation will vectorize
+        if type_size == 1:
+            return dev.preferred_vector_width_char
+        elif type_size == 2:
+            return dev.preferred_vector_width_short
+        elif type_size == 4:
+            return dev.preferred_vector_width_float
+        elif type_size == 8:
+            return dev.preferred_vector_width_double
+        else:
+            from warnings import warn
+            warn("unexpected dtype size in get_simd_group_size on CPU device, "
+                    "guessing group width 1")
+            return 1
+
+    return None
diff --git a/pyopencl/characterize/performance.py b/pyopencl/characterize/performance.py
new file mode 100644
index 0000000000000000000000000000000000000000..a64570ce1347247ce0ec00e210286a8681fc48d0
--- /dev/null
+++ b/pyopencl/characterize/performance.py
@@ -0,0 +1,223 @@
+from __future__ import division
+import pyopencl as cl
+import numpy as np
+
+
+
+# {{{ timing helpers
+
+class Timer(object):
+    def __init__(self, queue):
+        self.queue = queue
+
+    def start(self):
+        pass
+
+    def stop(self):
+        pass
+
+    def add_event(self, evt):
+        pass
+
+    def get_elapsed(self):
+        pass
+
+
+
+class WallTimer(Timer):
+    def start(self):
+        from time import time
+        self.queue.finish()
+        # store under a name that does not shadow the start() method
+        self.start_time = time()
+
+    def stop(self):
+        from time import time
+        self.queue.finish()
+        self.end_time = time()
+
+    def get_elapsed(self):
+        return self.end_time - self.start_time
+
+
+
+def _get_time(queue, f, timer_factory=None, desired_duration=0.1,
+        warmup_rounds=3):
+
+    if timer_factory is None:
+        timer_factory = WallTimer
+
+    count = 1
+
+    while True:
+        timer = timer_factory(queue)
+
+        for i in xrange(warmup_rounds):
+            f()
+        warmup_rounds = 0
+
+        timer.start()
+        for i in xrange(count):
+            timer.add_event(f())
+        timer.stop()
+
+        elapsed = timer.get_elapsed()
+        if elapsed < desired_duration:
+            if elapsed == 0:
+                count *= 5
+            else:
+                # estimate how many repetitions reach desired_duration,
+                # growing by no less than 2x and no more than 10x
+                new_count = int(count*desired_duration/elapsed)
+
+                new_count = max(2*count, new_count)
+                new_count = min(10*count, new_count)
+                count = new_count
+        else:
+            return elapsed/count
+
+# }}}
+
+
+
+# {{{ transfer measurements
+
+class HostDeviceTransferBase(object):
+    def __init__(self, queue, block_size):
+        self.queue = queue
+        self.host_buf = np.empty(block_size, dtype=np.uint8)
+        self.dev_buf = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size)
+
+class HostToDeviceTransfer(HostDeviceTransferBase):
+    def do(self):
+        return cl.enqueue_copy(self.queue, self.dev_buf, self.host_buf)
+
+class DeviceToHostTransfer(HostDeviceTransferBase):
+    def do(self):
+        return cl.enqueue_copy(self.queue, self.host_buf, self.dev_buf)
+
+class DeviceToDeviceTransfer(object):
+    def __init__(self, queue, block_size):
+        self.queue = queue
+        self.dev_buf_1 = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size)
+        self.dev_buf_2 = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size)
+
+    def do(self):
+        return cl.enqueue_copy(self.queue, self.dev_buf_2, self.dev_buf_1)
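+
+# Note that do() returns the pyopencl.Event of the enqueued copy, so a
+# profiling-based timer could be dropped in for WallTimer.  A sketch
+# (illustrative; requires a queue created with PROFILING_ENABLE):
+#
+#     class ProfilingTimer(Timer):
+#         def __init__(self, queue):
+#             Timer.__init__(self, queue)
+#             self.events = []
+#
+#         def add_event(self, evt):
+#             if evt is not None:
+#                 self.events.append(evt)
+#
+#         def get_elapsed(self):
+#             self.queue.finish()
+#             # profile.start/.end are given in nanoseconds
+#             return sum(1e-9*(evt.profile.end - evt.profile.start)
+#                     for evt in self.events)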
+
+
+def transfer_latency(queue, transfer_type, timer_factory=None):
+    transfer = transfer_type(queue, 1)
+    return _get_time(queue, transfer.do, timer_factory=timer_factory)
+
+def transfer_bandwidth(queue, transfer_type, block_size, timer_factory=None):
+    """Measures one-sided bandwidth."""
+
+    transfer = transfer_type(queue, block_size)
+    return block_size/_get_time(queue, transfer.do, timer_factory=timer_factory)
+
+# }}}
+
+
+
+def get_profiling_overhead(ctx, timer_factory=None):
+    no_prof_queue = cl.CommandQueue(ctx)
+    transfer = DeviceToDeviceTransfer(no_prof_queue, 1)
+    no_prof_time = _get_time(no_prof_queue, transfer.do, timer_factory=timer_factory)
+
+    prof_queue = cl.CommandQueue(ctx,
+            properties=cl.command_queue_properties.PROFILING_ENABLE)
+    transfer = DeviceToDeviceTransfer(prof_queue, 1)
+    prof_time = _get_time(prof_queue, transfer.do, timer_factory=timer_factory)
+
+    return prof_time - no_prof_time, prof_time
+
+def get_empty_kernel_time(queue, timer_factory=None):
+    prg = cl.Program(queue.context, """
+        __kernel void empty()
+        { }
+        """).build()
+
+    knl = prg.empty
+
+    def f():
+        return knl(queue, (1,), None)
+
+    return _get_time(queue, f, timer_factory=timer_factory)
+
+def _get_full_machine_kernel_rate(queue, src, args, name="benchmark", timer_factory=None):
+    prg = cl.Program(queue.context, src).build()
+
+    knl = getattr(prg, name)
+
+    dev = queue.device
+    global_size = 4 * dev.max_compute_units
+
+    def f():
+        return knl(queue, (global_size,), None, *args)
+
+    rates = []
+    num_dips = 0
+
+    # keep doubling the global size until the rate no longer improves,
+    # tolerating up to three transient dips along the way
+    while True:
+        elapsed = _get_time(queue, f, timer_factory=timer_factory)
+        rate = global_size/elapsed
+
+        keep_trying = not rates
+
+        if rates and rate > 1.05*max(rates): # big improvement
+            keep_trying = True
+            num_dips = 0
+
+        if rates and rate < 0.9*max(rates) and num_dips < 3: # big dip
+            keep_trying = True
+            num_dips += 1
+
+        rates.append(rate)
+
+        if keep_trying:
+            global_size *= 2
+        else:
+            return max(rates)
+
+def get_add_rate(queue, type="float", timer_factory=None):
+    # the loop body below performs 50 adds and runs 10 times, i.e. 500 adds
+    # per work item, hence the factor of 50*10
+    return 50*10*_get_full_machine_kernel_rate(queue, """
+        typedef %(op_t)s op_t;
+        __kernel void benchmark()
+        {
+            local op_t tgt[1024];
+            op_t val = get_global_id(0);
+
+            for (int i = 0; i < 10; ++i)
+            {
+                val += val; val += val; val += val; val += val; val += val;
+                val += val; val += val; val += val; val += val; val += val;
+
+                val += val; val += val; val += val; val += val; val += val;
+                val += val; val += val; val += val; val += val; val += val;
+
+                val += val; val += val; val += val; val += val; val += val;
+                val += val; val += val; val += val; val += val; val += val;
+
+                val += val; val += val; val += val; val += val; val += val;
+                val += val; val += val; val += val; val += val; val += val;
+
+                val += val; val += val; val += val; val += val; val += val;
+                val += val; val += val; val += val; val += val; val += val;
+            }
+            tgt[get_local_id(0)] = val;
+        }
+        """ % dict(op_t=type), ())
+
+
+
+# vim: foldmethod=marker:filetype=pyopencl
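For a quick smoke test of the new module without running the full example
script, an interactive session along these lines should work (a sketch;
assumes an OpenCL implementation is installed and create_some_context can
pick a device):

    import pyopencl as cl
    import pyopencl.characterize.performance as perf

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # one-sided bandwidth of a 1 MiB host-to-device copy, in bytes/s
    print perf.transfer_bandwidth(queue, perf.HostToDeviceTransfer, 1 << 20)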