diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py
index d76c7b433a3bdde9fa69123ca0cdc1d496b7de08..204139a196b9433df7dd1a3a51fbe5141f23dd62 100644
--- a/pyopencl/algorithm.py
+++ b/pyopencl/algorithm.py
@@ -276,17 +276,6 @@ RADIX_SORT_PREAMBLE_TPL = Template(r"""//CL//
     #endif
 
     <%
-      def get_count_branch(known_bits):
-          if len(known_bits) == bits:
-              return "s.c%s" % known_bits
-
-          b = len(known_bits)
-          boundary_mnr = known_bits + "1" + (bits-b-1)*"0"
-
-          return ("((mnr < %s) ? %s : %s)" % (
-              int(boundary_mnr, 2),
-              get_count_branch(known_bits+"0"),
-              get_count_branch(known_bits+"1")))
     %>
 
     index_t get_count(scan_t s, int mnr)
@@ -375,7 +364,7 @@ class RadixSort(object):
     on the compute device.
     """
     def __init__(self, context, arguments, key_expr, sort_arg_names,
-            bits_at_a_time=4, index_dtype=np.int32, key_dtype=np.uint32,
+            bits_at_a_time=3, index_dtype=np.int32, key_dtype=np.uint32,
             options=[]):
         """
         :arg arguments: A string of comma-separated C argument declarations.
@@ -417,6 +406,17 @@ class RadixSort(object):
                     if arg.name in sort_arg_names]
                 + [ ScalarArg(np.int32, "base_bit") ])
 
+        def get_count_branch(known_bits):
+            if len(known_bits) == self.bits:
+                return "s.c%s" % known_bits
+
+            boundary_mnr = known_bits + "1" + (self.bits-len(known_bits)-1)*"0"
+
+            return ("((mnr < %s) ? %s : %s)" % (
+                int(boundary_mnr, 2),
+                get_count_branch(known_bits+"0"),
+                get_count_branch(known_bits+"1")))
+
         codegen_args = dict(
                 bits=self.bits,
                 key_ctype=dtype_to_ctype(self.key_dtype),
@@ -426,6 +426,7 @@ class RadixSort(object):
                 padded_bin=_padded_bin,
                 scan_ctype=scan_ctype,
                 sort_arg_names=sort_arg_names,
+                get_count_branch=get_count_branch,
                 )
 
         preamble = scan_t_cdecl+RADIX_SORT_PREAMBLE_TPL.render(**codegen_args)
diff --git a/pyopencl/scan.py b/pyopencl/scan.py
index 526f679c8f46fc0b7a782b33f5f32a21933a375e..03bfef17e1bc9b51f442aac9b8863f439efedb72 100644
--- a/pyopencl/scan.py
+++ b/pyopencl/scan.py
@@ -988,8 +988,10 @@ class GenericScanKernel(_GenericScanKernelBase):
             # (about the widest vector a CPU can support, also taking
             # into account that CPUs don't hide latency by large work groups
             max_scan_wg_size = 16
+            wg_size_multiples = 16
         else:
             max_scan_wg_size = min(dev.max_work_group_size for dev in self.devices)
+            wg_size_multiples = 64
 
         avail_local_mem = min(
                 dev.local_mem_size
@@ -998,14 +1000,17 @@ class GenericScanKernel(_GenericScanKernelBase):
         # k_group_size should be a power of two because of in-kernel
         # division by that number.
 
-        k_group_size = 128
+        solutions = []
+        for k_exp in range(0, 7):
+            for wg_size in range(wg_size_multiples, max_scan_wg_size+1,
+                    wg_size_multiples):
 
-        while (
-                self.get_local_mem_use(
-                    max_scan_wg_size, k_group_size) + 256  > avail_local_mem):
-            k_group_size //= 2
+                k_group_size = 2**k_exp
+                if (self.get_local_mem_use(
+                    wg_size, k_group_size) + 256  <= avail_local_mem):
+                    solutions.append((wg_size*k_group_size, k_group_size, wg_size))
 
-        assert k_group_size > 1
+        _, k_group_size, max_scan_wg_size = max(solutions)
 
         while True:
             candidate_scan_info = self.build_scan_kernel(
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 8ebf0cd007fad808398e85d0661c7a8ed2860f19..e63ca2f63a19eefb9cdf8eca436a6fcdd555111c 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -401,7 +401,7 @@ def match_dtype_to_c_struct(device, name, dtype, context=None):
 
     src = r"""
         #define pycl_offsetof(st, m) \
-                 ((size_t) ( (char *)&((st *)0)->m - (char *)0 ))
+                 ((size_t) ((__local char *) &(dummy.m) - (__local char *)&dummy ))
 
         %(pre_decls)s
 
@@ -410,6 +410,7 @@ def match_dtype_to_c_struct(device, name, dtype, context=None):
         __kernel void get_size_and_offsets(__global size_t *result)
         {
             result[0] = sizeof(%(my_type)s);
+            __local %(my_type)s dummy;
             %(offset_code)s
         }
     """ % dict(
@@ -428,9 +429,24 @@ def match_dtype_to_c_struct(device, name, dtype, context=None):
 
     import pyopencl.array
     result_buf = cl.array.empty(queue, 1+len(fields), np.uintp)
-    knl(queue, (1,), None, result_buf.data)
+    knl(queue, (1,), (1,), result_buf.data)
+    queue.finish()
     size_and_offsets = result_buf.get()
 
+    size = int(size_and_offsets[0])
+
+    from pytools import any
+    offsets = size_and_offsets[1:]
+    if any(ofs >= size for ofs in offsets):
+        # offsets not plausible
+
+        if dtype.itemsize == size:
+            # If sizes match, use numpy's idea of the offsets.
+            offsets = [offset
+                    for field_name, (field_dtype, offset) in fields]
+        else:
+            raise RuntimeError("cannot discover struct layout on '%s'" % device)
+
     result_buf.data.release()
     del knl
     del prg
@@ -440,7 +456,7 @@ def match_dtype_to_c_struct(device, name, dtype, context=None):
     dtype_arg_dict = dict(
             names=[field_name for field_name, (field_dtype, offset) in fields],
             formats=[field_dtype for field_name, (field_dtype, offset) in fields],
-            offsets=[int(x) for x in size_and_offsets[1:]],
+            offsets=[int(x) for x in offsets],
             itemsize=int(size_and_offsets[0]),
             )
     dtype = np.dtype(dtype_arg_dict)
diff --git a/test/test_array.py b/test/test_array.py
index 9e680727c5be30be7c33831ba7d3a207f58b999c..cdc1cfefd5b3ea3c6fb79fbafab4c839d9dace01 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -816,7 +816,9 @@ scan_test_counts = [
     2 ** 20 - 2 ** 18,
     2 ** 20 - 2 ** 18 + 5,
     2 ** 20 + 1,
-    2 ** 20, 2 ** 24
+    2 ** 20,
+    2 ** 23 + 3,
+    2 ** 24 + 5
     ]
 
 @pytools.test.mark_test.opencl
@@ -1070,8 +1072,8 @@ def test_sort(ctx_factory):
 
         numpy_elapsed = numpy_end-dev_end
         dev_elapsed = dev_end-dev_start
-        print  "  dev: %.2f s numpy: %.2f ratio: %.1fx" % (
-                dev_elapsed, numpy_elapsed, dev_elapsed/numpy_elapsed)
+        print  "  dev: %.2f MKeys/s numpy: %.2f MKeys/s ratio: %.2fx" % (
+                1e-6*n/dev_elapsed, 1e-6*n/numpy_elapsed, numpy_elapsed/dev_elapsed)
         assert (a_dev_sorted.get() == a_sorted).all()