diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9d960a6f4eb920f920fb7398c1816e447e97e05e..6ee07ae45d90f7fa66cf6d8ca7e219d5cde2017b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -53,6 +53,19 @@ Python 3.5 POCL:
   except:
   - tags
 
+Python 3.6 POCL:  # CI job: build and test with python3.6 on a runner tagged pocl
+  script:
+  - export PY_EXE=python3.6  # presumably read by the sourced build script; confirm
+  - export PYOPENCL_TEST=portable  # presumably selects the POCL platform; confirm
+  - export EXTRA_INSTALL="numpy mako"
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh  # NOTE(review): -k skips TLS certificate verification for a script that is sourced on the next line
+  - ". ./build-and-test-py-project.sh"  # sourced (not executed), so it runs in this shell and sees the exports above
+  tags:  # the runner must carry every tag listed here
+  - python3.6
+  - pocl
+  except:  # do not run this job for git tag refs
+  - tags
+
 Documentation:
   script:
   - EXTRA_INSTALL="numpy mako"
diff --git a/examples/sym-exp-complexity.py b/examples/sym-exp-complexity.py
new file mode 100644
index 0000000000000000000000000000000000000000..bde21c42abdd2cf006903a968945c9ffe280cd26
--- /dev/null
+++ b/examples/sym-exp-complexity.py
@@ -0,0 +1,88 @@
+import numpy as np
+import pyopencl as cl
+import loopy as lp
+from sumpy.kernel import LaplaceKernel, HelmholtzKernel
+from sumpy.expansion.local import (
+        LaplaceConformingVolumeTaylorLocalExpansion,
+        HelmholtzConformingVolumeTaylorLocalExpansion,
+        )
+from sumpy.expansion.multipole import (
+        LaplaceConformingVolumeTaylorMultipoleExpansion,
+        HelmholtzConformingVolumeTaylorMultipoleExpansion,
+        )
+from sumpy.e2e import E2EFromCSR
+
+
+def find_flops():  # Print per-order M2L operation counts for one kernel.
+    ctx = cl.create_some_context()
+
+    if 0:  # manual toggle: set to 1 to use the Laplace classes instead
+        knl = LaplaceKernel(2)
+        m_expn_cls = LaplaceConformingVolumeTaylorMultipoleExpansion
+        l_expn_cls = LaplaceConformingVolumeTaylorLocalExpansion
+        flop_type = np.float64
+    else:
+        knl = HelmholtzKernel(2)
+        m_expn_cls = HelmholtzConformingVolumeTaylorMultipoleExpansion
+        l_expn_cls = HelmholtzConformingVolumeTaylorLocalExpansion
+        flop_type = np.complex128
+
+    orders = list(range(1, 11, 1))  # expansion orders 1 through 10
+    flop_counts = []
+    for order in orders:
+        print(order)  # progress indicator
+        m_expn = m_expn_cls(knl, order)
+        l_expn = l_expn_cls(knl, order)
+        m2l = E2EFromCSR(ctx, m_expn, l_expn)
+        # NOTE(review): args are typed real even when flop_type is complex; confirm.
+        loopy_knl = m2l.get_kernel()
+        loopy_knl = lp.add_and_infer_dtypes(
+                loopy_knl,
+                {
+                    "target_boxes,src_box_lists,src_box_starts": np.int32,
+                    "centers,src_expansions": np.float64,
+                    })
+        # Keep only the op counts whose dtype is flop_type, then total them.
+        flops = lp.get_op_map(loopy_knl).filter_by(dtype=[flop_type]).sum()
+        flop_counts.append(  # evaluate the count at fixed parameter values
+                flops.eval_with_dict(
+                    dict(isrc_start=0, isrc_stop=1, ntgt_boxes=1)))
+
+    print(orders)
+    print(flop_counts)
+
+
+def plot_flops():  # Save a log-log plot of hard-coded flop counts vs. order.
+    if 0:  # manual toggles: the first branch set to 1 is the one plotted
+        case = "3D Laplace M2L"
+        orders = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+        flops = [62, 300, 914, 2221, 4567, 8405, 14172, 22538, 34113]
+        filename = "laplace-m2l-complexity-3d.pdf"
+
+    elif 0:
+        case = "2D Laplace M2L"
+        orders = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+                18, 19, 20]
+        flops = [36, 99, 193, 319, 476, 665, 889, 1143, 1429, 1747, 2097, 2479, 2893,
+                3339, 3817, 4327, 4869, 5443, 6049, 6687]
+        filename = "laplace-m2l-complexity-2d.pdf"
+    elif 1:  # currently selected data set
+        case = "2D Helmholtz M2L"
+        orders = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        flops = [45, 194, 474, 931, 1650, 2632, 3925, 5591, 7706, 10272]
+        filename = "helmholtz-m2l-complexity-2d.pdf"
+    # NOTE: if every branch above is 0, case/orders/flops/filename are unbound.
+    import matplotlib.pyplot as plt
+    plt.rc("font", size=16)
+    plt.title(case)
+    plt.ylabel("Flop count")
+    plt.xlabel("Expansion order")
+    plt.loglog(orders, flops, "o-")  # circle markers joined by a solid line
+    plt.grid()
+    plt.tight_layout()
+    plt.savefig(filename)
+
+
+if __name__ == "__main__":
+    # find_flops()  # disabled; presumably regenerates the data in plot_flops()
+    plot_flops()
diff --git a/sumpy/cse.py b/sumpy/cse.py
index 350a2c8d97d6ef4729657c85248d412e4128f6a1..ad44fc41c8516dfc90587ce61cab6a3e5070e6ea 100644
--- a/sumpy/cse.py
+++ b/sumpy/cse.py
@@ -183,11 +183,32 @@ class FuncArgTracker(object):
         from collections import defaultdict
         count_map = defaultdict(lambda: 0)
 
-        for arg in argset:
-            for func_i in self.arg_to_funcset[arg]:
+        # Sorted by size to make best use of the performance hack below.
+        funcsets = sorted((self.arg_to_funcset[arg] for arg in argset), key=len)
+
+        for funcset in funcsets[:-threshold+1]:
+            for func_i in funcset:
                 if func_i >= min_func_i:
                     count_map[func_i] += 1
 
+        for i, funcset in enumerate(funcsets[-threshold+1:]):
+            # When looking at the tail end of the funcsets list, items below
+            # this threshold in the count_map don't have to be considered
+            # because they can't possibly be in the output.
+            count_map_threshold = i + 1
+
+            # We pick the smaller of the two containers to iterate over to
+            # reduce the number of items we have to look at.
+            (smaller_funcs_container,
+             larger_funcs_container) = sorted([funcset, count_map], key=len)
+
+            for func_i in smaller_funcs_container:
+                if count_map[func_i] < count_map_threshold:
+                    continue
+
+                if func_i in larger_funcs_container:
+                    count_map[func_i] += 1
+
         return dict(
             (k, v) for k, v in count_map.items()
             if v >= threshold)
@@ -258,14 +279,14 @@ def match_common_args(func_class, funcs, opt_subs):
     from sumpy.tools import OrderedSet
 
     for i in range(len(funcs)):
-        common_arg_candidates = arg_tracker.get_common_arg_candidates(
+        common_arg_candidates_counts = arg_tracker.get_common_arg_candidates(
                 arg_tracker.func_to_argset[i], i + 1, threshold=2)
 
         # Sort the candidates in order of match size.
         # This makes us try combining smaller matches first.
         common_arg_candidates = OrderedSet(sorted(
-                common_arg_candidates.keys(),
-                key=lambda k: (common_arg_candidates[k], k)))
+                common_arg_candidates_counts.keys(),
+                key=lambda k: (common_arg_candidates_counts[k], k)))
 
         while common_arg_candidates:
             j = common_arg_candidates.pop(last=False)
diff --git a/test/test_kernels.py b/test/test_kernels.py
index c6d93225cf32fc0d9910a376c57c4796434f0413..628e2893f3c67a18296badc0fe477dca67fcd4f6 100644
--- a/test/test_kernels.py
+++ b/test/test_kernels.py
@@ -1,7 +1,4 @@
-from __future__ import division
-from __future__ import absolute_import
-from __future__ import print_function
-from six.moves import range
+from __future__ import division, absolute_import, print_function
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -25,6 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from six.moves import range
+
 import numpy as np
 import numpy.linalg as la
 import sys