diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index fc5a1ca68681835a6786980a175fbb8d57c4d453..e71ea2c6d053188c0e2211fdf7868c6a75cc9af0 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -23,6 +23,18 @@ Python 3.4 AMD CPU:
   - amd-cl-cpu
   except:
   - tags
+Python 2.7 POCL:
+  script:
+  - export PY_EXE=python2.7
+  - export PYOPENCL_TEST=portable
+  - export EXTRA_INSTALL="numpy mako"
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
+  - ". ./build-and-test-py-project.sh"
+  tags:
+  - python2.7
+  - pocl
+  except:
+  - tags
 # PyPy AMD CPU:
 #   script:
 #   - export PY_EXE=pypy
diff --git a/build-helpers/loopy.spec b/build-helpers/loopy.spec
index e24a24db9bda96f149e4279a6c37c1603476c3a4..7650624932eb4b6aa95e888410e8a0aa1a6d518a 100644
--- a/build-helpers/loopy.spec
+++ b/build-helpers/loopy.spec
@@ -9,7 +9,7 @@ a = Analysis(['bin/loopy'],
              hiddenimports=[],
              hookspath=None,
              runtime_hooks=None,
-             excludes=["hedge", "meshpy", "pyopencl", "pycparser", "PIL"]
+             excludes=["hedge", "meshpy", "pyopencl", "PIL"]
              )
 pyz = PYZ(a.pure)
 
diff --git a/build-helpers/make-linux-build-docker-inner.sh b/build-helpers/make-linux-build-docker-inner.sh
index 43ce1020196a4872e5031ef54af876b71ef33d32..6b72639e0de24ee7e424a6157de2064e5beee978 100755
--- a/build-helpers/make-linux-build-docker-inner.sh
+++ b/build-helpers/make-linux-build-docker-inner.sh
@@ -8,7 +8,7 @@ cd /tmp/build
 
 useradd -d /home/user -m -s /bin/bash user
 
-yum install -y git python-devel tar gcc gcc-c++ mercurial numpy
+yum install -y git python-devel tar gcc gcc-c++ mercurial numpy libffi-devel
 
 VENV_VERSION="virtualenv-1.9.1"
 rm -Rf "$VENV_VERSION"
@@ -20,7 +20,7 @@ $VENV_VERSION/virtualenv.py --system-site-packages --no-setuptools .env
 source .env/bin/activate
 
 curl -k https://ssl.tiker.net/software/ez_setup.py | python -
-curl http://git.tiker.net/pip/blob_plain/77f959a3ce9cc506efbf3a17290d387d0a6624f5:/contrib/get-pip.py | python -
+curl -k https://gitlab.tiker.net/inducer/pip/raw/7.0.3/contrib/get-pip.py | python -
 
 pip install pyinstaller
 git clone --recursive git://github.com/inducer/loopy
diff --git a/loopy/__init__.py b/loopy/__init__.py
index ecadbf4e95b4f7e12d5093c6a21929599fcdcc62..c63aa5f90d6537496ea9fe4ecb11c441070c91b4 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -1495,7 +1495,10 @@ def register_function_manglers(kernel, manglers):
 # {{{ cache control
 
 import os
-CACHING_ENABLED = "LOOPY_NO_CACHE" not in os.environ
+# Initial setting: caching is on unless LOOPY_NO_CACHE or CG_NO_CACHE is
+# present in the environment (with any value, even an empty one).
+CACHING_ENABLED = not any(
+    env_var in os.environ for env_var in ("LOOPY_NO_CACHE", "CG_NO_CACHE"))
 
 
 def set_caching_enabled(flag):
diff --git a/loopy/check.py b/loopy/check.py
index 477a6336f995c977cfcf5fb52e6d972d95a74468..3401f7b8d86eb8e89d0bdb0629b5e85d57fff771 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -450,11 +450,27 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
 
     from islpy import align_two
 
+    last_idomains = None
+    last_insn_inames = None
+
     for insn_id, idomains in six.iteritems(implemented_domains):
         insn = kernel.id_to_insn[insn_id]
 
         assert idomains
 
+        insn_inames = kernel.insn_inames(insn)
+
+        # {{{ if we've checked the same thing before, no need to check it again
+
+        if last_idomains is not None and last_insn_inames is not None:
+            if idomains == last_idomains and insn_inames == last_insn_inames:
+                continue
+
+        last_idomains = idomains
+        last_insn_inames = insn_inames
+
+        # }}}
+
         insn_impl_domain = idomains[0]
         for idomain in idomains[1:]:
             insn_impl_domain = insn_impl_domain | idomain
@@ -463,13 +479,12 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
                 assumption_non_param, insn_impl_domain)
         insn_impl_domain = (
                 (insn_impl_domain & assumptions)
-                .project_out_except(kernel.insn_inames(insn), [dim_type.set]))
+                .project_out_except(insn_inames, [dim_type.set]))
 
-        insn_inames = kernel.insn_inames(insn)
         insn_domain = kernel.get_inames_domain(insn_inames)
         assumptions, insn_domain = align_two(assumption_non_param, insn_domain)
         desired_domain = ((insn_domain & assumptions)
-            .project_out_except(kernel.insn_inames(insn), [dim_type.set]))
+            .project_out_except(insn_inames, [dim_type.set]))
 
         insn_impl_domain, desired_domain = align_two(
                 insn_impl_domain, desired_domain)
@@ -483,13 +498,18 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
                     for i in range(insn_domain.dim(dim_type.param)))
 
             lines = []
-            for kind, diff_set in [
-                    ("implemented, but not desired", i_minus_d),
-                    ("desired, but not implemented", d_minus_i)]:
+            for kind, diff_set, gist_domain in [
+                    ("implemented, but not desired", i_minus_d,
+                        desired_domain.gist(insn_impl_domain)),
+                    ("desired, but not implemented", d_minus_i,
+                        insn_impl_domain.gist(desired_domain))]:
+
+                if diff_set.is_empty():
+                    continue
+
                 diff_set = diff_set.coalesce()
                 pt = diff_set.sample_point()
-                if pt.is_void():
-                    continue
+                assert not pt.is_void()
 
                 #pt_set = isl.Set.from_point(pt)
                 #lines.append("point implemented: %s" % (pt_set <= insn_impl_domain))
@@ -503,7 +523,9 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
                         iname, pt.get_coordinate_val(tp, dim).to_python()))
 
                 lines.append(
-                        "sample point %s: %s" % (kind, ", ".join(point_axes)))
+                        "sample point in %s: %s" % (kind, ", ".join(point_axes)))
+                lines.append(
+                        "gist of %s: %s" % (kind, gist_domain))
 
             if code is not None:
                 print(79*"-")
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index a613e4882ecb916d9088173851b86e5461325c57..948c419c7989174d0824c342981f88b6fa4b8e6b 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -50,7 +50,7 @@ def get_admissible_conditional_inames_for(kernel, sched_index):
             if not has_barrier or not isinstance(tag, LocalIndexTag):
                 result.add(iname)
 
-    return result
+    return frozenset(result)
 
 
 def generate_code_for_sched_index(kernel, sched_index, codegen_state):
@@ -104,37 +104,6 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state):
                 % type(sched_item))
 
 
-def remove_inames_for_shared_hw_axes(kernel, cond_inames):
-    """
-    See if cond_inames contains references to two (or more) inames that
-    boil down to the same tag. If so, exclude them. (We shouldn't be writing
-    conditionals for such inames because we would be implicitly restricting
-    the other inames as well.)
-    """
-
-    tag_key_uses = {}
-
-    from loopy.kernel.data import HardwareParallelTag
-
-    for iname in cond_inames:
-        tag = kernel.iname_to_tag.get(iname)
-
-        if isinstance(tag, HardwareParallelTag):
-            tag_key_uses.setdefault(tag.key, []).append(iname)
-
-    multi_use_keys = set(
-            key for key, user_inames in six.iteritems(tag_key_uses)
-            if len(user_inames) > 1)
-
-    multi_use_inames = set()
-    for iname in cond_inames:
-        tag = kernel.iname_to_tag.get(iname)
-        if isinstance(tag, HardwareParallelTag) and tag.key in multi_use_keys:
-            multi_use_inames.add(iname)
-
-    return frozenset(cond_inames - multi_use_inames)
-
-
 def get_required_predicates(kernel, sched_index):
     result = None
     for _, sched_item in generate_sub_sched_items(kernel.schedule, sched_index):
@@ -154,6 +123,25 @@ def get_required_predicates(kernel, sched_index):
     return result
 
 
+def group_by(l, key, merge):
+    """Merge each run of adjacent items of *l* that share the same *key*.
+
+    An item joins the current run if its key equals the key of the run's
+    merged value; *merge(run, item)* folds it in. Empty *l* is returned as is.
+    """
+    if not l:
+        return l
+
+    groups = [l[0]]
+    for item in l[1:]:
+        if key(groups[-1]) == key(item):
+            groups[-1] = merge(groups[-1], item)
+        else:
+            groups.append(item)
+
+    return groups
+
+
 def build_loop_nest(kernel, sched_index, codegen_state):
     # Most of the complexity of this function goes towards finding groups of
     # instructions that can be nested inside a shared conditional.
@@ -164,26 +152,29 @@ def build_loop_nest(kernel, sched_index, codegen_state):
 
     my_sched_indices = []
 
-    while sched_index < len(kernel.schedule):
-        sched_item = kernel.schedule[sched_index]
+    i = sched_index
+    while i < len(kernel.schedule):
+        sched_item = kernel.schedule[i]
 
         if isinstance(sched_item, LeaveLoop):
             break
 
-        my_sched_indices.append(sched_index)
+        my_sched_indices.append(i)
 
         if isinstance(sched_item, EnterLoop):
-            _, sched_index = gather_schedule_subloop(
-                    kernel.schedule, sched_index)
+            _, i = gather_schedule_subloop(
+                    kernel.schedule, i)
         elif isinstance(sched_item, Barrier):
-            sched_index += 1
+            i += 1
 
         elif isinstance(sched_item, RunInstruction):
-            sched_index += 1
+            i += 1
         else:
             raise RuntimeError("unexpected schedule item type: %s"
                     % type(sched_item))
 
+    del i
+
     # }}}
 
     # {{{ pass 2: find admissible conditional inames for each sibling schedule item
@@ -195,16 +186,32 @@ def build_loop_nest(kernel, sched_index, codegen_state):
         .. attribute:: schedule_index
         .. attribute:: admissible_cond_inames
         .. attribute:: required_predicates
+        .. attribute:: used_inames_within
         """
 
+    from loopy.schedule import find_used_inames_within
     sched_index_info_entries = [
             ScheduleIndexInfo(
-                schedule_index=i,
+                schedule_indices=[i],
                 admissible_cond_inames=(
                     get_admissible_conditional_inames_for(kernel, i)),
-                required_predicates=get_required_predicates(kernel, i)
+                required_predicates=get_required_predicates(kernel, i),
+                used_inames_within=find_used_inames_within(kernel, i)
                 )
-            for i in my_sched_indices]
+            for i in my_sched_indices
+            ]
+
+    sched_index_info_entries = group_by(
+            sched_index_info_entries,
+            key=lambda sii: (
+                sii.admissible_cond_inames,
+                sii.required_predicates,
+                sii.used_inames_within),
+            merge=lambda sii1, sii2: sii1.copy(
+                schedule_indices=(
+                    sii1.schedule_indices
+                    +
+                    sii2.schedule_indices)))
 
     # }}}
 
@@ -236,10 +243,10 @@ def build_loop_nest(kernel, sched_index, codegen_state):
     def build_insn_group(sched_index_info_entries, codegen_state,
             done_group_lengths=set()):
         """
-        :arg done_group_lengths: A set of group lengths (integers) that grows from
-            empty to include 1 and upwards with every recursive call.
-            It serves to prevent infinite recursion by preventing recursive
-            calls from doing anything about groups that are too small.
+        :arg done_group_lengths: A set of group lengths (integers) that grows
+            from empty to include the longest found group and downwards with every
+            recursive call.  It serves to prevent infinite recursion by preventing
+            recursive calls from doing anything about groups that are too small.
         """
 
         # The rough plan here is that build_insn_group starts out with the
@@ -259,10 +266,9 @@ def build_loop_nest(kernel, sched_index, codegen_state):
         if not sched_index_info_entries:
             return []
 
-        si_entry = sched_index_info_entries[0]
-        sched_index = si_entry.schedule_index
-        current_iname_set = si_entry.admissible_cond_inames
-        current_pred_set = (si_entry.required_predicates
+        origin_si_entry = sched_index_info_entries[0]
+        current_iname_set = origin_si_entry.admissible_cond_inames
+        current_pred_set = (origin_si_entry.required_predicates
                 - codegen_state.implemented_predicates)
 
         # {{{ grow schedule item group
@@ -293,22 +299,19 @@ def build_loop_nest(kernel, sched_index, codegen_state):
             # {{{ see which inames are actually used in group
 
             # And only generate conditionals for those.
-            from loopy.schedule import find_used_inames_within
             used_inames = set()
             for sched_index_info_entry in \
                     sched_index_info_entries[0:candidate_group_length]:
-                used_inames |= find_used_inames_within(kernel,
-                        sched_index_info_entry.schedule_index)
+                used_inames |= sched_index_info_entry.used_inames_within
 
             # }}}
 
-            only_unshared_inames = remove_inames_for_shared_hw_axes(kernel,
+            only_unshared_inames = kernel.remove_inames_for_shared_hw_axes(
                     current_iname_set & used_inames)
 
             bounds_checks = bounds_check_cache(only_unshared_inames)
 
             if (bounds_checks  # found a bounds check
-                    or bounds_checks is None  # found impossible bounds check
                     or current_pred_set
                     or candidate_group_length == 1):
                 # length-1 must always be an option to reach the recursion base
@@ -316,6 +319,11 @@ def build_loop_nest(kernel, sched_index, codegen_state):
                 found_hoists.append((candidate_group_length,
                     bounds_checks, current_pred_set))
 
+            if not bounds_checks and not current_pred_set:
+                # already no more checks possible, let's not waste time
+                # checking longer groups.
+                break
+
             candidate_group_length += 1
 
         # }}}
@@ -352,13 +360,15 @@ def build_loop_nest(kernel, sched_index, codegen_state):
             if group_length == 1:
                 # group only contains starting schedule item
                 def gen_code(inner_codegen_state):
-                    inner = generate_code_for_sched_index(
-                        kernel, sched_index, inner_codegen_state)
+                    result = []
+                    for i in origin_si_entry.schedule_indices:
+                        inner = generate_code_for_sched_index(
+                            kernel, i, inner_codegen_state)
+
+                        if inner is not None:
+                            result.append(inner)
 
-                    if inner is None:
-                        return []
-                    else:
-                        return [inner]
+                    return result
 
             else:
                 # recurse with a bigger done_group_lengths
diff --git a/loopy/compiled.py b/loopy/compiled.py
index c5928d6e7cd96b5e60d60b0c5dd13e21e33fb67f..da659eaba5e3c7c8c99993946da1e7af5bb399bc 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -27,6 +27,7 @@ THE SOFTWARE.
 """
 
 
+import sys
 import numpy as np
 from pytools import Record, memoize_method
 from loopy.diagnostic import ParameterFinderWarning
@@ -296,46 +297,141 @@ def generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, option
 
 # {{{ value arg setup
 
-def generate_value_arg_setup(gen, kernel, impl_arg_info, options):
+def generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options):
+    """Emit invoker code (via *gen*) that passes every value argument in
+    *impl_arg_info* to ``cl_kernel.set_arg``.
+
+    Complex values are packed as (real, imag) pairs, or split into two
+    arguments when working around the struct argument counting bug.
+
+    :returns: a dict mapping *impl_arg_info* index to CL argument index.
+    """
     import loopy as lp
     from loopy.kernel.array import ArrayBase
 
+    # {{{ arg counting bug handling
+
+    # For example:
+    # https://github.com/pocl/pocl/issues/197
+    # (but Apple CPU has a similar bug)
+
+    work_around_arg_count_bug = False
+    warn_about_arg_count_bug = False
+
+    from pyopencl.characterize import has_struct_arg_count_bug
+
+    devices = cl_kernel.context.devices
+
+    count_bug_per_dev = [
+            has_struct_arg_count_bug(dev)
+            for dev in devices]
+
+    if any(count_bug_per_dev):
+        if all(count_bug_per_dev):
+            work_around_arg_count_bug = True
+        else:
+            warn_about_arg_count_bug = True
+
+    # }}}
+
+    cl_arg_idx = 0
+    arg_idx_to_cl_arg_idx = {}
+
+    fp_arg_count = 0
+
     for arg_idx, arg in enumerate(impl_arg_info):
+        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx
+
         if arg.arg_class is not lp.ValueArg:
             assert issubclass(arg.arg_class, ArrayBase)
+
+            # assume each of those generates exactly one...
+            cl_arg_idx += 1
+
             continue
 
         gen("# {{{ process %s" % arg.name)
         gen("")
 
         if not options.skip_arg_checks:
-            gen("if %s is None:" % arg.name)
-            with Indentation(gen):
-                gen("raise RuntimeError(\"input argument '%s' must "
-                        "be supplied\")" % arg.name)
-                gen("")
-
-        if arg.dtype.kind == "i":
-            gen("# cast to int to avoid numpy scalar trouble with Boost.Python")
-            gen("%s = int(%s)" % (arg.name, arg.name))
+            gen("""
+                if {name} is None:
+                    raise RuntimeError("input argument '{name}' must "
+                        "be supplied")
+                """.format(name=arg.name))
+
+        if sys.version_info < (2, 7) and arg.dtype.kind == "i":
+            gen("# cast to long to avoid trouble with struct packing")
+            gen("%s = long(%s)" % (arg.name, arg.name))
             gen("")
 
         if arg.dtype.char == "V":
-            gen("cl_kernel.set_arg(%d, %s)" % (arg_idx, arg.name))
+            gen("cl_kernel.set_arg(%d, %s)" % (cl_arg_idx, arg.name))
+            cl_arg_idx += 1
+
+        elif arg.dtype.kind == "c":
+            if warn_about_arg_count_bug:
+                from warnings import warn
+                warn("{knl_name}: arguments include complex numbers, and "
+                        "some (but not all) of the target devices mishandle "
+                        "struct kernel arguments (hence the workaround is "
+                        "disabled)".format(
+                            knl_name=kernel.name))
+
+            if arg.dtype == np.complex64:
+                arg_char = "f"
+            elif arg.dtype == np.complex128:
+                arg_char = "d"
+            else:
+                raise TypeError("unexpected complex type: %s" % arg.dtype)
+
+            if (work_around_arg_count_bug
+                    and arg.dtype == np.complex128
+                    and fp_arg_count + 2 <= 8):
+                # pass real and imaginary part as two separate CL arguments
+                pack_calls = [
+                        "_lpy_pack('{arg_char}', {arg_var}.real)",
+                        "_lpy_pack('{arg_char}', {arg_var}.imag)"]
+            else:
+                # pass the complex number as a single packed (real, imag) pair
+                pack_calls = [
+                        "_lpy_pack('{arg_char}{arg_char}', "
+                        "{arg_var}.real, {arg_var}.imag)"]
+
+            for pack_call in pack_calls:
+                gen("buf = " + pack_call.format(
+                    arg_char=arg_char, arg_var=arg.name))
+                gen("cl_kernel.set_arg({cl_arg_idx}, buf)"
+                        .format(cl_arg_idx=cl_arg_idx))
+                cl_arg_idx += 1
+
+            fp_arg_count += 2
+
         else:
-            gen("cl_kernel.set_arg(%d, _lpy_pack(\"%s\", %s))"
-                    % (arg_idx, arg.dtype.char, arg.name))
+            if arg.dtype.kind == "f":
+                fp_arg_count += 1
+
+            gen("cl_kernel.set_arg(%d, _lpy_pack('%s', %s))"
+                    % (cl_arg_idx, arg.dtype.char, arg.name))
+
+            cl_arg_idx += 1
+
         gen("")
 
         gen("# }}}")
         gen("")
 
+    assert cl_arg_idx == cl_kernel.num_args
+
+    return arg_idx_to_cl_arg_idx
+
 # }}}
 
 
 # {{{ array arg setup
 
-def generate_array_arg_setup(gen, kernel, impl_arg_info, options):
+def generate_array_arg_setup(gen, kernel, impl_arg_info, options,
+        arg_idx_to_cl_arg_idx):
     import loopy as lp
 
     from loopy.kernel.array import ArrayBase
@@ -356,12 +452,12 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options):
         is_written = arg.base_name in kernel.get_written_variables()
         kernel_arg = kernel.impl_arg_to_arg.get(arg.name)
 
-        gen("# {{{ process %s" % arg.name)
-        gen("")
-
         if not issubclass(arg.arg_class, ArrayBase):
             continue
 
+        gen("# {{{ process %s" % arg.name)
+        gen("")
+
         if not options.no_numpy:
             gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name)
             with Indentation(gen):
@@ -552,10 +648,12 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options):
             gen("del _lpy_made_by_loopy")
             gen("")
 
+        cl_arg_idx = arg_idx_to_cl_arg_idx[arg_idx]
+
         if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]:
-            gen("cl_kernel.set_arg(%d, %s.base_data)" % (arg_idx, arg.name))
+            gen("cl_kernel.set_arg(%d, %s.base_data)" % (cl_arg_idx, arg.name))
         else:
-            gen("cl_kernel.set_arg(%d, %s)" % (arg_idx, arg.name))
+            gen("cl_kernel.set_arg(%d, %s)" % (cl_arg_idx, arg.name))
         gen("")
 
         gen("# }}}")
@@ -567,7 +665,7 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options):
 # }}}
 
 
-def generate_invoker(kernel, impl_arg_info, options):
+def generate_invoker(kernel, cl_kernel, impl_arg_info, options):
     system_args = [
             "cl_kernel", "queue", "allocator=None", "wait_for=None",
             # ignored if options.no_numpy
@@ -584,7 +682,7 @@ def generate_invoker(kernel, impl_arg_info, options):
     gen.add_to_preamble("import pyopencl.array as _lpy_cl_array")
     gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools")
     gen.add_to_preamble("import numpy as _lpy_np")
-    gen.add_to_preamble("from pyopencl._pvt_struct import pack as _lpy_pack")
+    gen.add_to_preamble("from struct import pack as _lpy_pack")
     gen.add_to_preamble("")
 
     gen("if allocator is None:")
@@ -596,8 +694,10 @@ def generate_invoker(kernel, impl_arg_info, options):
     generate_integer_arg_finding_from_offsets(gen, kernel, impl_arg_info, options)
     generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, options)
 
-    generate_value_arg_setup(gen, kernel, impl_arg_info, options)
-    generate_array_arg_setup(gen, kernel, impl_arg_info, options)
+    arg_idx_to_cl_arg_idx = \
+            generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options)
+    generate_array_arg_setup(gen, kernel, impl_arg_info, options,
+            arg_idx_to_cl_arg_idx)
 
     # {{{ generate invocation
 
@@ -763,7 +863,7 @@ class CompiledKernel:
                 cl_kernel=cl_kernel,
                 impl_arg_info=impl_arg_info,
                 invoker=generate_invoker(
-                    kernel, impl_arg_info, self.kernel.options))
+                    kernel, cl_kernel, impl_arg_info, self.kernel.options))
 
     # {{{ debugging aids
 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 24588976a2971d16d58dba03a44035cbc494397a..485de9ac2716e0c5e51ba02830d9197acbfc991d 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -635,6 +635,37 @@ class LoopKernel(RecordWithoutPickling):
 
         return result
 
+    @memoize_method
+    def remove_inames_for_shared_hw_axes(self, cond_inames):
+        """
+        Return *cond_inames* (as a :class:`frozenset`) minus every iname that
+        shares a hardware-parallel tag key with another iname in the set.
+
+        (We shouldn't be writing conditionals for such inames because we would
+        be implicitly restricting the other inames as well.)
+        """
+
+        from loopy.kernel.data import HardwareParallelTag
+
+        # hardware-parallel members of cond_inames, mapped to their tag key
+        iname_to_hw_key = {}
+        for iname in cond_inames:
+            iname_tag = self.iname_to_tag.get(iname)
+            if isinstance(iname_tag, HardwareParallelTag):
+                iname_to_hw_key[iname] = iname_tag.key
+
+        # how many members of cond_inames use each tag key
+        key_use_counts = {}
+        for hw_key in iname_to_hw_key.values():
+            key_use_counts[hw_key] = key_use_counts.get(hw_key, 0) + 1
+
+        # members whose tag key is used more than once get no conditional
+        shared_inames = set(
+                iname for iname, hw_key in iname_to_hw_key.items()
+                if key_use_counts[hw_key] > 1)
+
+        return frozenset(cond_inames - shared_inames)
+
     # }}}
 
     # {{{ dependency wrangling
diff --git a/loopy/target/pyopencl/__init__.py b/loopy/target/pyopencl/__init__.py
index 174506cd65a81b405053c212c9683f9dd2df2cc1..ee936680016b6808723076034c8486a49544e2bc 100644
--- a/loopy/target/pyopencl/__init__.py
+++ b/loopy/target/pyopencl/__init__.py
@@ -272,8 +272,13 @@ class PyOpenCLTarget(OpenCLTarget):
         return vec.types[base, count]
 
     def alignment_requirement(self, type_decl):
-        import pyopencl._pvt_struct as _struct
-        return _struct.calcsize(type_decl.struct_format())
+        from struct import calcsize
+
+        # "F" and "D" (presumably single/double complex -- TODO confirm) are
+        # spelled out as pairs of floats/doubles before sizing the format.
+        struct_fmt = type_decl.struct_format()
+        struct_fmt = struct_fmt.replace("F", "ff").replace("D", "dd")
+        return calcsize(struct_fmt)
 
 # }}}
 
diff --git a/test/test_dg.py b/test/test_dg.py
index 581562da89210ea476700191c6d21ad2dbe7fd3d..0eb5be224d23fb295b229b3913ef479dc519e9fa 100644
--- a/test/test_dg.py
+++ b/test/test_dg.py
@@ -43,10 +43,10 @@ def test_dg_volume(ctx_factory):
 
     order = "F"
 
-    N = 3
-    Np = (N+1)*(N+2)*(N+3)//6
+    N = 3  # noqa
+    Np = (N+1)*(N+2)*(N+3)//6  # noqa
 
-    K = 10000
+    K = 10000  # noqa
 
     knl = lp.make_kernel([
             "{[n,m,k]: 0<= n,m < Np and 0<= k < K}",
@@ -146,16 +146,21 @@ def test_dg_volume(ctx_factory):
 
     parameters_dict = dict(K=K)
 
-    for variant in [
+    variants = [
             variant_basic,
             variant_more_per_work_group,
-            variant_image_d,
             variant_prefetch_d,
             variant_prefetch_fields,
             variant_k_ilp,
             variant_simple_padding,
             variant_fancy_padding
-            ]:
+            ]
+
+    if (ctx.devices[0].image_support
+            and ctx.devices[0].platform.name != "Portable Computing Language"):
+        variants.append(variant_image_d)
+
+    for variant in variants:
         lp.auto_test_vs_ref(
                 seq_knl, ctx, variant(knl), parameters=parameters_dict,
                 #codegen_kwargs=dict(with_annotation=True)
@@ -169,12 +174,12 @@ def no_test_dg_surface(ctx_factory):
 
     order = "F"
 
-    N = 3
-    Np = (N+1)*(N+2)*(N+3)//6
-    Nfp = (N+1)*(N+2)//2
-    Nfaces = 4
+    N = 3  # noqa
+    Np = (N+1)*(N+2)*(N+3)//6  # noqa
+    Nfp = (N+1)*(N+2)//2  # noqa
+    Nfaces = 4  # noqa
 
-    K = 10000
+    K = 10000  # noqa
 
     knl = lp.make_kernel(
             [
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 9dbedf320eec358d29245e35ab009ded910f1b5c..c019eb67fbaba5e6d8983665b67002837225d9ad 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -422,6 +422,10 @@ def test_magma_fermi_matrix_mul(ctx_factory):
 
     n = get_suitable_size(ctx)
 
+    if (not ctx.devices[0].image_support
+            or ctx.devices[0].platform.name == "Portable Computing Language"):
+        pytest.skip("crashes on pocl")
+
     image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT)
     if image_format not in cl.get_supported_image_formats(
             ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D):
@@ -469,6 +473,10 @@ def test_image_matrix_mul(ctx_factory):
 
     n = get_suitable_size(ctx)
 
+    if (not ctx.devices[0].image_support
+            or ctx.devices[0].platform.name == "Portable Computing Language"):
+        pytest.skip("crashes on pocl")
+
     image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT)
     if image_format not in cl.get_supported_image_formats(
             ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D):
@@ -505,6 +513,10 @@ def test_image_matrix_mul_ilp(ctx_factory):
     ctx = ctx_factory()
     order = "C"
 
+    if (not ctx.devices[0].image_support
+            or ctx.devices[0].platform.name == "Portable Computing Language"):
+        pytest.skip("crashes on pocl")
+
     image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT)
     if image_format not in cl.get_supported_image_formats(
             ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D):
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 22c0ce47c8cfc2aea051b63d5f50603f0d406f70..1fa35101d93066c5de9a0539db17d337305be2e0 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -379,6 +379,10 @@ def test_stencil_with_overfetch(ctx_factory):
                 ],
             assumptions="n>=1")
 
+    if ctx.devices[0].platform.name == "Portable Computing Language":
+        # https://github.com/pocl/pocl/issues/205
+        pytest.skip("takes very long to compile on pocl")
+
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
 
     ref_knl = knl