From ef60e3273fdcd5658d799c36d85b8c0889e7fe99 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 5 Jul 2015 13:46:44 -0500
Subject: [PATCH] Compatibility with POCL, work around broken POCL calling
 conventions

---
 .gitlab-ci.yml                    |  12 +++
 loopy/compiled.py                 | 148 +++++++++++++++++++++++++-----
 loopy/target/pyopencl/__init__.py |   9 +-
 test/test_dg.py                   |  27 +++---
 test/test_linalg.py               |  12 +++
 test/test_loopy.py                |   4 +
 6 files changed, 175 insertions(+), 37 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index fc5a1ca68..e71ea2c6d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -23,6 +23,18 @@ Python 3.4 AMD CPU:
   - amd-cl-cpu
   except:
   - tags
+Python 2.7 POCL:
+  script:
+  - export PY_EXE=python2.7
+  - export PYOPENCL_TEST=portable
+  - export EXTRA_INSTALL="numpy mako"
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
+  - ". ./build-and-test-py-project.sh"
+  tags:
+  - python2.7
+  - pocl
+  except:
+  - tags
 # PyPy AMD CPU:
 #   script:
 #   - export PY_EXE=pypy
diff --git a/loopy/compiled.py b/loopy/compiled.py
index c5928d6e7..da659eaba 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -27,6 +27,7 @@ THE SOFTWARE.
 """
 
 
+import sys
 import numpy as np
 from pytools import Record, memoize_method
 from loopy.diagnostic import ParameterFinderWarning
@@ -296,46 +297,141 @@ def generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, option
 
 # {{{ value arg setup
 
-def generate_value_arg_setup(gen, kernel, impl_arg_info, options):
+def generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options):
     import loopy as lp
     from loopy.kernel.array import ArrayBase
 
+    # {{{ arg counting bug handling
+
+    # Some devices mishandle struct kernel arguments, for example:
+    # https://github.com/pocl/pocl/issues/197
+    # (but Apple CPU has a similar bug)
+
+    work_around_arg_count_bug = False
+    warn_about_arg_count_bug = False
+
+    from pyopencl.characterize import has_struct_arg_count_bug
+
+    devices = cl_kernel.context.devices
+
+    count_bug_per_dev = [
+            has_struct_arg_count_bug(dev)
+            for dev in devices]
+
+    if any(count_bug_per_dev):
+        if all(count_bug_per_dev):
+            work_around_arg_count_bug = True
+        else:
+            warn_about_arg_count_bug = True
+
+    # }}}
+
+    cl_arg_idx = 0
+    arg_idx_to_cl_arg_idx = {}
+    # floating-point scalar values passed so far (a complex counts as two)
+    fp_arg_count = 0
+
     for arg_idx, arg in enumerate(impl_arg_info):
+        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx
+
         if arg.arg_class is not lp.ValueArg:
             assert issubclass(arg.arg_class, ArrayBase)
+
+            # assume each array arg occupies exactly one CL argument slot
+            cl_arg_idx += 1
+
             continue
 
         gen("# {{{ process %s" % arg.name)
         gen("")
 
         if not options.skip_arg_checks:
-            gen("if %s is None:" % arg.name)
-            with Indentation(gen):
-                gen("raise RuntimeError(\"input argument '%s' must "
-                        "be supplied\")" % arg.name)
-                gen("")
-
-        if arg.dtype.kind == "i":
-            gen("# cast to int to avoid numpy scalar trouble with Boost.Python")
-            gen("%s = int(%s)" % (arg.name, arg.name))
+            gen("""
+                if {name} is None:
+                    raise RuntimeError("input argument '{name}' must "
+                        "be supplied")
+                """.format(name=arg.name))
+
+        if sys.version_info < (2, 7) and arg.dtype.kind == "i":
+            gen("# cast to long to avoid trouble with struct packing")
+            gen("%s = long(%s)" % (arg.name, arg.name))
             gen("")
 
         if arg.dtype.char == "V":
-            gen("cl_kernel.set_arg(%d, %s)" % (arg_idx, arg.name))
+            gen("cl_kernel.set_arg(%d, %s)" % (cl_arg_idx, arg.name))
+            cl_arg_idx += 1
+
+        elif arg.dtype.kind == "c":
+            if warn_about_arg_count_bug:
+                from warnings import warn
+                warn("{knl_name}: arguments include complex numbers, and "
+                        "some (but not all) of the target devices mishandle "
+                        "struct kernel arguments (hence the workaround is "
+                        "disabled)".format(
+                            knl_name=kernel.name))
+
+            if arg.dtype == np.complex64:
+                arg_char = "f"
+            elif arg.dtype == np.complex128:
+                arg_char = "d"
+            else:
+                raise TypeError("unexpected complex type: %s" % arg.dtype)
+
+            if (work_around_arg_count_bug
+                    and arg.dtype == np.complex128
+                    and fp_arg_count + 2 <= 8):
+                gen(
+                        "buf = _lpy_pack('{arg_char}', {arg_var}.real)"
+                        .format(arg_char=arg_char, arg_var=arg.name))
+                gen(
+                        "cl_kernel.set_arg({cl_arg_idx}, buf)"
+                        .format(cl_arg_idx=cl_arg_idx))
+                cl_arg_idx += 1
+
+                gen(
+                        "buf = _lpy_pack('{arg_char}', {arg_var}.imag)"
+                        .format(arg_char=arg_char, arg_var=arg.name))
+                gen(
+                        "cl_kernel.set_arg({cl_arg_idx}, buf)"
+                        .format(cl_arg_idx=cl_arg_idx))
+                cl_arg_idx += 1
+            else:
+                gen(
+                        "buf = _lpy_pack('{arg_char}{arg_char}', "
+                        "{arg_var}.real, {arg_var}.imag)"
+                        .format(arg_char=arg_char, arg_var=arg.name))
+                gen(
+                        "cl_kernel.set_arg({cl_arg_idx}, buf)"
+                        .format(cl_arg_idx=cl_arg_idx))
+                cl_arg_idx += 1
+
+            fp_arg_count += 2
+
         else:
-            gen("cl_kernel.set_arg(%d, _lpy_pack(\"%s\", %s))"
-                    % (arg_idx, arg.dtype.char, arg.name))
+            if arg.dtype.kind == "f":
+                fp_arg_count += 1
+
+            gen("cl_kernel.set_arg(%d, _lpy_pack('%s', %s))"
+                    % (cl_arg_idx, arg.dtype.char, arg.name))
+
+            cl_arg_idx += 1
+
         gen("")
 
         gen("# }}}")
         gen("")
 
+    assert cl_arg_idx == cl_kernel.num_args
+
+    return arg_idx_to_cl_arg_idx
+
 # }}}
 
 
 # {{{ array arg setup
 
-def generate_array_arg_setup(gen, kernel, impl_arg_info, options):
+def generate_array_arg_setup(gen, kernel, impl_arg_info, options,
+        arg_idx_to_cl_arg_idx):
     import loopy as lp
 
     from loopy.kernel.array import ArrayBase
@@ -356,12 +452,12 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options):
         is_written = arg.base_name in kernel.get_written_variables()
         kernel_arg = kernel.impl_arg_to_arg.get(arg.name)
 
-        gen("# {{{ process %s" % arg.name)
-        gen("")
-
         if not issubclass(arg.arg_class, ArrayBase):
             continue
 
+        gen("# {{{ process %s" % arg.name)
+        gen("")
+
         if not options.no_numpy:
             gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name)
             with Indentation(gen):
@@ -552,10 +648,12 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options):
             gen("del _lpy_made_by_loopy")
             gen("")
 
+        cl_arg_idx = arg_idx_to_cl_arg_idx[arg_idx]
+
         if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]:
-            gen("cl_kernel.set_arg(%d, %s.base_data)" % (arg_idx, arg.name))
+            gen("cl_kernel.set_arg(%d, %s.base_data)" % (cl_arg_idx, arg.name))
         else:
-            gen("cl_kernel.set_arg(%d, %s)" % (arg_idx, arg.name))
+            gen("cl_kernel.set_arg(%d, %s)" % (cl_arg_idx, arg.name))
         gen("")
 
         gen("# }}}")
@@ -567,7 +665,7 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options):
 # }}}
 
 
-def generate_invoker(kernel, impl_arg_info, options):
+def generate_invoker(kernel, cl_kernel, impl_arg_info, options):
     system_args = [
             "cl_kernel", "queue", "allocator=None", "wait_for=None",
             # ignored if options.no_numpy
@@ -584,7 +682,7 @@ def generate_invoker(kernel, impl_arg_info, options):
     gen.add_to_preamble("import pyopencl.array as _lpy_cl_array")
     gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools")
     gen.add_to_preamble("import numpy as _lpy_np")
-    gen.add_to_preamble("from pyopencl._pvt_struct import pack as _lpy_pack")
+    gen.add_to_preamble("from struct import pack as _lpy_pack")
     gen.add_to_preamble("")
 
     gen("if allocator is None:")
@@ -596,8 +694,10 @@ def generate_invoker(kernel, impl_arg_info, options):
     generate_integer_arg_finding_from_offsets(gen, kernel, impl_arg_info, options)
     generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, options)
 
-    generate_value_arg_setup(gen, kernel, impl_arg_info, options)
-    generate_array_arg_setup(gen, kernel, impl_arg_info, options)
+    arg_idx_to_cl_arg_idx = \
+            generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options)
+    generate_array_arg_setup(gen, kernel, impl_arg_info, options,
+            arg_idx_to_cl_arg_idx)
 
     # {{{ generate invocation
 
@@ -763,7 +863,7 @@ class CompiledKernel:
                 cl_kernel=cl_kernel,
                 impl_arg_info=impl_arg_info,
                 invoker=generate_invoker(
-                    kernel, impl_arg_info, self.kernel.options))
+                    kernel, cl_kernel, impl_arg_info, self.kernel.options))
 
     # {{{ debugging aids
 
diff --git a/loopy/target/pyopencl/__init__.py b/loopy/target/pyopencl/__init__.py
index 174506cd6..ee9366800 100644
--- a/loopy/target/pyopencl/__init__.py
+++ b/loopy/target/pyopencl/__init__.py
@@ -272,8 +272,13 @@ class PyOpenCLTarget(OpenCLTarget):
         return vec.types[base, count]
 
     def alignment_requirement(self, type_decl):
-        import pyopencl._pvt_struct as _struct
-        return _struct.calcsize(type_decl.struct_format())
+        import struct
+        # "F"/"D" (presumably complex float/double) are sized as "ff"/"dd"
+        fmt = (type_decl.struct_format()
+                .replace("F", "ff")
+                .replace("D", "dd"))
+
+        return struct.calcsize(fmt)
 
 # }}}
 
diff --git a/test/test_dg.py b/test/test_dg.py
index 581562da8..0eb5be224 100644
--- a/test/test_dg.py
+++ b/test/test_dg.py
@@ -43,10 +43,10 @@ def test_dg_volume(ctx_factory):
 
     order = "F"
 
-    N = 3
-    Np = (N+1)*(N+2)*(N+3)//6
+    N = 3  # noqa
+    Np = (N+1)*(N+2)*(N+3)//6  # noqa
 
-    K = 10000
+    K = 10000  # noqa
 
     knl = lp.make_kernel([
             "{[n,m,k]: 0<= n,m < Np and 0<= k < K}",
@@ -146,16 +146,21 @@ def test_dg_volume(ctx_factory):
 
     parameters_dict = dict(K=K)
 
-    for variant in [
+    variants = [
             variant_basic,
             variant_more_per_work_group,
-            variant_image_d,
             variant_prefetch_d,
             variant_prefetch_fields,
             variant_k_ilp,
             variant_simple_padding,
             variant_fancy_padding
-            ]:
+            ]
+
+    if (ctx.devices[0].image_support
+            and ctx.devices[0].platform.name != "Portable Computing Language"):
+        variants.append(variant_image_d)
+
+    for variant in variants:
         lp.auto_test_vs_ref(
                 seq_knl, ctx, variant(knl), parameters=parameters_dict,
                 #codegen_kwargs=dict(with_annotation=True)
@@ -169,12 +174,12 @@ def no_test_dg_surface(ctx_factory):
 
     order = "F"
 
-    N = 3
-    Np = (N+1)*(N+2)*(N+3)//6
-    Nfp = (N+1)*(N+2)//2
-    Nfaces = 4
+    N = 3  # noqa
+    Np = (N+1)*(N+2)*(N+3)//6  # noqa
+    Nfp = (N+1)*(N+2)//2  # noqa
+    Nfaces = 4  # noqa
 
-    K = 10000
+    K = 10000  # noqa
 
     knl = lp.make_kernel(
             [
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 9dbedf320..c019eb67f 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -422,6 +422,10 @@ def test_magma_fermi_matrix_mul(ctx_factory):
 
     n = get_suitable_size(ctx)
 
+    if (not ctx.devices[0].image_support
+            or ctx.devices[0].platform.name == "Portable Computing Language"):
+        pytest.skip("crashes on pocl")
+
     image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT)
     if image_format not in cl.get_supported_image_formats(
             ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D):
@@ -469,6 +473,10 @@ def test_image_matrix_mul(ctx_factory):
 
     n = get_suitable_size(ctx)
 
+    if (not ctx.devices[0].image_support
+            or ctx.devices[0].platform.name == "Portable Computing Language"):
+        pytest.skip("crashes on pocl")
+
     image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT)
     if image_format not in cl.get_supported_image_formats(
             ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D):
@@ -505,6 +513,10 @@ def test_image_matrix_mul_ilp(ctx_factory):
     ctx = ctx_factory()
     order = "C"
 
+    if (not ctx.devices[0].image_support
+            or ctx.devices[0].platform.name == "Portable Computing Language"):
+        pytest.skip("crashes on pocl")
+
     image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT)
     if image_format not in cl.get_supported_image_formats(
             ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D):
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 22c0ce47c..1fa35101d 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -379,6 +379,10 @@ def test_stencil_with_overfetch(ctx_factory):
                 ],
             assumptions="n>=1")
 
+    if ctx.devices[0].platform.name == "Portable Computing Language":
+        # https://github.com/pocl/pocl/issues/205
+        pytest.skip("takes very long to compile on pocl")
+
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
 
     ref_knl = knl
-- 
GitLab