From ef60e3273fdcd5658d799c36d85b8c0889e7fe99 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Sun, 5 Jul 2015 13:46:44 -0500 Subject: [PATCH] Compatibility with POCL, work around broken POCL calling conventions --- .gitlab-ci.yml | 12 +++ loopy/compiled.py | 148 +++++++++++++++++++++++++----- loopy/target/pyopencl/__init__.py | 9 +- test/test_dg.py | 27 +++--- test/test_linalg.py | 12 +++ test/test_loopy.py | 4 + 6 files changed, 175 insertions(+), 37 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index fc5a1ca68..e71ea2c6d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -23,6 +23,18 @@ Python 3.4 AMD CPU: - amd-cl-cpu except: - tags +Python 2.7 POCL: + script: + - export PY_EXE=python2.7 + - export PYOPENCL_TEST=portable + - export EXTRA_INSTALL="numpy mako" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - ". ./build-and-test-py-project.sh" + tags: + - python2.7 + - pocl + except: + - tags # PyPy AMD CPU: # script: # - export PY_EXE=pypy diff --git a/loopy/compiled.py b/loopy/compiled.py index c5928d6e7..da659eaba 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -27,6 +27,7 @@ THE SOFTWARE. """ +import sys import numpy as np from pytools import Record, memoize_method from loopy.diagnostic import ParameterFinderWarning @@ -296,46 +297,141 @@ def generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, option # {{{ value arg setup -def generate_value_arg_setup(gen, kernel, impl_arg_info, options): +def generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options): import loopy as lp from loopy.kernel.array import ArrayBase + # {{{ arg counting bug handling + + # For example: + # https://github.com/pocl/pocl/issues/197 + # (but Apple CPU has a similar bug) + + work_around_arg_count_bug = False + warn_about_arg_count_bug = False + + from pyopencl.characterize import has_struct_arg_count_bug + + devices = cl_kernel.context.devices + + count_bug_per_dev = [ + has_struct_arg_count_bug(dev) + for dev in devices] + + if any(count_bug_per_dev): + if all(count_bug_per_dev): + work_around_arg_count_bug = True + else: + warn_about_arg_count_bug = True + + # }}} + + cl_arg_idx = 0 + arg_idx_to_cl_arg_idx = {} + + fp_arg_count = 0 + for arg_idx, arg in enumerate(impl_arg_info): + arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx + if arg.arg_class is not lp.ValueArg: assert issubclass(arg.arg_class, ArrayBase) + + # assume each of those generates exactly one... + cl_arg_idx += 1 + continue gen("# {{{ process %s" % arg.name) gen("") if not options.skip_arg_checks: - gen("if %s is None:" % arg.name) - with Indentation(gen): - gen("raise RuntimeError(\"input argument '%s' must " - "be supplied\")" % arg.name) - gen("") - - if arg.dtype.kind == "i": - gen("# cast to int to avoid numpy scalar trouble with Boost.Python") - gen("%s = int(%s)" % (arg.name, arg.name)) + gen(""" + if {name} is None: + raise RuntimeError("input argument '{name}' must " + "be supplied") + """.format(name=arg.name)) + + if sys.version_info < (2, 7) and arg.dtype.kind == "i": + gen("# cast to long to avoid trouble with struct packing") + gen("%s = long(%s)" % (arg.name, arg.name)) gen("") if arg.dtype.char == "V": - gen("cl_kernel.set_arg(%d, %s)" % (arg_idx, arg.name)) + gen("cl_kernel.set_arg(%d, %s)" % (cl_arg_idx, arg.name)) + cl_arg_idx += 1 + + elif arg.dtype.kind == "c": + if warn_about_arg_count_bug: + from warnings import warn + warn("{knl_name}: arguments include complex numbers, and " + "some (but not all) of the target devices mishandle " + "struct kernel arguments (hence the workaround is " + "disabled".format( + knl_name=kernel.name)) + + if arg.dtype == np.complex64: + arg_char = "f" + elif arg.dtype == np.complex128: + arg_char = "d" + else: + raise TypeError("unexpected complex type: %s" % arg.dtype) + + if (work_around_arg_count_bug + and arg.dtype == np.complex128 + and fp_arg_count + 2 <= 8): + gen( + "buf = _lpy_pack('{arg_char}', {arg_var}.real)" + .format(arg_char=arg_char, arg_var=arg.name)) + gen( + "cl_kernel.set_arg({cl_arg_idx}, buf)" + .format(cl_arg_idx=cl_arg_idx)) + cl_arg_idx += 1 + + gen( + "buf = _lpy_pack('{arg_char}', {arg_var}.imag)" + .format(arg_char=arg_char, arg_var=arg.name)) + gen( + "cl_kernel.set_arg({cl_arg_idx}, buf)" + .format(cl_arg_idx=cl_arg_idx)) + cl_arg_idx += 1 + else: + gen( + "buf = _lpy_pack('{arg_char}{arg_char}', " + "{arg_var}.real, {arg_var}.imag)" + .format(arg_char=arg_char, arg_var=arg.name)) + gen( + "cl_kernel.set_arg({cl_arg_idx}, buf)" + .format(cl_arg_idx=cl_arg_idx)) + cl_arg_idx += 1 + + fp_arg_count += 2 + else: - gen("cl_kernel.set_arg(%d, _lpy_pack(\"%s\", %s))" - % (arg_idx, arg.dtype.char, arg.name)) + if arg.dtype.kind == "f": + fp_arg_count += 1 + + gen("cl_kernel.set_arg(%d, _lpy_pack('%s', %s))" + % (cl_arg_idx, arg.dtype.char, arg.name)) + + cl_arg_idx += 1 + gen("") gen("# }}}") gen("") + assert cl_arg_idx == cl_kernel.num_args + + return arg_idx_to_cl_arg_idx + # }}} # {{{ array arg setup -def generate_array_arg_setup(gen, kernel, impl_arg_info, options): +def generate_array_arg_setup(gen, kernel, impl_arg_info, options, + arg_idx_to_cl_arg_idx): import loopy as lp from loopy.kernel.array import ArrayBase @@ -356,12 +452,12 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options): is_written = arg.base_name in kernel.get_written_variables() kernel_arg = kernel.impl_arg_to_arg.get(arg.name) - gen("# {{{ process %s" % arg.name) - gen("") - if not issubclass(arg.arg_class, ArrayBase): continue + gen("# {{{ process %s" % arg.name) + gen("") + if not options.no_numpy: gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name) with Indentation(gen): @@ -552,10 +648,12 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options): gen("del _lpy_made_by_loopy") gen("") + cl_arg_idx = arg_idx_to_cl_arg_idx[arg_idx] + if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]: - gen("cl_kernel.set_arg(%d, %s.base_data)" % (arg_idx, arg.name)) + gen("cl_kernel.set_arg(%d, %s.base_data)" % (cl_arg_idx, arg.name)) else: - gen("cl_kernel.set_arg(%d, %s)" % (arg_idx, arg.name)) + gen("cl_kernel.set_arg(%d, %s)" % (cl_arg_idx, arg.name)) gen("") gen("# }}}") @@ -567,7 +665,7 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options): # }}} -def generate_invoker(kernel, impl_arg_info, options): +def generate_invoker(kernel, cl_kernel, impl_arg_info, options): system_args = [ "cl_kernel", "queue", "allocator=None", "wait_for=None", # ignored if options.no_numpy @@ -584,7 +682,7 @@ def generate_invoker(kernel, impl_arg_info, options): gen.add_to_preamble("import pyopencl.array as _lpy_cl_array") gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools") gen.add_to_preamble("import numpy as _lpy_np") - gen.add_to_preamble("from pyopencl._pvt_struct import pack as _lpy_pack") + gen.add_to_preamble("from struct import pack as _lpy_pack") gen.add_to_preamble("") gen("if allocator is None:") @@ -596,8 +694,10 @@ def generate_invoker(kernel, impl_arg_info, options): generate_integer_arg_finding_from_offsets(gen, kernel, impl_arg_info, options) generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, options) - generate_value_arg_setup(gen, kernel, impl_arg_info, options) - generate_array_arg_setup(gen, kernel, impl_arg_info, options) + arg_idx_to_cl_arg_idx = \ + generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options) + generate_array_arg_setup(gen, kernel, impl_arg_info, options, + arg_idx_to_cl_arg_idx) # {{{ generate invocation @@ -763,7 +863,7 @@ class CompiledKernel: cl_kernel=cl_kernel, impl_arg_info=impl_arg_info, invoker=generate_invoker( - kernel, impl_arg_info, self.kernel.options)) + kernel, cl_kernel, impl_arg_info, self.kernel.options)) # {{{ debugging aids diff --git a/loopy/target/pyopencl/__init__.py b/loopy/target/pyopencl/__init__.py index 174506cd6..ee9366800 100644 --- a/loopy/target/pyopencl/__init__.py +++ b/loopy/target/pyopencl/__init__.py @@ -272,8 +272,13 @@ class PyOpenCLTarget(OpenCLTarget): return vec.types[base, count] def alignment_requirement(self, type_decl): - import pyopencl._pvt_struct as _struct - return _struct.calcsize(type_decl.struct_format()) + import struct + + fmt = (type_decl.struct_format() + .replace("F", "ff") + .replace("D", "dd")) + + return struct.calcsize(fmt) # }}} diff --git a/test/test_dg.py b/test/test_dg.py index 581562da8..0eb5be224 100644 --- a/test/test_dg.py +++ b/test/test_dg.py @@ -43,10 +43,10 @@ def test_dg_volume(ctx_factory): order = "F" - N = 3 - Np = (N+1)*(N+2)*(N+3)//6 + N = 3 # noqa + Np = (N+1)*(N+2)*(N+3)//6 # noqa - K = 10000 + K = 10000 # noqa knl = lp.make_kernel([ "{[n,m,k]: 0<= n,m < Np and 0<= k < K}", @@ -146,16 +146,21 @@ def test_dg_volume(ctx_factory): parameters_dict = dict(K=K) - for variant in [ + variants = [ variant_basic, variant_more_per_work_group, - variant_image_d, variant_prefetch_d, variant_prefetch_fields, variant_k_ilp, variant_simple_padding, variant_fancy_padding - ]: + ] + + if (ctx.devices[0].image_support + and ctx.devices[0].platform.name != "Portable Computing Language"): + variants.append(variant_image_d) + + for variant in variants: lp.auto_test_vs_ref( seq_knl, ctx, variant(knl), parameters=parameters_dict, #codegen_kwargs=dict(with_annotation=True) @@ -169,12 +174,12 @@ def no_test_dg_surface(ctx_factory): order = "F" - N = 3 - Np = (N+1)*(N+2)*(N+3)//6 - Nfp = (N+1)*(N+2)//2 - Nfaces = 4 + N = 3 # noqa + Np = (N+1)*(N+2)*(N+3)//6 # noqa + Nfp = (N+1)*(N+2)//2 # noqa + Nfaces = 4 # noqa - K = 10000 + K = 10000 # noqa knl = lp.make_kernel( [ diff --git a/test/test_linalg.py b/test/test_linalg.py index 9dbedf320..c019eb67f 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -422,6 +422,10 @@ def test_magma_fermi_matrix_mul(ctx_factory): n = get_suitable_size(ctx) + if (not ctx.devices[0].image_support + or ctx.devices[0].platform.name == "Portable Computing Language"): + pytest.skip("crashes on pocl") + image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT) if image_format not in cl.get_supported_image_formats( ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D): @@ -469,6 +473,10 @@ def test_image_matrix_mul(ctx_factory): n = get_suitable_size(ctx) + if (not ctx.devices[0].image_support + or ctx.devices[0].platform.name == "Portable Computing Language"): + pytest.skip("crashes on pocl") + image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT) if image_format not in cl.get_supported_image_formats( ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D): @@ -505,6 +513,10 @@ def test_image_matrix_mul_ilp(ctx_factory): ctx = ctx_factory() order = "C" + if (not ctx.devices[0].image_support + or ctx.devices[0].platform.name == "Portable Computing Language"): + pytest.skip("crashes on pocl") + image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT) if image_format not in cl.get_supported_image_formats( ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D): diff --git a/test/test_loopy.py b/test/test_loopy.py index 22c0ce47c..1fa35101d 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -379,6 +379,10 @@ def test_stencil_with_overfetch(ctx_factory): ], assumptions="n>=1") + if ctx.devices[0].platform.name == "Portable Computing Language": + # https://github.com/pocl/pocl/issues/205 + pytest.skip("takes very long to compile on pocl") + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) ref_knl = knl -- GitLab