diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index fc5a1ca68681835a6786980a175fbb8d57c4d453..e71ea2c6d053188c0e2211fdf7868c6a75cc9af0 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -23,6 +23,18 @@ Python 3.4 AMD CPU: - amd-cl-cpu except: - tags +Python 2.7 POCL: + script: + - export PY_EXE=python2.7 + - export PYOPENCL_TEST=portable + - export EXTRA_INSTALL="numpy mako" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - ". ./build-and-test-py-project.sh" + tags: + - python2.7 + - pocl + except: + - tags # PyPy AMD CPU: # script: # - export PY_EXE=pypy diff --git a/loopy/compiled.py b/loopy/compiled.py index c5928d6e7cd96b5e60d60b0c5dd13e21e33fb67f..da659eaba5e3c7c8c99993946da1e7af5bb399bc 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -27,6 +27,7 @@ THE SOFTWARE. """ +import sys import numpy as np from pytools import Record, memoize_method from loopy.diagnostic import ParameterFinderWarning @@ -296,46 +297,141 @@ def generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, option # {{{ value arg setup -def generate_value_arg_setup(gen, kernel, impl_arg_info, options): +def generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options): import loopy as lp from loopy.kernel.array import ArrayBase + # {{{ arg counting bug handling + + # For example: + # https://github.com/pocl/pocl/issues/197 + # (but Apple CPU has a similar bug) + + work_around_arg_count_bug = False + warn_about_arg_count_bug = False + + from pyopencl.characterize import has_struct_arg_count_bug + + devices = cl_kernel.context.devices + + count_bug_per_dev = [ + has_struct_arg_count_bug(dev) + for dev in devices] + + if any(count_bug_per_dev): + if all(count_bug_per_dev): + work_around_arg_count_bug = True + else: + warn_about_arg_count_bug = True + + # }}} + + cl_arg_idx = 0 + arg_idx_to_cl_arg_idx = {} + + fp_arg_count = 0 + for arg_idx, arg in enumerate(impl_arg_info): + arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx + if arg.arg_class is not lp.ValueArg: assert issubclass(arg.arg_class, ArrayBase) + + # assume each of those generates exactly one... + cl_arg_idx += 1 + continue gen("# {{{ process %s" % arg.name) gen("") if not options.skip_arg_checks: - gen("if %s is None:" % arg.name) - with Indentation(gen): - gen("raise RuntimeError(\"input argument '%s' must " - "be supplied\")" % arg.name) - gen("") - - if arg.dtype.kind == "i": - gen("# cast to int to avoid numpy scalar trouble with Boost.Python") - gen("%s = int(%s)" % (arg.name, arg.name)) + gen(""" + if {name} is None: + raise RuntimeError("input argument '{name}' must " + "be supplied") + """.format(name=arg.name)) + + if sys.version_info < (2, 7) and arg.dtype.kind == "i": + gen("# cast to long to avoid trouble with struct packing") + gen("%s = long(%s)" % (arg.name, arg.name)) gen("") if arg.dtype.char == "V": - gen("cl_kernel.set_arg(%d, %s)" % (arg_idx, arg.name)) + gen("cl_kernel.set_arg(%d, %s)" % (cl_arg_idx, arg.name)) + cl_arg_idx += 1 + + elif arg.dtype.kind == "c": + if warn_about_arg_count_bug: + from warnings import warn + warn("{knl_name}: arguments include complex numbers, and " + "some (but not all) of the target devices mishandle " + "struct kernel arguments (hence the workaround is " + "disabled".format( + knl_name=kernel.name)) + + if arg.dtype == np.complex64: + arg_char = "f" + elif arg.dtype == np.complex128: + arg_char = "d" + else: + raise TypeError("unexpected complex type: %s" % arg.dtype) + + if (work_around_arg_count_bug + and arg.dtype == np.complex128 + and fp_arg_count + 2 <= 8): + gen( + "buf = _lpy_pack('{arg_char}', {arg_var}.real)" + .format(arg_char=arg_char, arg_var=arg.name)) + gen( + "cl_kernel.set_arg({cl_arg_idx}, buf)" + .format(cl_arg_idx=cl_arg_idx)) + cl_arg_idx += 1 + + gen( + "buf = _lpy_pack('{arg_char}', {arg_var}.imag)" + .format(arg_char=arg_char, arg_var=arg.name)) + gen( + "cl_kernel.set_arg({cl_arg_idx}, buf)" + .format(cl_arg_idx=cl_arg_idx)) + cl_arg_idx += 1 + else: + gen( + "buf = _lpy_pack('{arg_char}{arg_char}', " + "{arg_var}.real, {arg_var}.imag)" + .format(arg_char=arg_char, arg_var=arg.name)) + gen( + "cl_kernel.set_arg({cl_arg_idx}, buf)" + .format(cl_arg_idx=cl_arg_idx)) + cl_arg_idx += 1 + + fp_arg_count += 2 + else: - gen("cl_kernel.set_arg(%d, _lpy_pack(\"%s\", %s))" - % (arg_idx, arg.dtype.char, arg.name)) + if arg.dtype.kind == "f": + fp_arg_count += 1 + + gen("cl_kernel.set_arg(%d, _lpy_pack('%s', %s))" + % (cl_arg_idx, arg.dtype.char, arg.name)) + + cl_arg_idx += 1 + gen("") gen("# }}}") gen("") + assert cl_arg_idx == cl_kernel.num_args + + return arg_idx_to_cl_arg_idx + # }}} # {{{ array arg setup -def generate_array_arg_setup(gen, kernel, impl_arg_info, options): +def generate_array_arg_setup(gen, kernel, impl_arg_info, options, + arg_idx_to_cl_arg_idx): import loopy as lp from loopy.kernel.array import ArrayBase @@ -356,12 +452,12 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options): is_written = arg.base_name in kernel.get_written_variables() kernel_arg = kernel.impl_arg_to_arg.get(arg.name) - gen("# {{{ process %s" % arg.name) - gen("") - if not issubclass(arg.arg_class, ArrayBase): continue + gen("# {{{ process %s" % arg.name) + gen("") + if not options.no_numpy: gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name) with Indentation(gen): @@ -552,10 +648,12 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options): gen("del _lpy_made_by_loopy") gen("") + cl_arg_idx = arg_idx_to_cl_arg_idx[arg_idx] + if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]: - gen("cl_kernel.set_arg(%d, %s.base_data)" % (arg_idx, arg.name)) + gen("cl_kernel.set_arg(%d, %s.base_data)" % (cl_arg_idx, arg.name)) else: - gen("cl_kernel.set_arg(%d, %s)" % (arg_idx, arg.name)) + gen("cl_kernel.set_arg(%d, %s)" % (cl_arg_idx, arg.name)) gen("") gen("# }}}") @@ -567,7 +665,7 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options): # }}} -def generate_invoker(kernel, impl_arg_info, options): +def generate_invoker(kernel, cl_kernel, impl_arg_info, options): system_args = [ "cl_kernel", "queue", "allocator=None", "wait_for=None", # ignored if options.no_numpy @@ -584,7 +682,7 @@ def generate_invoker(kernel, impl_arg_info, options): gen.add_to_preamble("import pyopencl.array as _lpy_cl_array") gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools") gen.add_to_preamble("import numpy as _lpy_np") - gen.add_to_preamble("from pyopencl._pvt_struct import pack as _lpy_pack") + gen.add_to_preamble("from struct import pack as _lpy_pack") gen.add_to_preamble("") gen("if allocator is None:") @@ -596,8 +694,10 @@ def generate_invoker(kernel, impl_arg_info, options): generate_integer_arg_finding_from_offsets(gen, kernel, impl_arg_info, options) generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, options) - generate_value_arg_setup(gen, kernel, impl_arg_info, options) - generate_array_arg_setup(gen, kernel, impl_arg_info, options) + arg_idx_to_cl_arg_idx = \ + generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options) + generate_array_arg_setup(gen, kernel, impl_arg_info, options, + arg_idx_to_cl_arg_idx) # {{{ generate invocation @@ -763,7 +863,7 @@ class CompiledKernel: cl_kernel=cl_kernel, impl_arg_info=impl_arg_info, invoker=generate_invoker( - kernel, impl_arg_info, self.kernel.options)) + kernel, cl_kernel, impl_arg_info, self.kernel.options)) # {{{ debugging aids diff --git a/loopy/target/pyopencl/__init__.py b/loopy/target/pyopencl/__init__.py index 174506cd65a81b405053c212c9683f9dd2df2cc1..ee936680016b6808723076034c8486a49544e2bc 100644 --- a/loopy/target/pyopencl/__init__.py +++ b/loopy/target/pyopencl/__init__.py @@ -272,8 +272,13 @@ class PyOpenCLTarget(OpenCLTarget): return vec.types[base, count] def alignment_requirement(self, type_decl): - import pyopencl._pvt_struct as _struct - return _struct.calcsize(type_decl.struct_format()) + import struct + + fmt = (type_decl.struct_format() + .replace("F", "ff") + .replace("D", "dd")) + + return struct.calcsize(fmt) # }}} diff --git a/test/test_dg.py b/test/test_dg.py index 581562da89210ea476700191c6d21ad2dbe7fd3d..0eb5be224d23fb295b229b3913ef479dc519e9fa 100644 --- a/test/test_dg.py +++ b/test/test_dg.py @@ -43,10 +43,10 @@ def test_dg_volume(ctx_factory): order = "F" - N = 3 - Np = (N+1)*(N+2)*(N+3)//6 + N = 3 # noqa + Np = (N+1)*(N+2)*(N+3)//6 # noqa - K = 10000 + K = 10000 # noqa knl = lp.make_kernel([ "{[n,m,k]: 0<= n,m < Np and 0<= k < K}", @@ -146,16 +146,21 @@ def test_dg_volume(ctx_factory): parameters_dict = dict(K=K) - for variant in [ + variants = [ variant_basic, variant_more_per_work_group, - variant_image_d, variant_prefetch_d, variant_prefetch_fields, variant_k_ilp, variant_simple_padding, variant_fancy_padding - ]: + ] + + if (ctx.devices[0].image_support + and ctx.devices[0].platform.name != "Portable Computing Language"): + variants.append(variant_image_d) + + for variant in variants: lp.auto_test_vs_ref( seq_knl, ctx, variant(knl), parameters=parameters_dict, #codegen_kwargs=dict(with_annotation=True) @@ -169,12 +174,12 @@ def no_test_dg_surface(ctx_factory): order = "F" - N = 3 - Np = (N+1)*(N+2)*(N+3)//6 - Nfp = (N+1)*(N+2)//2 - Nfaces = 4 + N = 3 # noqa + Np = (N+1)*(N+2)*(N+3)//6 # noqa + Nfp = (N+1)*(N+2)//2 # noqa + Nfaces = 4 # noqa - K = 10000 + K = 10000 # noqa knl = lp.make_kernel( [ diff --git a/test/test_linalg.py b/test/test_linalg.py index 9dbedf320eec358d29245e35ab009ded910f1b5c..c019eb67fbaba5e6d8983665b67002837225d9ad 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -422,6 +422,10 @@ def test_magma_fermi_matrix_mul(ctx_factory): n = get_suitable_size(ctx) + if (not ctx.devices[0].image_support + or ctx.devices[0].platform.name == "Portable Computing Language"): + pytest.skip("crashes on pocl") + image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT) if image_format not in cl.get_supported_image_formats( ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D): @@ -469,6 +473,10 @@ def test_image_matrix_mul(ctx_factory): n = get_suitable_size(ctx) + if (not ctx.devices[0].image_support + or ctx.devices[0].platform.name == "Portable Computing Language"): + pytest.skip("crashes on pocl") + image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT) if image_format not in cl.get_supported_image_formats( ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D): @@ -505,6 +513,10 @@ def test_image_matrix_mul_ilp(ctx_factory): ctx = ctx_factory() order = "C" + if (not ctx.devices[0].image_support + or ctx.devices[0].platform.name == "Portable Computing Language"): + pytest.skip("crashes on pocl") + image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT) if image_format not in cl.get_supported_image_formats( ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D): diff --git a/test/test_loopy.py b/test/test_loopy.py index 22c0ce47c8cfc2aea051b63d5f50603f0d406f70..1fa35101d93066c5de9a0539db17d337305be2e0 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -379,6 +379,10 @@ def test_stencil_with_overfetch(ctx_factory): ], assumptions="n>=1") + if ctx.devices[0].platform.name == "Portable Computing Language": + # https://github.com/pocl/pocl/issues/205 + pytest.skip("takes very long to compile on pocl") + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) ref_knl = knl