diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index fc5a1ca68681835a6786980a175fbb8d57c4d453..e71ea2c6d053188c0e2211fdf7868c6a75cc9af0 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -23,6 +23,18 @@ Python 3.4 AMD CPU: - amd-cl-cpu except: - tags +Python 2.7 POCL: + script: + - export PY_EXE=python2.7 + - export PYOPENCL_TEST=portable + - export EXTRA_INSTALL="numpy mako" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - ". ./build-and-test-py-project.sh" + tags: + - python2.7 + - pocl + except: + - tags # PyPy AMD CPU: # script: # - export PY_EXE=pypy diff --git a/build-helpers/loopy.spec b/build-helpers/loopy.spec index e24a24db9bda96f149e4279a6c37c1603476c3a4..7650624932eb4b6aa95e888410e8a0aa1a6d518a 100644 --- a/build-helpers/loopy.spec +++ b/build-helpers/loopy.spec @@ -9,7 +9,7 @@ a = Analysis(['bin/loopy'], hiddenimports=[], hookspath=None, runtime_hooks=None, - excludes=["hedge", "meshpy", "pyopencl", "pycparser", "PIL"] + excludes=["hedge", "meshpy", "pyopencl", "PIL"] ) pyz = PYZ(a.pure) diff --git a/build-helpers/make-linux-build-docker-inner.sh b/build-helpers/make-linux-build-docker-inner.sh index 43ce1020196a4872e5031ef54af876b71ef33d32..6b72639e0de24ee7e424a6157de2064e5beee978 100755 --- a/build-helpers/make-linux-build-docker-inner.sh +++ b/build-helpers/make-linux-build-docker-inner.sh @@ -8,7 +8,7 @@ cd /tmp/build useradd -d /home/user -m -s /bin/bash user -yum install -y git python-devel tar gcc gcc-c++ mercurial numpy +yum install -y git python-devel tar gcc gcc-c++ mercurial numpy libffi-devel VENV_VERSION="virtualenv-1.9.1" rm -Rf "$VENV_VERSION" @@ -20,7 +20,7 @@ $VENV_VERSION/virtualenv.py --system-site-packages --no-setuptools .env source .env/bin/activate curl -k https://ssl.tiker.net/software/ez_setup.py | python - -curl http://git.tiker.net/pip/blob_plain/77f959a3ce9cc506efbf3a17290d387d0a6624f5:/contrib/get-pip.py | python - +curl -k https://gitlab.tiker.net/inducer/pip/raw/7.0.3/contrib/get-pip.py | python - pip install pyinstaller git clone --recursive git://github.com/inducer/loopy diff --git a/loopy/__init__.py b/loopy/__init__.py index ecadbf4e95b4f7e12d5093c6a21929599fcdcc62..c63aa5f90d6537496ea9fe4ecb11c441070c91b4 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -1495,7 +1495,10 @@ def register_function_manglers(kernel, manglers): # {{{ cache control import os -CACHING_ENABLED = "LOOPY_NO_CACHE" not in os.environ +CACHING_ENABLED = ( + "LOOPY_NO_CACHE" not in os.environ + and + "CG_NO_CACHE" not in os.environ) def set_caching_enabled(flag): diff --git a/loopy/check.py b/loopy/check.py index 477a6336f995c977cfcf5fb52e6d972d95a74468..3401f7b8d86eb8e89d0bdb0629b5e85d57fff771 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -450,11 +450,27 @@ def check_implemented_domains(kernel, implemented_domains, code=None): from islpy import align_two + last_idomains = None + last_insn_inames = None + for insn_id, idomains in six.iteritems(implemented_domains): insn = kernel.id_to_insn[insn_id] assert idomains + insn_inames = kernel.insn_inames(insn) + + # {{{ if we've checked the same thing before, no need to check it again + + if last_idomains is not None and last_insn_inames is not None: + if idomains == last_idomains and insn_inames == last_insn_inames: + continue + + last_idomains = idomains + last_insn_inames = insn_inames + + # }}} + insn_impl_domain = idomains[0] for idomain in idomains[1:]: insn_impl_domain = insn_impl_domain | idomain @@ -463,13 +479,12 @@ def check_implemented_domains(kernel, implemented_domains, code=None): assumption_non_param, insn_impl_domain) insn_impl_domain = ( (insn_impl_domain & assumptions) - .project_out_except(kernel.insn_inames(insn), [dim_type.set])) + .project_out_except(insn_inames, [dim_type.set])) - insn_inames = kernel.insn_inames(insn) insn_domain = kernel.get_inames_domain(insn_inames) assumptions, insn_domain = align_two(assumption_non_param, insn_domain) desired_domain = ((insn_domain & assumptions) - .project_out_except(kernel.insn_inames(insn), [dim_type.set])) + .project_out_except(insn_inames, [dim_type.set])) insn_impl_domain, desired_domain = align_two( insn_impl_domain, desired_domain) @@ -483,13 +498,18 @@ def check_implemented_domains(kernel, implemented_domains, code=None): for i in range(insn_domain.dim(dim_type.param))) lines = [] - for kind, diff_set in [ - ("implemented, but not desired", i_minus_d), - ("desired, but not implemented", d_minus_i)]: + for kind, diff_set, gist_domain in [ + ("implemented, but not desired", i_minus_d, + desired_domain.gist(insn_impl_domain)), + ("desired, but not implemented", d_minus_i, + insn_impl_domain.gist(desired_domain))]: + + if diff_set.is_empty(): + continue + diff_set = diff_set.coalesce() pt = diff_set.sample_point() - if pt.is_void(): - continue + assert not pt.is_void() #pt_set = isl.Set.from_point(pt) #lines.append("point implemented: %s" % (pt_set <= insn_impl_domain)) @@ -503,7 +523,9 @@ def check_implemented_domains(kernel, implemented_domains, code=None): iname, pt.get_coordinate_val(tp, dim).to_python())) lines.append( - "sample point %s: %s" % (kind, ", ".join(point_axes))) + "sample point in %s: %s" % (kind, ", ".join(point_axes))) + lines.append( + "gist of %s: %s" % (kind, gist_domain)) if code is not None: print(79*"-") diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index a613e4882ecb916d9088173851b86e5461325c57..948c419c7989174d0824c342981f88b6fa4b8e6b 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -50,7 +50,7 @@ def get_admissible_conditional_inames_for(kernel, sched_index): if not has_barrier or not isinstance(tag, LocalIndexTag): result.add(iname) - return result + return frozenset(result) def generate_code_for_sched_index(kernel, sched_index, codegen_state): @@ -104,37 +104,6 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state): % type(sched_item)) -def remove_inames_for_shared_hw_axes(kernel, cond_inames): - """ - See if cond_inames contains references to two (or more) inames that - boil down to the same tag. If so, exclude them. (We shouldn't be writing - conditionals for such inames because we would be implicitly restricting - the other inames as well.) - """ - - tag_key_uses = {} - - from loopy.kernel.data import HardwareParallelTag - - for iname in cond_inames: - tag = kernel.iname_to_tag.get(iname) - - if isinstance(tag, HardwareParallelTag): - tag_key_uses.setdefault(tag.key, []).append(iname) - - multi_use_keys = set( - key for key, user_inames in six.iteritems(tag_key_uses) - if len(user_inames) > 1) - - multi_use_inames = set() - for iname in cond_inames: - tag = kernel.iname_to_tag.get(iname) - if isinstance(tag, HardwareParallelTag) and tag.key in multi_use_keys: - multi_use_inames.add(iname) - - return frozenset(cond_inames - multi_use_inames) - - def get_required_predicates(kernel, sched_index): result = None for _, sched_item in generate_sub_sched_items(kernel.schedule, sched_index): @@ -154,6 +123,25 @@ def get_required_predicates(kernel, sched_index): return result +def group_by(l, key, merge): + if not l: + return l + + result = [] + previous = l[0] + + for item in l[1:]: + if key(previous) == key(item): + previous = merge(previous, item) + + else: + result.append(previous) + previous = item + + result.append(previous) + return result + + def build_loop_nest(kernel, sched_index, codegen_state): # Most of the complexity of this function goes towards finding groups of # instructions that can be nested inside a shared conditional. @@ -164,26 +152,29 @@ def build_loop_nest(kernel, sched_index, codegen_state): my_sched_indices = [] - while sched_index < len(kernel.schedule): - sched_item = kernel.schedule[sched_index] + i = sched_index + while i < len(kernel.schedule): + sched_item = kernel.schedule[i] if isinstance(sched_item, LeaveLoop): break - my_sched_indices.append(sched_index) + my_sched_indices.append(i) if isinstance(sched_item, EnterLoop): - _, sched_index = gather_schedule_subloop( - kernel.schedule, sched_index) + _, i = gather_schedule_subloop( + kernel.schedule, i) elif isinstance(sched_item, Barrier): - sched_index += 1 + i += 1 elif isinstance(sched_item, RunInstruction): - sched_index += 1 + i += 1 else: raise RuntimeError("unexpected schedule item type: %s" % type(sched_item)) + del i + # }}} # {{{ pass 2: find admissible conditional inames for each sibling schedule item @@ -195,16 +186,32 @@ def build_loop_nest(kernel, sched_index, codegen_state): .. attribute:: schedule_index .. attribute:: admissible_cond_inames .. attribute:: required_predicates + .. attribute:: used_inames_within """ + from loopy.schedule import find_used_inames_within sched_index_info_entries = [ ScheduleIndexInfo( - schedule_index=i, + schedule_indices=[i], admissible_cond_inames=( get_admissible_conditional_inames_for(kernel, i)), - required_predicates=get_required_predicates(kernel, i) + required_predicates=get_required_predicates(kernel, i), + used_inames_within=find_used_inames_within(kernel, i) ) - for i in my_sched_indices] + for i in my_sched_indices + ] + + sched_index_info_entries = group_by( + sched_index_info_entries, + key=lambda sii: ( + sii.admissible_cond_inames, + sii.required_predicates, + sii.used_inames_within), + merge=lambda sii1, sii2: sii1.copy( + schedule_indices=( + sii1.schedule_indices + + + sii2.schedule_indices))) # }}} @@ -236,10 +243,10 @@ def build_loop_nest(kernel, sched_index, codegen_state): def build_insn_group(sched_index_info_entries, codegen_state, done_group_lengths=set()): """ - :arg done_group_lengths: A set of group lengths (integers) that grows from - empty to include 1 and upwards with every recursive call. - It serves to prevent infinite recursion by preventing recursive - calls from doing anything about groups that are too small. + :arg done_group_lengths: A set of group lengths (integers) that grows + from empty to include the longest found group and downwards with every + recursive call. It serves to prevent infinite recursion by preventing + recursive calls from doing anything about groups that are too small. """ # The rough plan here is that build_insn_group starts out with the @@ -259,10 +266,9 @@ def build_loop_nest(kernel, sched_index, codegen_state): if not sched_index_info_entries: return [] - si_entry = sched_index_info_entries[0] - sched_index = si_entry.schedule_index - current_iname_set = si_entry.admissible_cond_inames - current_pred_set = (si_entry.required_predicates + origin_si_entry = sched_index_info_entries[0] + current_iname_set = origin_si_entry.admissible_cond_inames + current_pred_set = (origin_si_entry.required_predicates - codegen_state.implemented_predicates) # {{{ grow schedule item group @@ -293,22 +299,19 @@ def build_loop_nest(kernel, sched_index, codegen_state): # {{{ see which inames are actually used in group # And only generate conditionals for those. - from loopy.schedule import find_used_inames_within used_inames = set() for sched_index_info_entry in \ sched_index_info_entries[0:candidate_group_length]: - used_inames |= find_used_inames_within(kernel, - sched_index_info_entry.schedule_index) + used_inames |= sched_index_info_entry.used_inames_within # }}} - only_unshared_inames = remove_inames_for_shared_hw_axes(kernel, + only_unshared_inames = kernel.remove_inames_for_shared_hw_axes( current_iname_set & used_inames) bounds_checks = bounds_check_cache(only_unshared_inames) if (bounds_checks # found a bounds check - or bounds_checks is None # found impossible bounds check or current_pred_set or candidate_group_length == 1): # length-1 must always be an option to reach the recursion base @@ -316,6 +319,11 @@ def build_loop_nest(kernel, sched_index, codegen_state): found_hoists.append((candidate_group_length, bounds_checks, current_pred_set)) + if not bounds_checks and not current_pred_set: + # already no more checks possible, let's not waste time + # checking longer groups. + break + candidate_group_length += 1 # }}} @@ -352,13 +360,15 @@ def build_loop_nest(kernel, sched_index, codegen_state): if group_length == 1: # group only contains starting schedule item def gen_code(inner_codegen_state): - inner = generate_code_for_sched_index( - kernel, sched_index, inner_codegen_state) + result = [] + for i in origin_si_entry.schedule_indices: + inner = generate_code_for_sched_index( + kernel, i, inner_codegen_state) + + if inner is not None: + result.append(inner) - if inner is None: - return [] - else: - return [inner] + return result else: # recurse with a bigger done_group_lengths diff --git a/loopy/compiled.py b/loopy/compiled.py index c5928d6e7cd96b5e60d60b0c5dd13e21e33fb67f..da659eaba5e3c7c8c99993946da1e7af5bb399bc 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -27,6 +27,7 @@ THE SOFTWARE. """ +import sys import numpy as np from pytools import Record, memoize_method from loopy.diagnostic import ParameterFinderWarning @@ -296,46 +297,141 @@ def generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, option # {{{ value arg setup -def generate_value_arg_setup(gen, kernel, impl_arg_info, options): +def generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options): import loopy as lp from loopy.kernel.array import ArrayBase + # {{{ arg counting bug handling + + # For example: + # https://github.com/pocl/pocl/issues/197 + # (but Apple CPU has a similar bug) + + work_around_arg_count_bug = False + warn_about_arg_count_bug = False + + from pyopencl.characterize import has_struct_arg_count_bug + + devices = cl_kernel.context.devices + + count_bug_per_dev = [ + has_struct_arg_count_bug(dev) + for dev in devices] + + if any(count_bug_per_dev): + if all(count_bug_per_dev): + work_around_arg_count_bug = True + else: + warn_about_arg_count_bug = True + + # }}} + + cl_arg_idx = 0 + arg_idx_to_cl_arg_idx = {} + + fp_arg_count = 0 + for arg_idx, arg in enumerate(impl_arg_info): + arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx + if arg.arg_class is not lp.ValueArg: assert issubclass(arg.arg_class, ArrayBase) + + # assume each of those generates exactly one... + cl_arg_idx += 1 + continue gen("# {{{ process %s" % arg.name) gen("") if not options.skip_arg_checks: - gen("if %s is None:" % arg.name) - with Indentation(gen): - gen("raise RuntimeError(\"input argument '%s' must " - "be supplied\")" % arg.name) - gen("") - - if arg.dtype.kind == "i": - gen("# cast to int to avoid numpy scalar trouble with Boost.Python") - gen("%s = int(%s)" % (arg.name, arg.name)) + gen(""" + if {name} is None: + raise RuntimeError("input argument '{name}' must " + "be supplied") + """.format(name=arg.name)) + + if sys.version_info < (2, 7) and arg.dtype.kind == "i": + gen("# cast to long to avoid trouble with struct packing") + gen("%s = long(%s)" % (arg.name, arg.name)) gen("") if arg.dtype.char == "V": - gen("cl_kernel.set_arg(%d, %s)" % (arg_idx, arg.name)) + gen("cl_kernel.set_arg(%d, %s)" % (cl_arg_idx, arg.name)) + cl_arg_idx += 1 + + elif arg.dtype.kind == "c": + if warn_about_arg_count_bug: + from warnings import warn + warn("{knl_name}: arguments include complex numbers, and " + "some (but not all) of the target devices mishandle " + "struct kernel arguments (hence the workaround is " + "disabled".format( + knl_name=kernel.name)) + + if arg.dtype == np.complex64: + arg_char = "f" + elif arg.dtype == np.complex128: + arg_char = "d" + else: + raise TypeError("unexpected complex type: %s" % arg.dtype) + + if (work_around_arg_count_bug + and arg.dtype == np.complex128 + and fp_arg_count + 2 <= 8): + gen( + "buf = _lpy_pack('{arg_char}', {arg_var}.real)" + .format(arg_char=arg_char, arg_var=arg.name)) + gen( + "cl_kernel.set_arg({cl_arg_idx}, buf)" + .format(cl_arg_idx=cl_arg_idx)) + cl_arg_idx += 1 + + gen( + "buf = _lpy_pack('{arg_char}', {arg_var}.imag)" + .format(arg_char=arg_char, arg_var=arg.name)) + gen( + "cl_kernel.set_arg({cl_arg_idx}, buf)" + .format(cl_arg_idx=cl_arg_idx)) + cl_arg_idx += 1 + else: + gen( + "buf = _lpy_pack('{arg_char}{arg_char}', " + "{arg_var}.real, {arg_var}.imag)" + .format(arg_char=arg_char, arg_var=arg.name)) + gen( + "cl_kernel.set_arg({cl_arg_idx}, buf)" + .format(cl_arg_idx=cl_arg_idx)) + cl_arg_idx += 1 + + fp_arg_count += 2 + else: - gen("cl_kernel.set_arg(%d, _lpy_pack(\"%s\", %s))" - % (arg_idx, arg.dtype.char, arg.name)) + if arg.dtype.kind == "f": + fp_arg_count += 1 + + gen("cl_kernel.set_arg(%d, _lpy_pack('%s', %s))" + % (cl_arg_idx, arg.dtype.char, arg.name)) + + cl_arg_idx += 1 + gen("") gen("# }}}") gen("") + assert cl_arg_idx == cl_kernel.num_args + + return arg_idx_to_cl_arg_idx + # }}} # {{{ array arg setup -def generate_array_arg_setup(gen, kernel, impl_arg_info, options): +def generate_array_arg_setup(gen, kernel, impl_arg_info, options, + arg_idx_to_cl_arg_idx): import loopy as lp from loopy.kernel.array import ArrayBase @@ -356,12 +452,12 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options): is_written = arg.base_name in kernel.get_written_variables() kernel_arg = kernel.impl_arg_to_arg.get(arg.name) - gen("# {{{ process %s" % arg.name) - gen("") - if not issubclass(arg.arg_class, ArrayBase): continue + gen("# {{{ process %s" % arg.name) + gen("") + if not options.no_numpy: gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name) with Indentation(gen): @@ -552,10 +648,12 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options): gen("del _lpy_made_by_loopy") gen("") + cl_arg_idx = arg_idx_to_cl_arg_idx[arg_idx] + if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]: - gen("cl_kernel.set_arg(%d, %s.base_data)" % (arg_idx, arg.name)) + gen("cl_kernel.set_arg(%d, %s.base_data)" % (cl_arg_idx, arg.name)) else: - gen("cl_kernel.set_arg(%d, %s)" % (arg_idx, arg.name)) + gen("cl_kernel.set_arg(%d, %s)" % (cl_arg_idx, arg.name)) gen("") gen("# }}}") @@ -567,7 +665,7 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options): # }}} -def generate_invoker(kernel, impl_arg_info, options): +def generate_invoker(kernel, cl_kernel, impl_arg_info, options): system_args = [ "cl_kernel", "queue", "allocator=None", "wait_for=None", # ignored if options.no_numpy @@ -584,7 +682,7 @@ def generate_invoker(kernel, impl_arg_info, options): gen.add_to_preamble("import pyopencl.array as _lpy_cl_array") gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools") gen.add_to_preamble("import numpy as _lpy_np") - gen.add_to_preamble("from pyopencl._pvt_struct import pack as _lpy_pack") + gen.add_to_preamble("from struct import pack as _lpy_pack") gen.add_to_preamble("") gen("if allocator is None:") @@ -596,8 +694,10 @@ def generate_invoker(kernel, impl_arg_info, options): generate_integer_arg_finding_from_offsets(gen, kernel, impl_arg_info, options) generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, options) - generate_value_arg_setup(gen, kernel, impl_arg_info, options) - generate_array_arg_setup(gen, kernel, impl_arg_info, options) + arg_idx_to_cl_arg_idx = \ + generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options) + generate_array_arg_setup(gen, kernel, impl_arg_info, options, + arg_idx_to_cl_arg_idx) # {{{ generate invocation @@ -763,7 +863,7 @@ class CompiledKernel: cl_kernel=cl_kernel, impl_arg_info=impl_arg_info, invoker=generate_invoker( - kernel, impl_arg_info, self.kernel.options)) + kernel, cl_kernel, impl_arg_info, self.kernel.options)) # {{{ debugging aids diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 24588976a2971d16d58dba03a44035cbc494397a..485de9ac2716e0c5e51ba02830d9197acbfc991d 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -635,6 +635,37 @@ class LoopKernel(RecordWithoutPickling): return result + @memoize_method + def remove_inames_for_shared_hw_axes(self, cond_inames): + """ + See if cond_inames contains references to two (or more) inames that + boil down to the same tag. If so, exclude them. (We shouldn't be writing + conditionals for such inames because we would be implicitly restricting + the other inames as well.) + """ + + tag_key_uses = {} + + from loopy.kernel.data import HardwareParallelTag + + for iname in cond_inames: + tag = self.iname_to_tag.get(iname) + + if isinstance(tag, HardwareParallelTag): + tag_key_uses.setdefault(tag.key, []).append(iname) + + multi_use_keys = set( + key for key, user_inames in six.iteritems(tag_key_uses) + if len(user_inames) > 1) + + multi_use_inames = set() + for iname in cond_inames: + tag = self.iname_to_tag.get(iname) + if isinstance(tag, HardwareParallelTag) and tag.key in multi_use_keys: + multi_use_inames.add(iname) + + return frozenset(cond_inames - multi_use_inames) + # }}} # {{{ dependency wrangling diff --git a/loopy/target/pyopencl/__init__.py b/loopy/target/pyopencl/__init__.py index 174506cd65a81b405053c212c9683f9dd2df2cc1..ee936680016b6808723076034c8486a49544e2bc 100644 --- a/loopy/target/pyopencl/__init__.py +++ b/loopy/target/pyopencl/__init__.py @@ -272,8 +272,13 @@ class PyOpenCLTarget(OpenCLTarget): return vec.types[base, count] def alignment_requirement(self, type_decl): - import pyopencl._pvt_struct as _struct - return _struct.calcsize(type_decl.struct_format()) + import struct + + fmt = (type_decl.struct_format() + .replace("F", "ff") + .replace("D", "dd")) + + return struct.calcsize(fmt) # }}} diff --git a/test/test_dg.py b/test/test_dg.py index 581562da89210ea476700191c6d21ad2dbe7fd3d..0eb5be224d23fb295b229b3913ef479dc519e9fa 100644 --- a/test/test_dg.py +++ b/test/test_dg.py @@ -43,10 +43,10 @@ def test_dg_volume(ctx_factory): order = "F" - N = 3 - Np = (N+1)*(N+2)*(N+3)//6 + N = 3 # noqa + Np = (N+1)*(N+2)*(N+3)//6 # noqa - K = 10000 + K = 10000 # noqa knl = lp.make_kernel([ "{[n,m,k]: 0<= n,m < Np and 0<= k < K}", @@ -146,16 +146,21 @@ def test_dg_volume(ctx_factory): parameters_dict = dict(K=K) - for variant in [ + variants = [ variant_basic, variant_more_per_work_group, - variant_image_d, variant_prefetch_d, variant_prefetch_fields, variant_k_ilp, variant_simple_padding, variant_fancy_padding - ]: + ] + + if (ctx.devices[0].image_support + and ctx.devices[0].platform.name != "Portable Computing Language"): + variants.append(variant_image_d) + + for variant in variants: lp.auto_test_vs_ref( seq_knl, ctx, variant(knl), parameters=parameters_dict, #codegen_kwargs=dict(with_annotation=True) @@ -169,12 +174,12 @@ def no_test_dg_surface(ctx_factory): order = "F" - N = 3 - Np = (N+1)*(N+2)*(N+3)//6 - Nfp = (N+1)*(N+2)//2 - Nfaces = 4 + N = 3 # noqa + Np = (N+1)*(N+2)*(N+3)//6 # noqa + Nfp = (N+1)*(N+2)//2 # noqa + Nfaces = 4 # noqa - K = 10000 + K = 10000 # noqa knl = lp.make_kernel( [ diff --git a/test/test_linalg.py b/test/test_linalg.py index 9dbedf320eec358d29245e35ab009ded910f1b5c..c019eb67fbaba5e6d8983665b67002837225d9ad 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -422,6 +422,10 @@ def test_magma_fermi_matrix_mul(ctx_factory): n = get_suitable_size(ctx) + if (not ctx.devices[0].image_support + or ctx.devices[0].platform.name == "Portable Computing Language"): + pytest.skip("crashes on pocl") + image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT) if image_format not in cl.get_supported_image_formats( ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D): @@ -469,6 +473,10 @@ def test_image_matrix_mul(ctx_factory): n = get_suitable_size(ctx) + if (not ctx.devices[0].image_support + or ctx.devices[0].platform.name == "Portable Computing Language"): + pytest.skip("crashes on pocl") + image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT) if image_format not in cl.get_supported_image_formats( ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D): @@ -505,6 +513,10 @@ def test_image_matrix_mul_ilp(ctx_factory): ctx = ctx_factory() order = "C" + if (not ctx.devices[0].image_support + or ctx.devices[0].platform.name == "Portable Computing Language"): + pytest.skip("crashes on pocl") + image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT) if image_format not in cl.get_supported_image_formats( ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D): diff --git a/test/test_loopy.py b/test/test_loopy.py index 22c0ce47c8cfc2aea051b63d5f50603f0d406f70..1fa35101d93066c5de9a0539db17d337305be2e0 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -379,6 +379,10 @@ def test_stencil_with_overfetch(ctx_factory): ], assumptions="n>=1") + if ctx.devices[0].platform.name == "Portable Computing Language": + # https://github.com/pocl/pocl/issues/205 + pytest.skip("takes very long to compile on pocl") + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) ref_knl = knl