diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b6c1ccf8424b3989bef169c63c58b9a4344e2031..ea88059c4c0d74afa0b778e3d9755142cd419a03 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -24,9 +24,9 @@ Python 2.6 POCL: - pocl except: - tags -Python 3.4 AMD CPU: +Python 3.5 AMD CPU: script: - - export PY_EXE=python3.4 + - export PY_EXE=python3.5 - export PYOPENCL_TEST=amd:pu - export EXTRA_INSTALL="numpy mako" - export NO_DOCTESTS=1 @@ -34,7 +34,7 @@ Python 3.4 AMD CPU: - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: - - python3.4 + - python3.5 - amd-cl-cpu except: - tags diff --git a/build-helpers/make-linux-build-docker-inner.sh b/build-helpers/make-linux-build-docker-inner.sh index 119609a3c1767dd1e5ce1781f27c946781d1a79b..a7f621b1ef21676898d2283d93f8a54f086e5d9d 100755 --- a/build-helpers/make-linux-build-docker-inner.sh +++ b/build-helpers/make-linux-build-docker-inner.sh @@ -8,7 +8,7 @@ cd /tmp/build useradd -d /home/user -m -s /bin/bash user -yum install -y centos-release-SCL +yum install -y centos-release-scl yum install -y git python27 python27-python-devel python27-numpy tar gcc gcc-c++ mercurial libffi-devel scl enable python27 /mnt/make-linux-build-docker-inner-part-2.sh diff --git a/loopy/__init__.py b/loopy/__init__.py index 19d9ddbc72c6de90ad7cbfddc740d74a87b2d2a7..4322bf928d524bfa4b52ca9698fcfaefac93d804 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -62,7 +62,7 @@ from loopy.library.reduction import register_reduction_parser from loopy.transform.iname import ( set_loop_priority, split_iname, chunk_iname, join_inames, tag_inames, duplicate_inames, - rename_iname, link_inames, remove_unused_inames, + rename_iname, remove_unused_inames, split_reduction_inward, split_reduction_outward, affine_map_inames, find_unused_axis_tag, make_reduction_inames_unique) @@ -159,7 +159,7 @@ __all__ = [ "set_loop_priority", "split_iname", "chunk_iname", "join_inames", "tag_inames", "duplicate_inames", - "rename_iname", "link_inames", "remove_unused_inames", + "rename_iname", "remove_unused_inames", "split_reduction_inward", "split_reduction_outward", "affine_map_inames", "find_unused_axis_tag", "make_reduction_inames_unique", diff --git a/loopy/auto_test.py b/loopy/auto_test.py index bada80328c337cc5e756afaf05b180f2b027b1ca..b475a40f3580b3b0e3c746a0de90f33cf182aa4e 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -265,7 +265,7 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters): host_storage_array = np.empty(alloc_size, dtype) host_array = as_strided( host_storage_array, shape, numpy_strides) - host_array[:] = host_contig_array + host_array[...] = host_contig_array host_contig_array = arg_desc.ref_storage_array.get() storage_array = cl_array.to_device(queue, host_storage_array) diff --git a/loopy/check.py b/loopy/check.py index 910ab24ab3f98a9b01953d30354eda077273423d..0ef1f163aaed071d5dc4d219017b1164342be651 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -59,47 +59,6 @@ def check_loop_priority_inames_known(kernel): raise LoopyError("unknown iname '%s' in loop priorities" % iname) -def check_for_unused_hw_axes_in_insns(kernel): - # FIXME: This could be made specific to the current kernel piece. - group_size, local_size = kernel.get_grid_size_upper_bounds_as_exprs() - - group_axes = set(ax for ax, length in enumerate(group_size)) - local_axes = set(ax for ax, length in enumerate(local_size)) - - # alternative: just disregard length-1 dimensions? - - from loopy.kernel.data import LocalIndexTag, AutoLocalIndexTagBase, GroupIndexTag - for insn in kernel.instructions: - if insn.boostable: - continue - - group_axes_used = set() - local_axes_used = set() - - for iname in kernel.insn_inames(insn): - tag = kernel.iname_to_tag.get(iname) - - if isinstance(tag, LocalIndexTag): - local_axes_used.add(tag.axis) - elif isinstance(tag, GroupIndexTag): - group_axes_used.add(tag.axis) - elif isinstance(tag, AutoLocalIndexTagBase): - raise LoopyError("auto local tag encountered") - - if group_axes != group_axes_used: - raise LoopyError("instruction '%s' does not use all group hw axes " - "(available: %s used:%s)" - % (insn.id, - ",".join(str(i) for i in group_axes), - ",".join(str(i) for i in group_axes_used))) - if local_axes != local_axes_used: - raise LoopyError("instruction '%s' does not use all local hw axes" - "(available: %s used:%s)" - % (insn.id, - ",".join(str(i) for i in local_axes), - ",".join(str(i) for i in local_axes_used))) - - def check_for_double_use_of_hw_axes(kernel): from loopy.kernel.data import UniqueTag @@ -122,7 +81,7 @@ def check_for_inactive_iname_access(kernel): if not expression_inames <= kernel.insn_inames(insn): raise LoopyError( - "instructiosn '%s' references " + "instruction '%s' references " "inames that the instruction does not depend on" % insn.id) @@ -154,22 +113,13 @@ def _is_racing_iname_tag(tv, tag): def check_for_write_races(kernel): - from loopy.symbolic import DependencyMapper from loopy.kernel.data import ParallelTag - depmap = DependencyMapper(composite_leaves=False) iname_to_tag = kernel.iname_to_tag.get for insn in kernel.instructions: - for assignee_name, assignee_indices in insn.assignees_and_indices(): - assignee_indices = depmap(assignee_indices) - - def strip_var(expr): - from pymbolic.primitives import Variable - assert isinstance(expr, Variable) - return expr.name - - assignee_indices = set(strip_var(index) for index in assignee_indices) - + for assignee_name, assignee_indices in zip( + insn.assignee_var_names(), + insn.assignee_subscript_deps()): assignee_inames = assignee_indices & kernel.all_inames() if not assignee_inames <= kernel.insn_inames(insn): raise LoopyError( @@ -332,7 +282,7 @@ def check_bounds(kernel): def check_write_destinations(kernel): for insn in kernel.instructions: - for wvar, _ in insn.assignees_and_indices(): + for wvar in insn.assignee_var_names(): if wvar in kernel.all_inames(): raise LoopyError("iname '%s' may not be written" % wvar) @@ -363,7 +313,6 @@ def pre_schedule_checks(kernel): check_for_double_use_of_hw_axes(kernel) check_insn_attributes(kernel) check_loop_priority_inames_known(kernel) - check_for_unused_hw_axes_in_insns(kernel) check_for_inactive_iname_access(kernel) check_for_write_races(kernel) check_for_data_dependent_parallel_bounds(kernel) @@ -382,7 +331,89 @@ def pre_schedule_checks(kernel): raise -# {{{ pre-code-generation checks +# {{{ post-schedule / pre-code-generation checks + +def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): + from loopy.schedule import (CallKernel, RunInstruction, + Barrier, EnterLoop, LeaveLoop, ReturnFromKernel, + get_insn_ids_for_block_at, gather_schedule_block) + + if sched_index is None: + group_axes = set() + local_axes = set() + + i = 0 + loop_end_i = past_end_i = len(kernel.schedule) + else: + assert isinstance(kernel.schedule[sched_index], CallKernel) + _, past_end_i = gather_schedule_block(kernel.schedule, sched_index) + group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs( + get_insn_ids_for_block_at(kernel.schedule, sched_index)) + + group_axes = set(ax for ax, length in enumerate(group_size)) + local_axes = set(ax for ax, length in enumerate(local_size)) + + i = sched_index + 1 + assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel) + loop_end_i = past_end_i - 1 + + # alternative: just disregard length-1 dimensions? + + from loopy.kernel.data import LocalIndexTag, AutoLocalIndexTagBase, GroupIndexTag + + while i < loop_end_i: + sched_item = kernel.schedule[i] + if isinstance(sched_item, CallKernel): + i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i) + + elif isinstance(sched_item, RunInstruction): + insn = kernel.id_to_insn[sched_item.insn_id] + i += 1 + + if insn.boostable: + continue + + group_axes_used = set() + local_axes_used = set() + + for iname in kernel.insn_inames(insn): + tag = kernel.iname_to_tag.get(iname) + + if isinstance(tag, LocalIndexTag): + local_axes_used.add(tag.axis) + elif isinstance(tag, GroupIndexTag): + group_axes_used.add(tag.axis) + elif isinstance(tag, AutoLocalIndexTagBase): + raise LoopyError("auto local tag encountered") + + if group_axes != group_axes_used: + raise LoopyError("instruction '%s' does not use all group hw axes " + "(available: %s used:%s)" + % (insn.id, + ",".join(str(i) for i in group_axes), + ",".join(str(i) for i in group_axes_used))) + if local_axes != local_axes_used: + raise LoopyError("instruction '%s' does not use all local hw axes " + "(available: %s used:%s)" + % (insn.id, + ",".join(str(i) for i in local_axes), + ",".join(str(i) for i in local_axes_used))) + + elif isinstance(sched_item, (Barrier, EnterLoop, LeaveLoop)): + i += 1 + continue + + else: + raise TypeError( + "schedule item not understood: %s" % type(sched_item).__name__) + + return past_end_i + + +def check_for_unused_hw_axes_in_insns(kernel): + if kernel.schedule: + _check_for_unused_hw_axes_in_kernel_chunk(kernel) + def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel): from loopy.kernel.data import ArrayBase, Assignment @@ -459,6 +490,7 @@ def pre_codegen_checks(kernel): try: logger.info("pre-codegen check %s: start" % kernel.name) + check_for_unused_hw_axes_in_insns(kernel) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) kernel.target.pre_codegen_check(kernel) check_that_shapes_and_strides_are_arguments(kernel) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 4438958e53d176535b13262a56f0b30a95d9faf7..2f266d2803f81985a79cd4793abbdb3c89bd1585 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -112,7 +112,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): from loopy.codegen.result import generate_host_or_device_program codegen_result = generate_host_or_device_program( - new_codegen_state, sched_index + 1) + new_codegen_state, sched_index) glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at(kernel.schedule, sched_index)) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index db3d15184a94aa13e6a4b449a2036869e34ca566..8531b97a947f3c46b984cf7405b794bd68e2f638 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -122,7 +122,25 @@ def generate_assignment_instruction_code(codegen_state, insn): # }}} - (assignee_var_name, assignee_indices), = insn.assignees_and_indices() + from pymbolic.primitives import Variable, Subscript + from loopy.symbolic import LinearSubscript + + lhs = insn.assignee + if isinstance(lhs, Variable): + assignee_var_name = lhs.name + assignee_indices = () + + elif isinstance(lhs, Subscript): + assignee_var_name = lhs.aggregate.name + assignee_indices = lhs.index_tuple + + elif isinstance(lhs, LinearSubscript): + assignee_var_name = lhs.aggregate.name + assignee_indices = (lhs.index,) + + else: + raise RuntimeError("invalid lvalue '%s'" % lhs) + lhs_var = kernel.get_var_descriptor(assignee_var_name) lhs_dtype = lhs_var.dtype diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index a9eb44f84d44015fa11ce83afe410e833bb8e214..648c3fe6f5b748dcc47de5ac972bb82ce605a9a9 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -233,17 +233,23 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, from loopy.kernel.data import ( UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag) + from loopy.schedule import get_insn_ids_for_block_at + insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index) + if hw_inames_left is None: + all_inames_by_insns = set() + for insn_id in insn_ids_for_block: + all_inames_by_insns |= kernel.insn_inames(insn_id) + hw_inames_left = [iname - for iname in kernel.all_inames() + for iname in all_inames_by_insns if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)] if not hw_inames_left: return next_func(codegen_state) - from loopy.schedule import get_insn_ids_for_block_at global_size, local_size = kernel.get_grid_sizes_for_insn_ids( - get_insn_ids_for_block_at(kernel.schedule, schedule_index)) + insn_ids_for_block) hw_inames_left = hw_inames_left[:] iname = hw_inames_left.pop() diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index ad92fcefa7db5e20cbc44d0cf643070b8f4c1832..82dbe09986b2567e9951b2446313a8ef5fde4f8c 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -241,19 +241,22 @@ def wrap_in_if(codegen_state, condition_exprs, inner): def generate_host_or_device_program(codegen_state, schedule_index): ast_builder = codegen_state.ast_builder - temp_decls = ast_builder.get_temporary_decls(codegen_state) + temp_decls = ast_builder.get_temporary_decls(codegen_state, schedule_index) from functools import partial from loopy.codegen.control import build_loop_nest - next_func = partial(build_loop_nest, schedule_index=schedule_index) - if codegen_state.is_generating_device_code: + from loopy.schedule import CallKernel + assert isinstance(codegen_state.kernel.schedule[schedule_index], CallKernel) + from loopy.codegen.loop import set_up_hw_parallel_loops codegen_result = set_up_hw_parallel_loops( - codegen_state, schedule_index, next_func=next_func) + codegen_state, schedule_index, + next_func=partial(build_loop_nest, + schedule_index=schedule_index + 1)) else: - codegen_result = next_func(codegen_state) + codegen_result = build_loop_nest(codegen_state, schedule_index) codegen_result = merge_codegen_results( codegen_state, diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index 2ea1bd40e540b24d5a08e4136ab603d6c5a422a1..c1d0e03633772bb7e3f86807f579e2b33793e6aa 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -23,6 +23,9 @@ THE SOFTWARE. """ +from pytools import MovedFunctionDeprecationWrapper + + # {{{ warnings class LoopyWarningBase(UserWarning): @@ -47,7 +50,7 @@ class WriteRaceConditionWarning(LoopyWarning): # }}} -def warn(kernel, id, text, type=LoopyWarning): +def warn_with_kernel(kernel, id, text, type=LoopyWarning): from fnmatch import fnmatchcase for sw in kernel.silenced_warnings: if fnmatchcase(id, sw): @@ -60,6 +63,9 @@ def warn(kernel, id, text, type=LoopyWarning): warn(text, type) +warn = MovedFunctionDeprecationWrapper(warn_with_kernel) + + # {{{ errors class LoopyError(RuntimeError): diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 5ac63b56eb6347d93e65a3fa860840c37cabe146..bfd8ee284f0cf4439cb4409ba807777900be726a 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -669,9 +669,11 @@ class LoopKernel(RecordWithoutPickling): """Return a mapping from instruction ids to inames inside which they should be run. """ + result = {} + for insn in self.instructions: + result[insn.id] = insn.forced_iname_deps - from loopy.kernel.tools import find_all_insn_inames - return find_all_insn_inames(self) + return result @memoize_method def all_referenced_inames(self): @@ -681,11 +683,9 @@ class LoopKernel(RecordWithoutPickling): return result def insn_inames(self, insn): - from loopy.kernel.data import InstructionBase - if isinstance(insn, InstructionBase): - return self.all_insn_inames()[insn.id] - else: - return self.all_insn_inames()[insn] + if isinstance(insn, str): + insn = self.id_to_insn[insn] + return insn.forced_iname_deps @memoize_method def iname_to_insns(self): @@ -790,7 +790,7 @@ class LoopKernel(RecordWithoutPickling): result = {} for insn in self.instructions: - for var_name, _ in insn.assignees_and_indices(): + for var_name in insn.assignee_var_names(): result.setdefault(var_name, set()).add(insn.id) return result @@ -807,7 +807,7 @@ class LoopKernel(RecordWithoutPickling): return frozenset( var_name for insn in self.instructions - for var_name, _ in insn.assignees_and_indices()) + for var_name in insn.assignee_var_names()) @memoize_method def get_temporary_to_base_storage_map(self): @@ -918,9 +918,7 @@ class LoopKernel(RecordWithoutPickling): all_inames_by_insns = set() for insn_id in insn_ids: - insn = self.id_to_insn[insn_id] - - all_inames_by_insns |= self.insn_inames(insn) + all_inames_by_insns |= self.insn_inames(insn_id) if not all_inames_by_insns <= self.all_inames(): raise RuntimeError("some inames collected from instructions (%s) " @@ -935,7 +933,7 @@ class LoopKernel(RecordWithoutPickling): GroupIndexTag, LocalIndexTag, AutoLocalIndexTagBase) - for iname in self.all_inames(): + for iname in all_inames_by_insns: tag = self.iname_to_tag.get(iname) if isinstance(tag, GroupIndexTag): @@ -1138,6 +1136,8 @@ class LoopKernel(RecordWithoutPickling): options.append("groups=%s" % ":".join(insn.groups)) if insn.conflicts_with_groups: options.append("conflicts=%s" % ":".join(insn.conflicts_with_groups)) + if insn.no_sync_with: + options.append("no_sync_with=%s" % ":".join(insn.no_sync_with)) if len(loop_list) > loop_list_width: lines.append("[%s]" % loop_list) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index e25320729df3d5b4cceda46c9c406eb80e13ebdf..0040a6d9543952cb3615f10e86ea7fb32fd5cdab 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -146,18 +146,165 @@ def expand_defines_in_expr(expr, defines): # }}} -# {{{ parse instructions +# {{{ instruction options + +def get_default_insn_options_dict(): + return { + "depends_on": None, + "depends_on_is_final": False, + "no_sync_with": None, + "groups": None, + "conflicts_with_groups": None, + "insn_id": None, + "inames_to_dup": [], + "priority": 0, + "forced_iname_deps_is_final": False, + "forced_iname_deps": frozenset(), + "predicates": frozenset(), + "tags": (), + "atomicity": (), + } + + +def parse_insn_options(opt_dict, options_str, assignee_names=None): + if options_str is None: + return opt_dict + + result = opt_dict.copy() + + for option in options_str.split(","): + option = option.strip() + if not option: + raise RuntimeError("empty option supplied") + + equal_idx = option.find("=") + if equal_idx == -1: + opt_key = option + opt_value = None + else: + opt_key = option[:equal_idx].strip() + opt_value = option[equal_idx+1:].strip() + + if opt_key == "id" and opt_value is not None: + result["insn_id"] = intern(opt_value) + elif opt_key == "id_prefix" and opt_value is not None: + result["insn_id"] = UniqueName(opt_value) + elif opt_key == "priority" and opt_value is not None: + result["priority"] = int(opt_value) + elif opt_key == "dup" and opt_value is not None: + for value in opt_value.split(":"): + arrow_idx = value.find("->") + if arrow_idx >= 0: + result["inames_to_dup"].append( + (value[:arrow_idx], value[arrow_idx+2:])) + else: + result["inames_to_dup"].append((value, None)) + + elif opt_key == "dep" and opt_value is not None: + if opt_value.startswith("*"): + result["depends_on_is_final"] = True + opt_value = (opt_value[1:]).strip() + + result["depends_on"] = frozenset( + intern(dep.strip()) for dep in opt_value.split(":") + if dep.strip()) + + elif opt_key == "nosync" and opt_value is not None: + result["no_sync_with"] = frozenset( + intern(dep.strip()) for dep in opt_value.split(":") + if dep.strip()) + + elif opt_key == "groups" and opt_value is not None: + result["groups"] = frozenset( + intern(grp.strip()) for grp in opt_value.split(":") + if grp.strip()) + + elif opt_key == "conflicts" and opt_value is not None: + result["conflicts_with_groups"] = frozenset( + intern(grp.strip()) for grp in opt_value.split(":") + if grp.strip()) + + elif opt_key == "inames" and opt_value is not None: + if opt_value.startswith("+"): + result["forced_iname_deps_is_final"] = False + opt_value = (opt_value[1:]).strip() + else: + result["forced_iname_deps_is_final"] = True + + result["forced_iname_deps"] = intern_frozenset_of_ids( + opt_value.split(":")) + + elif opt_key == "if" and opt_value is not None: + result["predicates"] = intern_frozenset_of_ids(opt_value.split(":")) + + elif opt_key == "tags" and opt_value is not None: + result["tags"] = tuple( + tag.strip() for tag in opt_value.split(":") + if tag.strip()) + + elif opt_key == "atomic": + if assignee_names is None: + raise LoopyError("'atomic' option may only be specified " + "on an actual instruction") + if len(assignee_names) != 1: + raise LoopyError("atomic operations with more than one " + "left-hand side not supported") + assignee_name, = assignee_names + + import loopy as lp + if opt_value is None: + result["atomicity"] = result["atomicity"] + ( + lp.AtomicUpdate(assignee_name),) + else: + for v in opt_value.split(":"): + if v == "init": + result["atomicity"] = result["atomicity"] + ( + lp.AtomicInit(assignee_name),) + else: + raise LoopyError("atomicity directive not " + "understood: %s" + % v) + del assignee_name + + else: + raise ValueError( + "unrecognized instruction option '%s' " + "(maybe a missing/extraneous =value?)" + % opt_key) + + return result + +# }}} + + +# {{{ parse one instruction + +WITH_OPTIONS_RE = re.compile( + "^" + "\s*with\s*" + "\{(?P<options>.+)\}" + "\s*$") + +FOR_RE = re.compile( + "^" + "\s*for\s*" + "(?P<inames>[ ,\w]+)" + "\s*$") INSN_RE = re.compile( - "\s*(?P<lhs>.+?)\s*(?<!\:)=\s*(?P<rhs>.+?)" - "\s*?(?:\{(?P<options>.+)\}\s*)?$" - ) + "^" + "\s*" + "(?P<lhs>.+?)" + "\s*(?<!\:)=\s*" + "(?P<rhs>.+?)" + "\s*?" + "(?:\{(?P<options>.+)\}\s*)?$") + SUBST_RE = re.compile( - r"^\s*(?P<lhs>.+?)\s*:=\s*(?P<rhs>.+)\s*$" - ) + r"^\s*(?P<lhs>.+?)\s*:=\s*(?P<rhs>.+)\s*$") -def parse_insn(insn): +def parse_insn(groups, insn_options): """ :return: a tuple ``(insn, inames_to_dup)``, where insn is a :class:`Assignment`, a :class:`CallInstruction`, @@ -167,18 +314,6 @@ def parse_insn(insn): import loopy as lp - insn_match = INSN_RE.match(insn) - subst_match = SUBST_RE.match(insn) - if insn_match is not None and subst_match is not None: - raise RuntimeError("instruction parse error: %s" % insn) - - if insn_match is not None: - groups = insn_match.groupdict() - elif subst_match is not None: - groups = subst_match.groupdict() - else: - raise RuntimeError("instruction parse error at '%s'" % insn) - from loopy.symbolic import parse try: lhs = parse(groups["lhs"]) @@ -194,38 +329,9 @@ def parse_insn(insn): "the following error occurred:" % groups["rhs"]) raise - from pymbolic.primitives import Variable, Call, Subscript + from pymbolic.primitives import Variable, Subscript from loopy.symbolic import TypeAnnotation - # {{{ deal with subst rules - - if subst_match is not None: - assert insn_match is None - if isinstance(lhs, Variable): - subst_name = lhs.name - arg_names = [] - elif isinstance(lhs, Call): - if not isinstance(lhs.function, Variable): - raise RuntimeError("Invalid substitution rule left-hand side") - subst_name = lhs.function.name - arg_names = [] - - for i, arg in enumerate(lhs.parameters): - if not isinstance(arg, Variable): - raise RuntimeError("Invalid substitution rule " - "left-hand side: %s--arg number %d " - "is not a variable" % (lhs, i)) - arg_names.append(arg.name) - else: - raise RuntimeError("Invalid substitution rule left-hand side") - - return SubstitutionRule( - name=subst_name, - arguments=tuple(arg_names), - expression=rhs), [] - - # }}} - if not isinstance(lhs, tuple): lhs = (lhs,) @@ -244,9 +350,10 @@ def parse_insn(insn): else: temp_var_types.append(None) + from loopy.symbolic import LinearSubscript if isinstance(lhs_i, Variable): assignee_names.append(lhs_i.name) - elif isinstance(lhs_i, Subscript): + elif isinstance(lhs_i, (Subscript, LinearSubscript)): assignee_names.append(lhs_i.aggregate.name) else: raise LoopyError("left hand side of assignment '%s' must " @@ -258,170 +365,179 @@ def parse_insn(insn): temp_var_types = tuple(temp_var_types) del new_lhs - if insn_match is not None: - depends_on = None - depends_on_is_final = False - no_sync_with = None - insn_groups = None - conflicts_with_groups = None - insn_id = None - inames_to_dup = [] - priority = 0 - forced_iname_deps_is_final = False - forced_iname_deps = frozenset() - predicates = frozenset() - tags = () - atomicity = () - - if groups["options"] is not None: - for option in groups["options"].split(","): - option = option.strip() - if not option: - raise RuntimeError("empty option supplied") - - equal_idx = option.find("=") - if equal_idx == -1: - opt_key = option - opt_value = None - else: - opt_key = option[:equal_idx].strip() - opt_value = option[equal_idx+1:].strip() - - if opt_key == "id" and opt_value is not None: - insn_id = intern(opt_value) - elif opt_key == "id_prefix" and opt_value is not None: - insn_id = UniqueName(opt_value) - elif opt_key == "priority" and opt_value is not None: - priority = int(opt_value) - elif opt_key == "dup" and opt_value is not None: - for value in opt_value.split(":"): - arrow_idx = value.find("->") - if arrow_idx >= 0: - inames_to_dup.append( - (value[:arrow_idx], value[arrow_idx+2:])) - else: - inames_to_dup.append((value, None)) - - elif opt_key == "dep" and opt_value is not None: - if opt_value.startswith("*"): - depends_on_is_final = True - opt_value = (opt_value[1:]).strip() - - depends_on = frozenset( - intern(dep.strip()) for dep in opt_value.split(":") - if dep.strip()) - - elif opt_key == "nosync" and opt_value is not None: - no_sync_with = frozenset( - intern(dep.strip()) for dep in opt_value.split(":") - if dep.strip()) - - elif opt_key == "groups" and opt_value is not None: - insn_groups = frozenset( - intern(grp.strip()) for grp in opt_value.split(":") - if grp.strip()) - - elif opt_key == "conflicts" and opt_value is not None: - conflicts_with_groups = frozenset( - intern(grp.strip()) for grp in opt_value.split(":") - if grp.strip()) - - elif opt_key == "inames" and opt_value is not None: - if opt_value.startswith("+"): - forced_iname_deps_is_final = False - opt_value = (opt_value[1:]).strip() - else: - forced_iname_deps_is_final = True + insn_options = parse_insn_options( + insn_options, + groups["options"], + assignee_names=assignee_names) - forced_iname_deps = intern_frozenset_of_ids(opt_value.split(":")) + insn_id = insn_options.pop("insn_id", None) + inames_to_dup = insn_options.pop("inames_to_dup", []) - elif opt_key == "if" and opt_value is not None: - predicates = intern_frozenset_of_ids(opt_value.split(":")) + kwargs = dict( + id=( + intern(insn_id) + if isinstance(insn_id, str) + else insn_id), + **insn_options) - elif opt_key == "tags" and opt_value is not None: - tags = tuple( - tag.strip() for tag in opt_value.split(":") - if tag.strip()) + from loopy.kernel.data import make_assignment + return make_assignment( + lhs, rhs, temp_var_types, **kwargs + ), inames_to_dup - elif opt_key == "atomic": - if len(assignee_names) != 1: - raise LoopyError("atomic operations with more than one " - "left-hand side not supported") - assignee_name, = assignee_names +# }}} - if opt_value is None: - atomicity = atomicity + ( - lp.AtomicUpdate(assignee_name),) - else: - for v in opt_value.split(":"): - if v == "init": - atomicity = atomicity + ( - lp.AtomicInit(assignee_name),) - else: - raise LoopyError("atomicity directive not " - "understood: %s" - % v) - del assignee_name - else: - raise ValueError( - "unrecognized instruction option '%s' " - "(maybe a missing/extraneous =value?)" - % opt_key) - - kwargs = dict( - id=( - intern(insn_id) - if isinstance(insn_id, str) - else insn_id), - depends_on=depends_on, - depends_on_is_final=depends_on_is_final, - no_sync_with=no_sync_with, - groups=insn_groups, - conflicts_with_groups=conflicts_with_groups, - forced_iname_deps_is_final=forced_iname_deps_is_final, - forced_iname_deps=forced_iname_deps, - priority=priority, - predicates=predicates, - tags=tags, - atomicity=atomicity) - - from loopy.kernel.data import make_assignment - return make_assignment( - lhs, rhs, temp_var_types, **kwargs - ), inames_to_dup - - -def parse_if_necessary(insn, defines): - if isinstance(insn, InstructionBase): - yield insn.copy( - id=intern(insn.id) if isinstance(insn.id, str) else insn.id, - depends_on=frozenset(intern(dep) for dep in insn.depends_on), - groups=frozenset(intern(grp) for grp in insn.groups), - conflicts_with_groups=frozenset( - intern(grp) for grp in insn.conflicts_with_groups), - forced_iname_deps=frozenset( - intern(iname) for iname in insn.forced_iname_deps), - predicates=frozenset( - intern(pred) for pred in insn.predicates), - ), [] - return - elif not isinstance(insn, str): - raise TypeError("Instructions must be either an Instruction " - "instance or a parseable string. got '%s' instead." - % type(insn)) - - for insn in insn.split("\n"): - comment_start = insn.find("#") - if comment_start >= 0: - insn = insn[:comment_start] - - insn = insn.strip() - if not insn: +# {{{ parse_subst_rule + +def parse_subst_rule(groups): + from loopy.symbolic import parse + try: + lhs = parse(groups["lhs"]) + except: + print("While parsing left hand side '%s', " + "the following error occurred:" % groups["lhs"]) + raise + + try: + rhs = parse(groups["rhs"]) + except: + print("While parsing right hand side '%s', " + "the following error occurred:" % groups["rhs"]) + raise + + from pymbolic.primitives import Variable, Call + if isinstance(lhs, Variable): + subst_name = lhs.name + arg_names = [] + elif isinstance(lhs, Call): + if not isinstance(lhs.function, Variable): + raise RuntimeError("Invalid substitution rule left-hand side") + subst_name = lhs.function.name + arg_names = [] + + for i, arg in enumerate(lhs.parameters): + if not isinstance(arg, Variable): + raise RuntimeError("Invalid substitution rule " + "left-hand side: %s--arg number %d " + "is not a variable" % (lhs, i)) + arg_names.append(arg.name) + else: + raise RuntimeError("Invalid substitution rule left-hand side") + + return SubstitutionRule( + name=subst_name, + arguments=tuple(arg_names), + expression=rhs) + +# }}} + + +# {{{ parse_instructions + +def parse_instructions(instructions, defines): + if isinstance(instructions, str): + instructions = [instructions] + + parsed_instructions = [] + + # {{{ pass 1: interning, comments, whitespace, defines + + for insn in instructions: + if isinstance(insn, InstructionBase): + parsed_instructions.append( + insn.copy( + id=intern(insn.id) if isinstance(insn.id, str) else insn.id, + depends_on=frozenset(intern(dep) for dep in insn.depends_on), + groups=frozenset(intern(grp) for grp in insn.groups), + conflicts_with_groups=frozenset( + intern(grp) for grp in insn.conflicts_with_groups), + forced_iname_deps=frozenset( + intern(iname) for iname in insn.forced_iname_deps), + predicates=frozenset( + intern(pred) for pred in insn.predicates), + )) + continue + + elif not isinstance(insn, str): + raise TypeError("Instructions must be either an Instruction " + "instance or a parseable string. got '%s' instead." + % type(insn)) + + for insn in insn.split("\n"): + comment_start = insn.find("#") + if comment_start >= 0: + insn = insn[:comment_start] + + insn = insn.strip() + if not insn: + continue + + for sub_insn in expand_defines(insn, defines, single_valued=False): + parsed_instructions.append(sub_insn) + + # }}} + + instructions = parsed_instructions + parsed_instructions = [] + substitutions = {} + inames_to_dup = [] # one for each parsed_instruction + + # {{{ pass 2: parsing + + insn_options_stack = [get_default_insn_options_dict()] + + for insn in instructions: + if isinstance(insn, InstructionBase): + parsed_instructions.append(insn) + inames_to_dup.append([]) + continue + + with_options_match = WITH_OPTIONS_RE.match(insn) + if with_options_match is not None: + insn_options_stack.append( + parse_insn_options( + insn_options_stack[-1], + with_options_match.group("options"))) + continue + + for_match = FOR_RE.match(insn) + if for_match is not None: + options = insn_options_stack[-1].copy() + options["forced_iname_deps"] = ( + options.get("forced_iname_deps", frozenset()) + | frozenset( + iname.strip() + for iname in for_match.group("inames").split(","))) + options["forced_iname_deps_is_final"] = True + + insn_options_stack.append(options) + del options + continue + + if insn == "end": + insn_options_stack.pop() + continue + + subst_match = SUBST_RE.match(insn) + if subst_match is not None: + subst = parse_subst_rule(subst_match.groupdict()) + substitutions[subst.name] = subst continue - for sub_insn in expand_defines(insn, defines, single_valued=False): - yield parse_insn(sub_insn) + insn_match = INSN_RE.match(insn) + if insn_match is not None: + insn, insn_inames_to_dup = parse_insn( + insn_match.groupdict(), insn_options_stack[-1]) + parsed_instructions.append(insn) + inames_to_dup.append(insn_inames_to_dup) + continue + + raise RuntimeError("instruction parse error: %s" % insn) + + # }}} + + return parsed_instructions, inames_to_dup, substitutions # }}} @@ -551,7 +667,7 @@ class ArgumentGuesser: from loopy.symbolic import get_dependencies for insn in instructions: if isinstance(insn, MultiAssignmentBase): - for assignee_var_name, _ in insn.assignees_and_indices(): + for assignee_var_name in insn.assignee_var_names(): self.all_written_names.add(assignee_var_name) self.all_names.update(get_dependencies( self.submap(insn.assignees))) @@ -619,8 +735,8 @@ class ArgumentGuesser: for insn in self.instructions: if isinstance(insn, MultiAssignmentBase): - for (assignee_var_name, _), temp_var_type in zip( - insn.assignees_and_indices(), + for assignee_var_name, temp_var_type in zip( + insn.assignee_var_names(), insn.temp_var_types): if temp_var_type is not None: temp_var_names.add(assignee_var_name) @@ -655,33 +771,6 @@ class ArgumentGuesser: # }}} -# {{{ tag reduction inames as sequential - -def tag_reduction_inames_as_sequential(knl): - result = set() - - for insn in knl.instructions: - result.update(insn.reduction_inames()) - - from loopy.kernel.data import ParallelTag, ForceSequentialTag - - new_iname_to_tag = {} - for iname in result: - tag = knl.iname_to_tag.get(iname) - if tag is not None and isinstance(tag, ParallelTag): - raise RuntimeError("inconsistency detected: " - "reduction iname '%s' has " - "a parallel tag" % iname) - - if tag is None: - new_iname_to_tag[iname] = ForceSequentialTag() - - from loopy import tag_inames - return tag_inames(knl, new_iname_to_tag) - -# }}} - - # {{{ sanity checking def check_for_duplicate_names(knl): @@ -740,7 +829,7 @@ def check_written_variable_names(knl): | set(six.iterkeys(knl.temporary_variables))) for insn in knl.instructions: - for var_name, _ in insn.assignees_and_indices(): + for var_name in insn.assignee_var_names(): if var_name not in admissible_vars: raise RuntimeError("variable '%s' not declared or not " "allowed for writing" % var_name) @@ -837,8 +926,8 @@ def create_temporaries(knl, default_order): for insn in knl.instructions: if isinstance(insn, MultiAssignmentBase): - for (assignee_name, _), temp_var_type in zip( - insn.assignees_and_indices(), + for assignee_name, temp_var_type in zip( + insn.assignee_var_names(), insn.temp_var_types): if temp_var_type is None: @@ -882,7 +971,6 @@ def determine_shapes_of_temporaries(knl): new_temp_vars = knl.temporary_variables.copy() from loopy.symbolic import AccessRangeMapper - from pymbolic import var import loopy as lp new_temp_vars = {} @@ -890,10 +978,8 @@ def determine_shapes_of_temporaries(knl): if tv.shape is lp.auto or tv.base_indices is lp.auto: armap = AccessRangeMapper(knl, tv.name) for insn in knl.instructions: - for assignee_name, assignee_index in insn.assignees_and_indices(): - if assignee_index: - armap(var(assignee_name).index(assignee_index), - knl.insn_inames(insn)) + for assignee in insn.assignees: + armap(assignee, knl.insn_inames(insn)) if armap.access_range is not None: base_indices, shape = list(zip(*[ @@ -976,7 +1062,6 @@ def guess_arg_shape_if_requested(kernel, default_order): from traceback import print_exc print_exc() - from loopy.diagnostic import LoopyError raise LoopyError( "Failed to (automatically, as requested) find " "shape/strides for argument '%s'. " @@ -986,6 +1071,14 @@ def guess_arg_shape_if_requested(kernel, default_order): if armap.access_range is None: if armap.bad_subscripts: + from loopy.symbolic import LinearSubscript + if any(isinstance(sub, LinearSubscript) + for sub in armap.bad_subscripts): + raise LoopyError("cannot determine access range for '%s': " + "linear subscript(s) in '%s'" + % (arg.name, ", ".join( + str(i) for i in armap.bad_subscripts))) + n_axes_in_subscripts = set( len(sub.index_tuple) for sub in armap.bad_subscripts) @@ -999,7 +1092,7 @@ def guess_arg_shape_if_requested(kernel, default_order): # Leave shape undetermined--we can live with that for 1D. shape = (None,) else: - raise RuntimeError("cannot determine access range for '%s': " + raise LoopyError("cannot determine access range for '%s': " "undetermined index in subscript(s) '%s'" % (arg.name, ", ".join( str(i) for i in armap.bad_subscripts))) @@ -1106,6 +1199,19 @@ def resolve_wildcard_deps(knl): # }}} +# {{{ add inferred iname deps + +def add_inferred_inames(knl): + from loopy.kernel.tools import find_all_insn_inames + insn_inames = find_all_insn_inames(knl) + + return knl.copy(instructions=[ + insn.copy(forced_iname_deps=insn_inames[insn.id]) + for insn in knl.instructions]) + +# }}} + + # {{{ kernel creation top-level def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): @@ -1158,14 +1264,6 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): :arg preamble_generators: a list of functions of signature (seen_dtypes, seen_functions) where seen_functions is a set of (name, c_name, arg_dtypes), generating extra entries for *preambles*. - :arg defines: a dictionary of replacements to be made in instructions given - as strings before parsing. A macro instance intended to be replaced - should look like "MACRO" in the instruction code. The expansion given - in this parameter is allowed to be a list. In this case, instructions - are generated for *each* combination of macro values. - - These defines may also be used in the domain and in argument shapes and - strides. They are expanded only upon kernel creation. :arg default_order: "C" (default) or "F" :arg default_offset: 0 or :class:`loopy.auto`. The default value of *offset* in :attr:`GlobalArg` for guessed arguments. @@ -1199,6 +1297,12 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): flags = kwargs.pop("flags", None) target = kwargs.pop("target", None) + if defines: + from warnings import warn + warn("'defines' argument to make_kernel is deprecated. " + "Use lp.fix_parameters instead", + DeprecationWarning, stacklevel=2) + if target is None: from loopy import _DEFAULT_TARGET target = _DEFAULT_TARGET @@ -1256,34 +1360,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # }}} - # {{{ instruction/subst parsing - - parsed_instructions = [] - kwargs["substitutions"] = substitutions = {} - inames_to_dup = [] - - if isinstance(instructions, str): - instructions = [instructions] - - for insn in instructions: - for new_insn, insn_inames_to_dup in parse_if_necessary(insn, defines): - if isinstance(new_insn, InstructionBase): - parsed_instructions.append(new_insn) - - # Need to maintain 1-to-1 correspondence to instructions - inames_to_dup.append(insn_inames_to_dup) - - elif isinstance(new_insn, SubstitutionRule): - substitutions[new_insn.name] = new_insn - - assert not insn_inames_to_dup - else: - raise RuntimeError("unexpected type in instruction parsing") - - instructions = parsed_instructions - del parsed_instructions - - # }}} + instructions, inames_to_dup, substitutions = \ + parse_instructions(instructions, defines) # {{{ find/create isl_context @@ -1307,6 +1385,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): kernel_args = arg_guesser.convert_names_to_full_args(kernel_args) kernel_args = arg_guesser.guess_kernel_args_if_requested(kernel_args) + kwargs["substitutions"] = substitutions + from loopy.kernel import LoopKernel knl = LoopKernel(domains, instructions, kernel_args, temporary_variables=temporary_variables, @@ -1324,8 +1404,21 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_nonexistent_iname_deps(knl) knl = create_temporaries(knl, default_order) + # ------------------------------------------------------------------------- + # Ordering dependency: + # ------------------------------------------------------------------------- + # Must create temporaries before inferring inames (because those temporaries + # mediate dependencies that are then used for iname propagation.) + # ------------------------------------------------------------------------- + # NOTE: add_inferred_inames will be phased out and throws warnings if it + # does something. + knl = add_inferred_inames(knl) + # ------------------------------------------------------------------------- + # Ordering dependency: + # ------------------------------------------------------------------------- + # Must infer inames before determining shapes. + # ------------------------------------------------------------------------- knl = determine_shapes_of_temporaries(knl) - knl = tag_reduction_inames_as_sequential(knl) knl = expand_defines_in_shapes(knl, defines) knl = guess_arg_shape_if_requested(knl, default_order) knl = apply_default_order_to_args(knl, default_order) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 26f4f64a60e90aed59cedcbaa0e1cd2b117f6af1..5b6ff31cbcbdb404b7e812c6ae1f25a1a9323b88 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -506,7 +506,7 @@ class InstructionBase(Record): a :class:`frozenset` of :attr:`id` values of :class:`Instruction` instances with which no barrier synchronization is necessary, even given the existence - of a dependency chain and apparently conflicting writes + of a dependency chain and apparently conflicting access .. rubric:: Conditionals @@ -542,7 +542,8 @@ class InstructionBase(Record): A :class:`set` of inames into which the instruction may need to be boosted, as a heuristic help for the scheduler. - Also allowed to be *None*. + Also allowed to be *None* to indicate that this hasn't been + decided yet. .. rubric:: Tagging @@ -552,12 +553,11 @@ class InstructionBase(Record): of instructions. .. automethod:: __init__ - .. automethod:: assignees_and_indices - .. automethod:: assignee_name + .. automethod:: assignee_var_names + .. automethod:: assignee_subscript_deps .. automethod:: with_transformed_expressions .. automethod:: write_dependency_names .. automethod:: dependency_names - .. automethod:: assignee_var_names .. automethod:: copy """ @@ -571,7 +571,8 @@ class InstructionBase(Record): def __init__(self, id, depends_on, depends_on_is_final, groups, conflicts_with_groups, no_sync_with, - forced_iname_deps_is_final, forced_iname_deps, priority, + forced_iname_deps_is_final, forced_iname_deps, + priority, boostable, boostable_into, predicates, tags, insn_deps=None, insn_deps_is_final=None): @@ -658,10 +659,15 @@ class InstructionBase(Record): def reduction_inames(self): raise NotImplementedError - def assignees_and_indices(self): - """Return a list of tuples *(assignee_var_name, subscript)* - where assignee_var_name is a string representing an assigned - variable name and subscript is a :class:`tuple`. + def assignee_var_names(self): + """Return a tuple of tuples of assignee variable names, one + for each quantity being assigned to. + """ + raise NotImplementedError + + def assignee_subscript_deps(self): + """Return a list of sets of variable names referred to in the subscripts + of the quantities being assigned to, one for each assignee. """ raise NotImplementedError @@ -676,20 +682,20 @@ class InstructionBase(Record): @property def assignee_name(self): - """A convenience wrapper around :meth:`assignees_and_indices` + """A convenience wrapper around :meth:`assignee_var_names` that returns the the name of the variable being assigned. If more than one variable is being modified in the instruction, :raise:`ValueError` is raised. """ - aai = self.assignees_and_indices() + names = self.assignee_var_names() - if len(aai) != 1: + if len(names) != 1: raise ValueError("expected exactly one assignment in instruction " "on which assignee_name is being called, found %d" - % len(aai)) + % len(names)) - (name, _), = aai + name, = names return name @memoize_method @@ -700,19 +706,15 @@ class InstructionBase(Record): """ result = set() - for assignee, indices in self.assignees_and_indices(): - result.add(assignee) + for assignee in self.assignees: from loopy.symbolic import get_dependencies - result.update(get_dependencies(indices)) + result.update(get_dependencies(assignee)) return frozenset(result) def dependency_names(self): return self.read_dependency_names() | self.write_dependency_names() - def assignee_var_names(self): - return (var_name for var_name, _ in self.assignees_and_indices()) - def get_str_options(self): result = [] @@ -730,6 +732,8 @@ class InstructionBase(Record): if self.depends_on: result.append("deps="+":".join(self.depends_on)) + if self.no_sync_with: + result.append("nosync="+":".join(self.no_sync_with)) if self.groups: result.append("groups=%s" % ":".join(self.groups)) if self.conflicts_with_groups: @@ -738,7 +742,7 @@ class InstructionBase(Record): result.append("priority=%d" % self.priority) if self.tags: result.append("tags=%s" % ":".join(self.tags)) - if hasattr(self, "atomicity"): + if hasattr(self, "atomicity") and self.atomicity: result.append("atomic=%s" % ":".join(str(a) for a in self.atomicity)) return result @@ -806,15 +810,38 @@ class InstructionBase(Record): # }}} -def _get_assignee_and_index(expr): +def _get_assignee_var_name(expr): from pymbolic.primitives import Variable, Subscript + from loopy.symbolic import LinearSubscript + if isinstance(expr, Variable): - return (expr.name, ()) + return expr.name + elif isinstance(expr, Subscript): agg = expr.aggregate assert isinstance(agg, Variable) - return (agg.name, expr.index_tuple) + return agg.name + + elif isinstance(expr, LinearSubscript): + agg = expr.aggregate + assert isinstance(agg, Variable) + + return agg.name + else: + raise RuntimeError("invalid lvalue '%s'" % expr) + + +def _get_assignee_subscript_deps(expr): + from pymbolic.primitives import Variable, Subscript + from loopy.symbolic import LinearSubscript, get_dependencies + + if isinstance(expr, Variable): + return frozenset() + elif isinstance(expr, Subscript): + return get_dependencies(expr.index) + elif isinstance(expr, LinearSubscript): + return get_dependencies(expr.index) else: raise RuntimeError("invalid lvalue '%s'" % expr) @@ -976,8 +1003,8 @@ class MultiAssignmentBase(InstructionBase): def read_dependency_names(self): from loopy.symbolic import get_dependencies result = get_dependencies(self.expression) - for _, subscript in self.assignees_and_indices(): - result = result | get_dependencies(subscript) + for subscript_deps in self.assignee_subscript_deps(): + result = result | subscript_deps processed_predicates = frozenset( pred.lstrip("!") for pred in self.predicates) @@ -1114,8 +1141,11 @@ class Assignment(MultiAssignmentBase): # {{{ implement InstructionBase interface @memoize_method - def assignees_and_indices(self): - return [_get_assignee_and_index(self.assignee)] + def assignee_var_names(self): + return (_get_assignee_var_name(self.assignee),) + + def assignee_subscript_deps(self): + return (_get_assignee_subscript_deps(self.assignee),) def with_transformed_expressions(self, f, *args): return self.copy( @@ -1258,8 +1288,13 @@ class CallInstruction(MultiAssignmentBase): # {{{ implement InstructionBase interface @memoize_method - def assignees_and_indices(self): - return [_get_assignee_and_index(a) for a in self.assignees] + def assignee_var_names(self): + return tuple(_get_assignee_var_name(a) for a in self.assignees) + + def assignee_subscript_deps(self): + return tuple( + _get_assignee_subscript_deps(a) + for a in self.assignees) def with_transformed_expressions(self, f, *args): return self.copy( @@ -1451,17 +1486,21 @@ class CInstruction(InstructionBase): for name, iname_expr in self.iname_exprs: result.update(get_dependencies(iname_expr)) - for _, subscript in self.assignees_and_indices(): - result.update(get_dependencies(subscript)) + for subscript_deps in self.assignee_subscript_deps(): + result.update(subscript_deps) return frozenset(result) | self.predicates def reduction_inames(self): return set() - def assignees_and_indices(self): - return [_get_assignee_and_index(expr) - for expr in self.assignees] + def assignee_var_names(self): + return tuple(_get_assignee_var_name(expr) for expr in self.assignees) + + def assignee_subscript_deps(self): + return tuple( + _get_assignee_subscript_deps(a) + for a in self.assignees) def with_transformed_expressions(self, f, *args): return self.copy( diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 7862ceb1efbed4f70ebe0fa81d07de84b12fca60..a4e6ab0d6b7cc3e884eadd0f044429e186b9118f 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -30,7 +30,7 @@ from six.moves import intern import numpy as np import islpy as isl from islpy import dim_type -from loopy.diagnostic import LoopyError +from loopy.diagnostic import LoopyError, warn_with_kernel import logging logger = logging.getLogger(__name__) @@ -125,7 +125,41 @@ def _add_and_infer_dtypes_overdetermined(knl, dtype_dict): # }}} -# {{{ find_all_insn_inames fixed point iteration +# {{{ find_all_insn_inames fixed point iteration (deprecated) + +def guess_iname_deps_based_on_var_use(kernel, insn, insn_id_to_inames=None): + # For all variables that insn depends on, find the intersection + # of iname deps of all writers, and add those to insn's + # dependencies. + + result = frozenset() + + writer_map = kernel.writer_map() + + for tv_name in (insn.read_dependency_names() & kernel.get_written_variables()): + tv_implicit_inames = None + + for writer_id in writer_map[tv_name]: + writer_insn = kernel.id_to_insn[writer_id] + if insn_id_to_inames is None: + writer_inames = writer_insn.forced_iname_deps + else: + writer_inames = insn_id_to_inames[writer_id] + + writer_implicit_inames = ( + writer_inames + - (writer_insn.write_dependency_names() & kernel.all_inames())) + if tv_implicit_inames is None: + tv_implicit_inames = writer_implicit_inames + else: + tv_implicit_inames = (tv_implicit_inames + & writer_implicit_inames) + + if tv_implicit_inames is not None: + result = result | tv_implicit_inames + + return result - insn.reduction_inames() + def find_all_insn_inames(kernel): logger.debug("%s: find_all_insn_inames: start" % kernel.name) @@ -166,8 +200,6 @@ def find_all_insn_inames(kernel): insn_id_to_inames[insn.id] = iname_deps insn_assignee_inames[insn.id] = write_deps & kernel.all_inames() - written_vars = kernel.get_written_variables() - # fixed point iteration until all iname dep sets have converged # Why is fixed point iteration necessary here? Consider the following @@ -190,32 +222,22 @@ def find_all_insn_inames(kernel): # {{{ depdency-based propagation - # For all variables that insn depends on, find the intersection - # of iname deps of all writers, and add those to insn's - # dependencies. - - for tv_name in (all_read_deps[insn.id] & written_vars): - implicit_inames = None - - for writer_id in writer_map[tv_name]: - writer_implicit_inames = ( - insn_id_to_inames[writer_id] - - insn_assignee_inames[writer_id]) - if implicit_inames is None: - implicit_inames = writer_implicit_inames - else: - implicit_inames = (implicit_inames - & writer_implicit_inames) - - inames_old = insn_id_to_inames[insn.id] - inames_new = (inames_old | implicit_inames) \ - - insn.reduction_inames() - insn_id_to_inames[insn.id] = inames_new - - if inames_new != inames_old: - did_something = True - logger.debug("%s: find_all_insn_inames: %s -> %s (dep-based)" % ( - kernel.name, insn.id, ", ".join(sorted(inames_new)))) + inames_old = insn_id_to_inames[insn.id] + inames_new = inames_old | guess_iname_deps_based_on_var_use( + kernel, insn, insn_id_to_inames) + + insn_id_to_inames[insn.id] = inames_new + + if inames_new != inames_old: + did_something = True + + warn_with_kernel(kernel, "inferred_iname", + "The iname(s) '%s' on instruction '%s' was " + "automatically added. " + "This is deprecated. Please add the iname " + "to the instruction " + "implicitly, e.g. by adding '{inames=...}" + % (inames_new-inames_old, insn.id)) # }}} @@ -245,8 +267,14 @@ def find_all_insn_inames(kernel): if inames_new != inames_old: did_something = True insn_id_to_inames[insn.id] = frozenset(inames_new) - logger.debug("%s: find_all_insn_inames: %s -> %s (domain-based)" % ( - kernel.name, insn.id, ", ".join(sorted(inames_new)))) + + warn_with_kernel(kernel, "inferred_iname", + "The iname(s) '%s' on instruction '%s' was " + "automatically added. " + "This is deprecated. Please add the iname " + "to the instruction " + "implicitly, e.g. by adding '{inames=...}" + % (inames_new-inames_old, insn.id)) # }}} diff --git a/loopy/maxima.py b/loopy/maxima.py index 29f974ff9d1f6f0a6e6d4311eca88201559e6854..738df86c4b94988b57d6dc01bcc22bb0ca62ac21 100644 --- a/loopy/maxima.py +++ b/loopy/maxima.py @@ -60,7 +60,7 @@ def get_loopy_instructions_as_maxima(kernel, prefix): my_variable_names = ( avn for insn in kernel.instructions - for avn, _ in insn.assignees_and_indices() + for avn in insn.assignee_var_names() ) from pymbolic import var @@ -89,7 +89,7 @@ def get_loopy_instructions_as_maxima(kernel, prefix): if dep not in written_insn_ids: write_insn(dep) - (aname, _), = insn.assignees_and_indices() + aname, = insn.assignee_var_names() result.append("%s%s : %s;" % ( prefix, aname, mstr(substitute(insn.expression)))) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 8b7be4f0407bb0b9e52f0b6915841af1c5dd7ac5..d333cdf18291546e3efaaa96a5468c9e9ed4a470 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -28,7 +28,10 @@ from loopy.diagnostic import ( LoopyError, WriteRaceConditionWarning, warn, LoopyAdvisory, DependencyTypeInferenceFailure) +import islpy as isl + from pytools.persistent_dict import PersistentDict + from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment @@ -137,8 +140,8 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): result_dtypes = type_inf_mapper(expr, multiple_types_ok=True) result = None - for (assignee, _), comp_dtype in zip( - writer_insn.assignees_and_indices(), result_dtypes): + for assignee, comp_dtype in zip( + writer_insn.assignee_var_names(), result_dtypes): if assignee == var_name: result = comp_dtype break @@ -302,12 +305,11 @@ def _get_compute_inames_tagged(kernel, insn, tag_base): def _get_assignee_inames_tagged(kernel, insn, tag_base, tv_name): - from loopy.symbolic import get_dependencies - return set(iname - for aname, aindices in insn.assignees_and_indices() - for iname in get_dependencies(aindices) - & kernel.all_inames() + for aname, adeps in zip( + insn.assignee_var_names(), + insn.assignee_subscript_deps()) + for iname in adeps & kernel.all_inames() if aname == tv_name if isinstance(kernel.iname_to_tag.get(iname), tag_base)) @@ -414,6 +416,9 @@ def find_temporary_scope(kernel): def add_default_dependencies(kernel): logger.debug("%s: default deps" % kernel.name) + from loopy.transform.subst import expand_subst + expanded_kernel = expand_subst(kernel) + writer_map = kernel.writer_map() arg_names = set(arg.name for arg in kernel.args) @@ -422,7 +427,7 @@ def add_default_dependencies(kernel): dep_map = dict( (insn.id, insn.read_dependency_names() & var_names) - for insn in kernel.instructions) + for insn in expanded_kernel.instructions) new_insns = [] for insn in kernel.instructions: @@ -463,7 +468,7 @@ def add_default_dependencies(kernel): # {{{ rewrite reduction to imperative form -def realize_reduction(kernel, insn_id_filter=None): +def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): """Rewrites reductions into their imperative form. With *insn_id_filter* specified, operate only on the instruction with an instruction id matching *insn_id_filter*. @@ -479,6 +484,7 @@ def realize_reduction(kernel, insn_id_filter=None): logger.debug("%s: realize reduction" % kernel.name) new_insns = [] + new_iname_tags = {} insn_id_gen = kernel.get_instruction_id_generator() @@ -488,30 +494,16 @@ def realize_reduction(kernel, insn_id_filter=None): from loopy.expression import TypeInferenceMapper type_inf_mapper = TypeInferenceMapper(kernel) - def map_reduction(expr, rec, multiple_values_ok=False): - # Only expand one level of reduction at a time, going from outermost to - # innermost. Otherwise we get the (iname + insn) dependencies wrong. - - try: - arg_dtype = type_inf_mapper(expr.expr) - except DependencyTypeInferenceFailure: - raise LoopyError("failed to determine type of accumulator for " - "reduction '%s'" % expr) - - arg_dtype = arg_dtype.with_target(kernel.target) - - reduction_dtypes = expr.operation.result_dtypes( - kernel, arg_dtype, expr.inames) - reduction_dtypes = tuple( - dt.with_target(kernel.target) for dt in reduction_dtypes) + # {{{ sequential - ncomp = len(reduction_dtypes) + def map_reduction_seq(expr, rec, nresults, arg_dtype, + reduction_dtypes): + outer_insn_inames = temp_kernel.insn_inames(insn) from pymbolic import var - acc_var_names = [ var_name_gen("acc_"+"_".join(expr.inames)) - for i in range(ncomp)] + for i in range(nresults)] acc_vars = tuple(var(n) for n in acc_var_names) from loopy.kernel.data import TemporaryVariable, temp_var_scope @@ -523,12 +515,6 @@ def realize_reduction(kernel, insn_id_filter=None): dtype=dtype, scope=temp_var_scope.PRIVATE) - outer_insn_inames = temp_kernel.insn_inames(insn) - bad_inames = frozenset(expr.inames) & outer_insn_inames - if bad_inames: - raise LoopyError("reduction used within loop(s) that it was " - "supposed to reduce over: " + ", ".join(bad_inames)) - init_id = insn_id_gen( "%s_%s_init" % (insn.id, "_".join(expr.inames))) @@ -562,25 +548,284 @@ def realize_reduction(kernel, insn_id_filter=None): generated_insns.append(reduction_insn) - new_insn_depends_on.add(reduction_insn.id) + new_insn_add_depends_on.add(reduction_insn.id) - if multiple_values_ok: - return acc_vars - else: + if nresults == 1: assert len(acc_vars) == 1 return acc_vars[0] + else: + return acc_vars + + # }}} + + # {{{ local-parallel + + def _get_int_iname_size(iname): + from loopy.isl_helpers import static_max_of_pw_aff + from loopy.symbolic import pw_aff_to_expr + size = pw_aff_to_expr( + static_max_of_pw_aff( + kernel.get_iname_bounds(iname).size, + constants_only=True)) + assert isinstance(size, six.integer_types) + return size + + def _make_slab_set(iname, size): + v = isl.make_zero_and_vars([iname]) + bs, = ( + v[0].le_set(v[iname]) + & + v[iname].lt_set(v[0] + size)).get_basic_sets() + return bs + + def map_reduction_local(expr, rec, nresults, arg_dtype, + reduction_dtypes): + red_iname, = expr.inames + + size = _get_int_iname_size(red_iname) + + outer_insn_inames = temp_kernel.insn_inames(insn) + + from loopy.kernel.data import LocalIndexTagBase + outer_local_inames = tuple( + oiname + for oiname in outer_insn_inames + if isinstance( + kernel.iname_to_tag.get(oiname), + LocalIndexTagBase)) + + from pymbolic import var + outer_local_iname_vars = tuple( + var(oiname) for oiname in outer_local_inames) + + outer_local_iname_sizes = tuple( + _get_int_iname_size(oiname) + for oiname in outer_local_inames) + + # {{{ add separate iname to carry out the reduction + + # Doing this sheds any odd conditionals that may be active + # on our red_iname. + + base_exec_iname = var_name_gen("red_"+red_iname) + domains.append(_make_slab_set(base_exec_iname, size)) + new_iname_tags[base_exec_iname] = kernel.iname_to_tag[red_iname] + + # }}} + + acc_var_names = [ + var_name_gen("acc_"+red_iname) + for i in range(nresults)] + acc_vars = tuple(var(n) for n in acc_var_names) + + from loopy.kernel.data import TemporaryVariable, temp_var_scope + for name, dtype in zip(acc_var_names, reduction_dtypes): + new_temporary_variables[name] = TemporaryVariable( + name=name, + shape=outer_local_iname_sizes + (size,), + dtype=dtype, + scope=temp_var_scope.LOCAL) + + base_iname_deps = outer_insn_inames - frozenset(expr.inames) + + neutral = expr.operation.neutral_element(arg_dtype, expr.inames) + + init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname)) + init_insn = make_assignment( + id=init_id, + assignees=tuple( + acc_var[outer_local_iname_vars + (var(base_exec_iname),)] + for acc_var in acc_vars), + expression=neutral, + forced_iname_deps=base_iname_deps | frozenset([base_exec_iname]), + forced_iname_deps_is_final=insn.forced_iname_deps_is_final, + depends_on=frozenset()) + generated_insns.append(init_insn) + + transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname)) + transfer_insn = make_assignment( + id=transfer_id, + assignees=tuple( + acc_var[outer_local_iname_vars + (var(red_iname),)] + for acc_var in acc_vars), + expression=expr.operation( + arg_dtype, neutral, expr.expr, expr.inames), + forced_iname_deps=( + (outer_insn_inames - frozenset(expr.inames)) + | frozenset([red_iname])), + forced_iname_deps_is_final=insn.forced_iname_deps_is_final, + depends_on=frozenset([init_id]) | insn.depends_on, + no_sync_with=frozenset([init_id])) + generated_insns.append(transfer_insn) + + def _strip_if_scalar(c): + if len(acc_vars) == 1: + return c[0] + else: + return c + + cur_size = 1 + while cur_size < size: + cur_size *= 2 + + prev_id = transfer_id + bound = size + + istage = 0 + while cur_size > 1: + + new_size = cur_size // 2 + assert new_size * 2 == cur_size + + stage_exec_iname = var_name_gen("red_%s_s%d" % (red_iname, istage)) + domains.append(_make_slab_set(stage_exec_iname, bound-new_size)) + new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[red_iname] + + stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage)) + stage_insn = make_assignment( + id=stage_id, + assignees=tuple( + acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars), + expression=expr.operation( + arg_dtype, + _strip_if_scalar([ + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars]), + _strip_if_scalar([ + acc_var[ + outer_local_iname_vars + ( + var(stage_exec_iname) + new_size,)] + for acc_var in acc_vars]), + expr.inames), + forced_iname_deps=( + base_iname_deps | frozenset([stage_exec_iname])), + forced_iname_deps_is_final=insn.forced_iname_deps_is_final, + depends_on=frozenset([prev_id]), + ) + + generated_insns.append(stage_insn) + prev_id = stage_id + + cur_size = new_size + bound = cur_size + istage += 1 + + new_insn_add_depends_on.add(prev_id) + new_insn_add_no_sync_with.add(prev_id) + new_insn_add_forced_iname_deps.add(stage_exec_iname or base_exec_iname) + + if nresults == 1: + assert len(acc_vars) == 1 + return acc_vars[0][outer_local_iname_vars + (0,)] + else: + return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars] + # }}} + + # {{{ seq/par dispatch + + def map_reduction(expr, rec, nresults=1): + # Only expand one level of reduction at a time, going from outermost to + # innermost. Otherwise we get the (iname + insn) dependencies wrong. + + try: + arg_dtype = type_inf_mapper(expr.expr) + except DependencyTypeInferenceFailure: + if unknown_types_ok: + arg_dtype = lp.auto + + reduction_dtypes = (lp.auto,)*nresults + + else: + raise LoopyError("failed to determine type of accumulator for " + "reduction '%s'" % expr) + else: + arg_dtype = arg_dtype.with_target(kernel.target) + + reduction_dtypes = expr.operation.result_dtypes( + kernel, arg_dtype, expr.inames) + reduction_dtypes = tuple( + dt.with_target(kernel.target) for dt in reduction_dtypes) + + outer_insn_inames = temp_kernel.insn_inames(insn) + bad_inames = frozenset(expr.inames) & outer_insn_inames + if bad_inames: + raise LoopyError("reduction used within loop(s) that it was " + "supposed to reduce over: " + ", ".join(bad_inames)) + + n_sequential = 0 + n_local_par = 0 + + from loopy.kernel.data import ( + LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, + ParallelTag) + for iname in expr.inames: + iname_tag = kernel.iname_to_tag.get(iname) + + if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)): + # These are nominally parallel, but we can live with + # them as sequential. + n_sequential += 1 + + elif isinstance(iname_tag, LocalIndexTagBase): + n_local_par += 1 + + elif isinstance(iname_tag, (ParallelTag, VectorizeTag)): + raise LoopyError("the only form of parallelism supported " + "by reductions is 'local'--found iname '%s' " + "tagged '%s'" + % (iname, type(iname_tag).__name__)) + + else: + n_sequential += 1 + + if n_local_par and n_sequential: + raise LoopyError("Reduction over '%s' contains both parallel and " + "sequential inames. It must be split " + "(using split_reduction_{in,out}ward) " + "before code generation." + % ", ".join(expr.inames)) + + if n_local_par > 1: + raise LoopyError("Reduction over '%s' contains more than" + "one parallel iname. It must be split " + "(using split_reduction_{in,out}ward) " + "before code generation." + % ", ".join(expr.inames)) + + if n_sequential: + assert n_local_par == 0 + return map_reduction_seq(expr, rec, nresults, arg_dtype, + reduction_dtypes) + elif n_local_par: + return map_reduction_local(expr, rec, nresults, arg_dtype, + reduction_dtypes) + else: + from loopy.diagnostic import warn + warn(kernel, "empty_reduction", + "Empty reduction found (no inames to reduce over). " + "Eliminating.") + + return expr.expr + + # }}} from loopy.symbolic import ReductionCallbackMapper cb_mapper = ReductionCallbackMapper(map_reduction) insn_queue = kernel.instructions[:] insn_id_replacements = {} + domains = kernel.domains[:] temp_kernel = kernel import loopy as lp while insn_queue: - new_insn_depends_on = set() + new_insn_add_depends_on = set() + new_insn_add_no_sync_with = set() + new_insn_add_forced_iname_deps = set() + generated_insns = [] insn = insn_queue.pop(0) @@ -590,10 +835,12 @@ def realize_reduction(kernel, insn_id_filter=None): new_insns.append(insn) continue + nresults = len(insn.assignees) + # Run reduction expansion. from loopy.symbolic import Reduction - if isinstance(insn.expression, Reduction): - new_expressions = cb_mapper(insn.expression, multiple_values_ok=True) + if isinstance(insn.expression, Reduction) and nresults > 1: + new_expressions = cb_mapper(insn.expression, nresults=nresults) else: new_expressions = (cb_mapper(insn.expression),) @@ -603,8 +850,12 @@ def realize_reduction(kernel, insn_id_filter=None): kwargs = insn.get_copy_kwargs( depends_on=insn.depends_on - | frozenset(new_insn_depends_on), - forced_iname_deps=temp_kernel.insn_inames(insn)) + | frozenset(new_insn_add_depends_on), + no_sync_with=insn.no_sync_with + | frozenset(new_insn_add_no_sync_with), + forced_iname_deps=( + temp_kernel.insn_inames(insn) + | new_insn_add_forced_iname_deps)) kwargs.pop("id") kwargs.pop("expression") @@ -624,8 +875,6 @@ def realize_reduction(kernel, insn_id_filter=None): insn_id_replacements[insn.id] = [ rinsn.id for rinsn in replacement_insns] - # FIXME: Track dep rename - insn_queue = generated_insns + replacement_insns + insn_queue # The reduction expander needs an up-to-date kernel @@ -633,27 +882,35 @@ def realize_reduction(kernel, insn_id_filter=None): temp_kernel = kernel.copy( instructions=new_insns + insn_queue, - temporary_variables=new_temporary_variables) + temporary_variables=new_temporary_variables, + domains=domains) + temp_kernel = lp.replace_instruction_ids( + temp_kernel, insn_id_replacements) else: # nothing happened, we're done with insn - assert not new_insn_depends_on + assert not new_insn_add_depends_on new_insns.append(insn) kernel = kernel.copy( instructions=new_insns, - temporary_variables=new_temporary_variables) + temporary_variables=new_temporary_variables, + domains=domains) + + kernel = lp.replace_instruction_ids(kernel, insn_id_replacements) + + kernel = lp.tag_inames(kernel, new_iname_tags) - return lp.replace_instruction_ids(kernel, insn_id_replacements) + return kernel # }}} -# {{{ find boostability of instructions +# {{{ find idempotence ("boostability") of instructions -def find_boostability(kernel): - logger.debug("%s: boostability" % kernel.name) +def find_idempotence(kernel): + logger.debug("%s: idempotence" % kernel.name) writer_map = kernel.writer_map() @@ -661,16 +918,20 @@ def find_boostability(kernel): var_names = arg_names | set(six.iterkeys(kernel.temporary_variables)) - dep_map = dict( + reads_map = dict( (insn.id, insn.read_dependency_names() & var_names) for insn in kernel.instructions) - non_boostable_vars = set() + non_idempotently_updated_vars = set() + + # FIXME: This can be made more efficient by simply starting + # from all written variables and not even considering + # instructions as the start of the first pass. new_insns = [] for insn in kernel.instructions: all_my_var_writers = set() - for var in dep_map[insn.id]: + for var in reads_map[insn.id]: var_writers = writer_map.get(var, set()) all_my_var_writers |= var_writers @@ -680,7 +941,7 @@ def find_boostability(kernel): last_all_my_var_writers = all_my_var_writers for writer_insn_id in last_all_my_var_writers: - for var in dep_map[writer_insn_id]: + for var in reads_map[writer_insn_id]: all_my_var_writers = \ all_my_var_writers | writer_map.get(var, set()) @@ -692,19 +953,20 @@ def find_boostability(kernel): boostable = insn.id not in all_my_var_writers if not boostable: - non_boostable_vars.update( - var_name for var_name, _ in insn.assignees_and_indices()) + non_idempotently_updated_vars.update( + insn.assignee_var_names()) insn = insn.copy(boostable=boostable) new_insns.append(insn) - # {{{ remove boostability from isns that access non-boostable vars + # {{{ remove boostability from isns that access non-idempotently updated vars new2_insns = [] for insn in new_insns: accessed_vars = insn.dependency_names() - boostable = insn.boostable and not bool(non_boostable_vars & accessed_vars) + boostable = insn.boostable and not bool( + non_idempotently_updated_vars & accessed_vars) new2_insns.append(insn.copy(boostable=boostable)) # }}} @@ -827,7 +1089,7 @@ def preprocess_kernel(kernel, device=None): # because it manipulates the depends_on field, which could prevent # defaults from being applied. - kernel = realize_reduction(kernel) + kernel = realize_reduction(kernel, unknown_types_ok=False) # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators @@ -837,7 +1099,7 @@ def preprocess_kernel(kernel, device=None): kernel = add_axes_to_temporaries_for_ilp_and_vec(kernel) kernel = find_temporary_scope(kernel) - kernel = find_boostability(kernel) + kernel = find_idempotence(kernel) kernel = limit_boostability(kernel) kernel = kernel.target.preprocess(kernel) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 89c9ffcdaaceb78778231101411e1f017f7c4f85..8b2a8190cedb65c034e95a8f4cd6b0c9a2bd65e3 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -142,7 +142,7 @@ def generate_sub_sched_items(schedule, start_idx): def get_insn_ids_for_block_at(schedule, start_idx): return frozenset( sub_sched_item.insn_id - for sub_sched_item in generate_sub_sched_items( + for i, sub_sched_item in generate_sub_sched_items( schedule, start_idx) if isinstance(sub_sched_item, RunInstruction)) @@ -569,7 +569,7 @@ def generate_loop_schedules_internal( print() print(75*"=") print("KERNEL:") - print(kernel) + print(kernel.stringify(with_dependencies=True)) print(75*"=") print("CURRENT SCHEDULE:") print(dump_schedule(sched_state.kernel, sched_state.schedule)) @@ -1486,15 +1486,6 @@ def generate_loop_schedules(kernel, debug_args={}): gen_sched = insert_barriers(kernel, gen_sched, reverse=False, kind="global") - """ - for sched_item in gen_sched: - if ( - isinstance(sched_item, Barrier) - and sched_item.kind == "global"): - raise LoopyError("kernel requires a global barrier %s" - % sched_item.comment) - """ - logger.info("%s: barrier insertion: local" % kernel.name) gen_sched = insert_barriers(kernel, gen_sched, diff --git a/loopy/statistics.py b/loopy/statistics.py index 8a31f67fbf85bf8a370f17d2775569697f247502..932acab50b29163345585b10edca0e020f8183b8 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -72,7 +72,7 @@ class ToCountMap: def __radd__(self, other): if other != 0: raise ValueError("ToCountMap: Attempted to add ToCountMap " - "to {} {}. ToCountMap may only be added to " + "to {0} {1}. ToCountMap may only be added to " "0 and other ToCountMap objects." .format(type(other), other)) return self @@ -84,7 +84,7 @@ class ToCountMap: for index in self.dict.keys())) else: raise ValueError("ToCountMap: Attempted to multiply " - "ToCountMap by {} {}." + "ToCountMap by {0} {1}." .format(type(other), other)) __rmul__ = __mul__ @@ -490,7 +490,10 @@ def count(kernel, set): except AttributeError: pass - count = 0 + count = isl.PwQPolynomial.zero( + set.space + .drop_dims(dim_type.set, 0, set.dim(dim_type.set)) + .add_dims(dim_type.set, 1)) set = set.make_disjoint() diff --git a/loopy/symbolic.py b/loopy/symbolic.py index cade5e5e060b26a8c6d44965e61500baa31e5c08..cef32851f7578c946a7e26f672dbcceb588a9edd 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1286,6 +1286,12 @@ class AccessRangeMapper(WalkMapper): self.access_range = self.access_range | access_range + def map_linear_subscript(self, expr, inames): + self.rec(expr.index, inames) + + if expr.aggregate.name == self.arg_name: + self.bad_subscripts.append(expr) + def map_reduction(self, expr, inames): return WalkMapper.map_reduction(self, expr, inames | set(expr.inames)) diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 3ec3a50b11f72a2975ac4366d495326bfcb69b37..eb39539b9c489320b227da7c7397c0748a704159 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -156,7 +156,7 @@ class ASTBuilderBase(object): def generate_top_of_body(self, codegen_state): return [] - def get_temporary_decls(self, codegen_state): + def get_temporary_decls(self, codegen_state, schedule_index): raise NotImplementedError def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): @@ -239,7 +239,7 @@ class DummyHostASTBuilder(ASTBuilderBase): schedule_index): return None - def get_temporary_decls(self, codegen_state): + def get_temporary_decls(self, codegen_state, schedule_index): return [] def get_expression_to_code_mapper(self, codegen_state): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 15156f2a13ca1cb60d8b64471852e253b59a8c2c..11e93df8d4f912a73b4064ba17404ec27e6356cc 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -236,7 +236,7 @@ class CASTBuilder(ASTBuilderBase): [self.idi_to_cgen_declarator(codegen_state.kernel, idi) for idi in codegen_state.implemented_data_info]) - def get_temporary_decls(self, codegen_state): + def get_temporary_decls(self, codegen_state, schedule_index): from loopy.kernel.data import temp_var_scope kernel = codegen_state.kernel @@ -264,7 +264,7 @@ class CASTBuilder(ASTBuilderBase): temp_decls.append( self.wrap_temporary_decl( self.get_temporary_decl( - kernel, tv, idi), tv.scope)) + kernel, schedule_index, tv, idi), tv.scope)) else: offset = 0 @@ -349,7 +349,7 @@ class CASTBuilder(ASTBuilderBase): return ExpressionToCMapper( codegen_state, fortran_abi=self.target.fortran_abi) - def get_temporary_decl(self, knl, temp_var, decl_info): + def get_temporary_decl(self, knl, schedule_index, temp_var, decl_info): temp_var_decl = POD(self, decl_info.dtype, decl_info.name) if decl_info.shape: @@ -402,8 +402,9 @@ class CASTBuilder(ASTBuilderBase): if isinstance(func_id, Variable): func_id = func_id.name - assignee_var_descriptors = [codegen_state.kernel.get_var_descriptor(a) - for a, _ in insn.assignees_and_indices()] + assignee_var_descriptors = [ + codegen_state.kernel.get_var_descriptor(a) + for a in insn.assignee_var_names()] par_dtypes = tuple(ecm.infer_type(par) for par in parameters) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 896ea9158223435e3bef933818fbf3bc51a424b4..7f19bdbdf838bcf43b66eb9c1c4fbf58699634a9 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -74,12 +74,19 @@ class ExprToISPCMapper(ExpressionToCMapper): "for constant '%s'" % expr) def map_variable(self, expr, enclosing_prec, type_context): - if expr.name in self.kernel.temporary_variables: - gsize, lsize = self.kernel.get_grid_sizes_as_exprs() + tv = self.kernel.temporary_variables.get(expr.name) + + from loopy.kernel.data import temp_var_scope + if tv is not None and tv.scope == temp_var_scope.PRIVATE: + # FIXME: This is a pretty coarse way of deciding what + # private temporaries get duplicated. Refine? (See also + # below in decl generation) + gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs() if lsize: return "%s[programIndex]" % expr.name else: return expr.name + else: return super(ExprToISPCMapper, self).map_variable( expr, enclosing_prec, type_context) @@ -291,7 +298,7 @@ class ISPCASTBuilder(CASTBuilder): else: raise LoopyError("unknown barrier kind") - def get_temporary_decl(self, knl, temp_var, decl_info): + def get_temporary_decl(self, knl, sched_index, temp_var, decl_info): from loopy.target.c import POD # uses the correct complex type temp_var_decl = POD(self, decl_info.dtype, decl_info.name) @@ -299,7 +306,10 @@ class ISPCASTBuilder(CASTBuilder): from loopy.kernel.data import temp_var_scope if temp_var.scope == temp_var_scope.PRIVATE: - gsize, lsize = knl.get_grid_sizes_as_exprs() + # FIXME: This is a pretty coarse way of deciding what + # private temporaries get duplicated. Refine? (See also + # above in expr to code mapper) + _, lsize = knl.get_grid_size_upper_bounds_as_exprs() shape = lsize + shape if shape: diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index c8f3b6b9e9dbd827e11b35af7189cd0c49b5bba3..fe38f9c7eb1a6a6cf30a04c4f34fad232e653e13 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -556,8 +556,10 @@ class OpenCLCASTBuilder(CASTBuilder): cast_str = "(%s %s *) " % (var_kind, ctype) return Block([ - POD(self, NumpyType(lhs_dtype.dtype), old_val_var), - POD(self, NumpyType(lhs_dtype.dtype), new_val_var), + POD(self, NumpyType(lhs_dtype.dtype, target=self.target), + old_val_var), + POD(self, NumpyType(lhs_dtype.dtype, target=self.target), + new_val_var), DoWhile( "%(func_name)s(" "%(cast_str)s&(%(lhs_expr)s), " diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index c36d78c4d78098217f2005184c2a5bdde2f59918..b570cf4ab7e4ed543213b73716b6271e61e6b69b 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -611,7 +611,7 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): # no such thing in Python return None - def get_temporary_decls(self, codegen_state): + def get_temporary_decls(self, codegen_state, schedule_state): from genpy import Assign, Comment, Line def alloc_nbytes(tv): diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index 2c50b3f11a34ced81d6a93eb467482110f495f2f..181f5241137cc1abe3256506f3ebd59327e4c4f8 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -136,9 +136,7 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): raise ValueError("unexpected type of access_expr") def is_assignee(insn): - return any( - lhs == var_name - for lhs, sbscript in insn.assignees_and_indices()) + return var_name in insn.assignee_var_names() def iterate_as(cls, expr): if isinstance(expr, cls): @@ -237,7 +235,7 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): new_insns.append(insn) continue - (_, index_key), = insn.assignees_and_indices() + index_key = extract_index_key(insn.assignee) lhs = insn.assignee rhs = insn.expression diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index 967e14de692ee96b0c01d2ea5bcf8b411890038b..97c15c6f234211204db8366c5e69462cec5c2e77 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -143,9 +143,17 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch", bvc = _BatchVariableChanger(rule_mapping_context, knl, batch_varying_args, batch_iname_expr, sequential=sequential) - return rule_mapping_context.finish_kernel( + kernel = rule_mapping_context.finish_kernel( bvc.map_kernel(knl)) + batch_iname_set = frozenset([batch_iname]) + kernel = kernel.copy( + instructions=[ + insn.copy(forced_iname_deps=insn.forced_iname_deps | batch_iname_set) + for insn in kernel.instructions]) + + return kernel + # }}} # vim: foldmethod=marker diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index a7d22b2d0f7e376086b3a42ac2f186e2805fda26..0c25b63930f99ebcb8b2b817367ec1572b15a1d4 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -239,8 +239,27 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, if not within(kernel, insn.id, ()): continue - for assignee, index in insn.assignees_and_indices(): - if assignee == var_name: + from pymbolic.primitives import Variable, Subscript + from loopy.symbolic import LinearSubscript + + for assignee in insn.assignees: + if isinstance(assignee, Variable): + assignee_name = assignee.name + index = () + + elif isinstance(assignee, Subscript): + assignee_name = assignee.aggregate.name + index = assignee.index_tuple + + elif isinstance(assignee, LinearSubscript): + if assignee.aggregate.name == var_name: + raise LoopyError("buffer_array may not be applied in the " + "presence of linear write indexing into '%s'" % var_name) + + else: + raise LoopyError("invalid lvalue '%s'" % assignee) + + if assignee_name == var_name: within_inames.update( (get_dependencies(index) & kernel.all_inames()) - buffer_inames_set) @@ -381,7 +400,9 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, init_instruction = Assignment(id=init_insn_id, assignee=buf_var_init, expression=init_expression, - forced_iname_deps=frozenset(within_inames), + forced_iname_deps=( + frozenset(within_inames) + | frozenset(non1_init_inames)), depends_on=frozenset(), depends_on_is_final=True) @@ -396,8 +417,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, did_write = False for insn_id in aar.modified_insn_ids: insn = kernel.id_to_insn[insn_id] - if any(assignee_name == buf_var_name - for assignee_name, _ in insn.assignees_and_indices()): + if buf_var_name in insn.assignee_var_names(): did_write = True # {{{ add init_insn_id to depends_on @@ -457,9 +477,12 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, store_instruction = Assignment( id=kernel.make_unique_instruction_id(based_on="store_"+var_name), depends_on=frozenset(aar.modified_insn_ids), + no_sync_with=frozenset([init_insn_id]), assignee=store_target, expression=store_expression, - forced_iname_deps=frozenset(within_inames)) + forced_iname_deps=( + frozenset(within_inames) + | frozenset(non1_store_inames))) else: did_write = False diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 3db96712eab21d9ad4efed6fc0a3dde08d1b9f65..89ad1cdb52d69cf1bee981bc93679fb76883dc54 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -139,7 +139,8 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, temporary_name=None, temporary_scope=None, temporary_is_local=None, footprint_subscripts=None, - fetch_bounding_box=False): + fetch_bounding_box=False, + fetch_outer_inames=None): """Prefetch all accesses to the variable *var_name*, with all accesses being swept through *sweep_inames*. @@ -153,6 +154,9 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, such as those occurring in dimension splits are recorded and also applied to these indices. + :arg fetch_outer_inames: The inames within which the fetch + instruction is nested. If *None*, make an educated guess. + This function combines :func:`extract_subst` and :func:`precompute`. """ @@ -246,7 +250,8 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, default_tag=default_tag, dtype=arg.dtype, fetch_bounding_box=fetch_bounding_box, temporary_name=temporary_name, - temporary_scope=temporary_scope, temporary_is_local=temporary_is_local) + temporary_scope=temporary_scope, temporary_is_local=temporary_is_local, + precompute_outer_inames=fetch_outer_inames) # {{{ remove inames that were temporarily added by slice sweeps @@ -571,4 +576,69 @@ def set_temporary_scope(kernel, temp_var_names, scope): # }}} + +# {{{ reduction_arg_to_subst_rule + +def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=None): + if isinstance(inames, str): + inames = [s.strip() for s in inames.split(",")] + + inames_set = frozenset(inames) + + substs = knl.substitutions.copy() + + var_name_gen = knl.get_var_name_generator() + + def map_reduction(expr, rec, nresults=1): + if frozenset(expr.inames) != inames_set: + return type(expr)( + operation=expr.operation, + inames=expr.inames, + expr=rec(expr.expr), + allow_simultaneous=expr.allow_simultaneous) + + if subst_rule_name is None: + subst_rule_prefix = "red_%s_arg" % "_".join(inames) + my_subst_rule_name = var_name_gen(subst_rule_prefix) + else: + my_subst_rule_name = subst_rule_name + + if my_subst_rule_name in substs: + raise LoopyError("substitution rule '%s' already exists" + % my_subst_rule_name) + + from loopy.kernel.data import SubstitutionRule + substs[my_subst_rule_name] = SubstitutionRule( + name=my_subst_rule_name, + arguments=tuple(inames), + expression=expr.expr) + + from pymbolic import var + iname_vars = [var(iname) for iname in inames] + + return type(expr)( + operation=expr.operation, + inames=expr.inames, + expr=var(my_subst_rule_name)(*iname_vars), + allow_simultaneous=expr.allow_simultaneous) + + from loopy.symbolic import ReductionCallbackMapper + cb_mapper = ReductionCallbackMapper(map_reduction) + + from loopy.kernel.data import MultiAssignmentBase + + new_insns = [] + for insn in knl.instructions: + if not isinstance(insn, MultiAssignmentBase): + new_insns.append(insn) + else: + new_insns.append(insn.copy(expression=cb_mapper(insn.expression))) + + return knl.copy( + instructions=new_insns, + substitutions=substs) + +# }}} + + # vim: foldmethod=marker diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index 6e9eb45bb58b7750c075ee7b79fb024e4cf4f24a..a8c3b5849d90466d1163f4cc47777e1253cae079 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -295,13 +295,24 @@ class DifferentiationContext(object): if not diff_expr: return None - (_, lhs_ind), = orig_writer_insn.assignees_and_indices() + assert isinstance(orig_writer_insn, lp.Assignment) + if isinstance(orig_writer_insn.assignee, p.Subscript): + lhs_ind = orig_writer_insn.assignee.index_tuple + elif isinstance(orig_writer_insn.assignee, p.Variable): + lhs_ind = () + else: + raise LoopyError( + "Unrecognized LHS type in differentiation: %s" + % type(orig_writer_insn.assignee).__name__) + new_insn_id = self.generate_instruction_id() insn = lp.Assignment( id=new_insn_id, assignee=var(new_var_name)[ lhs_ind + diff_iname_exprs], - expression=diff_expr) + expression=diff_expr, + forced_iname_deps=( + orig_writer_insn.forced_iname_deps | frozenset(diff_inames))) self.new_instructions.append(insn) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index e72897a0427c9818be26d42cda298f8d4f71bca9..4c3cd0a6988b269139eceb7028423407765282f6 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -48,8 +48,6 @@ __doc__ = """ .. autofunction:: duplicate_inames -.. undocumented .. autofunction:: link_inames - .. autofunction:: rename_iname .. autofunction:: remove_unused_inames @@ -890,114 +888,6 @@ def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None): # }}} -# {{{ link inames - -def link_inames(knl, inames, new_iname, within=None, tag=None): - # {{{ normalize arguments - - if isinstance(inames, str): - inames = inames.split(",") - - var_name_gen = knl.get_var_name_generator() - new_iname = var_name_gen(new_iname) - - # }}} - - # {{{ ensure that each iname is used at most once in each instruction - - inames_set = set(inames) - - if 0: - # FIXME! - for insn in knl.instructions: - insn_inames = knl.insn_inames(insn.id) | insn.reduction_inames() - - if len(insn_inames & inames_set) > 1: - raise LoopyError("To-be-linked inames '%s' are used in " - "instruction '%s'. No more than one such iname can " - "be used in one instruction." - % (", ".join(insn_inames & inames_set), insn.id)) - - # }}} - - from loopy.kernel.tools import DomainChanger - domch = DomainChanger(knl, tuple(inames)) - - # {{{ ensure that projections are identical - - unrelated_dom_inames = list( - set(domch.domain.get_var_names(dim_type.set)) - - inames_set) - - domain = domch.domain - - # move all inames to be linked to end to prevent shuffly confusion - for iname in inames: - dt, index = domain.get_var_dict()[iname] - assert dt == dim_type.set - - # move to tail of param dim_type - domain = domain.move_dims( - dim_type.param, domain.dim(dim_type.param), - dt, index, 1) - # move to tail of set dim_type - domain = domain.move_dims( - dim_type.set, domain.dim(dim_type.set), - dim_type.param, domain.dim(dim_type.param)-1, 1) - - projections = [ - domch.domain.project_out_except( - unrelated_dom_inames + [iname], [dim_type.set]) - for iname in inames] - - all_equal = True - first_proj = projections[0] - for proj in projections[1:]: - all_equal = all_equal and (proj <= first_proj and first_proj <= proj) - - if not all_equal: - raise LoopyError("Inames cannot be linked because their domain " - "constraints are not the same.") - - del domain # messed up for testing, do not use - - # }}} - - # change the domain - from loopy.isl_helpers import duplicate_axes - knl = knl.copy( - domains=domch.get_domains_with( - duplicate_axes(domch.domain, [inames[0]], [new_iname]))) - - # {{{ change the code - - from pymbolic import var - subst_dict = dict((iname, var(new_iname)) for iname in inames) - - from loopy.match import parse_stack_match - within = parse_stack_match(within) - - from pymbolic.mapper.substitutor import make_subst_func - rule_mapping_context = SubstitutionRuleMappingContext( - knl.substitutions, var_name_gen) - ijoin = RuleAwareSubstitutionMapper(rule_mapping_context, - make_subst_func(subst_dict), within) - - knl = rule_mapping_context.finish_kernel( - ijoin.map_kernel(knl)) - - # }}} - - knl = remove_unused_inames(knl, inames) - - if tag is not None: - knl = tag_inames(knl, {new_iname: tag}) - - return knl - -# }}} - - # {{{ remove unused inames def remove_unused_inames(knl, inames=None): @@ -1252,8 +1142,8 @@ def affine_map_inames(kernel, old_inames, new_inames, equations): # {{{ change domains - new_inames_set = set(new_inames) - old_inames_set = set(old_inames) + new_inames_set = frozenset(new_inames) + old_inames_set = frozenset(old_inames) new_domains = [] for idom, dom in enumerate(kernel.domains): @@ -1339,7 +1229,26 @@ def affine_map_inames(kernel, old_inames, new_inames, equations): # }}} - return kernel.copy(domains=new_domains) + # {{{ switch iname refs in instructions + + def fix_iname_set(insn_id, inames): + if old_inames_set <= inames: + return (inames - old_inames_set) | new_inames_set + elif old_inames_set & inames: + raise LoopyError("instruction '%s' uses only a part (%s), not all, " + "of the old inames" + % (insn_id, ", ".join(old_inames_set & inames))) + else: + return inames + + new_instructions = [ + insn.copy(forced_iname_deps=fix_iname_set( + insn.id, insn.forced_iname_deps)) + for insn in kernel.instructions] + + # }}} + + return kernel.copy(domains=new_domains, instructions=new_instructions) # }}} diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 8b2fa370ad4214cbe16caec61ff45b684ed0a889..e73e5c33c8703a4cdbb5fc20feca9caabeb077c1 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -27,7 +27,7 @@ import six # noqa from loopy.diagnostic import LoopyError -# {{{ instruction processing +# {{{ find_instructions def find_instructions(kernel, insn_match): from loopy.match import parse_match @@ -35,6 +35,11 @@ def find_instructions(kernel, insn_match): return [insn for insn in kernel.instructions if match(kernel, insn)] +# }}} + + +# {{{ map_instructions + def map_instructions(kernel, insn_match, f): from loopy.match import parse_match match = parse_match(insn_match) @@ -49,6 +54,10 @@ def map_instructions(kernel, insn_match, f): return kernel.copy(instructions=new_insns) +# }}} + + +# {{{ set_instruction_priority def set_instruction_priority(kernel, insn_match, priority): """Set the priority of instructions matching *insn_match* to *priority*. @@ -62,6 +71,10 @@ def set_instruction_priority(kernel, insn_match, priority): return map_instructions(kernel, insn_match, set_prio) +# }}} + + +# {{{ add_dependency def add_dependency(kernel, insn_match, dependency): """Add the instruction dependency *dependency* to the instructions matched @@ -87,6 +100,10 @@ def add_dependency(kernel, insn_match, dependency): return map_instructions(kernel, insn_match, add_dep) +# }}} + + +# {{{ remove_instructions def remove_instructions(kernel, insn_ids): """Return a new kernel with instructions in *insn_ids* removed. @@ -124,6 +141,10 @@ def remove_instructions(kernel, insn_ids): return kernel.copy( instructions=new_insns) +# }}} + + +# {{{ replace_instruction_ids def replace_instruction_ids(kernel, replacements): new_insns = [] diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index fd6f33efc95d568fbc553c761c64508f42c4832c..0772b3dc02e7f1bbd83ccfabd485497d93b57601 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -239,6 +239,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): def precompute(kernel, subst_use, sweep_inames=[], within=None, storage_axes=None, temporary_name=None, precompute_inames=None, + precompute_outer_inames=None, storage_axis_to_tag={}, default_tag="l.auto", dtype=None, fetch_bounding_box=False, temporary_scope=None, temporary_is_local=None, @@ -307,6 +308,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, tuple, in which case names will be automatically created. May also equivalently be a comma-separated string. + :arg precompute_outer_inames: The inames within which the compute + instruction is nested. If *None*, make an educated guess. + :arg compute_insn_id: The ID of the instruction performing the precomputation. If `storage_axes` is not specified, it defaults to the arrangement @@ -480,8 +484,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, from loopy.symbolic import SubstitutionRuleExpander submap = SubstitutionRuleExpander(kernel.substitutions) - value_inames = get_dependencies( - submap(subst.expression) + value_inames = ( + get_dependencies(submap(subst.expression)) + - frozenset(subst.arguments) ) & kernel.all_inames() if value_inames - expanding_usage_arg_deps < extra_storage_axes: raise RuntimeError("unreferenced sweep inames specified: " @@ -728,8 +733,8 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, assignee = var(temporary_name) if non1_storage_axis_names: - assignee = assignee.index( - tuple(var(iname) for iname in non1_storage_axis_names)) + assignee = assignee[ + tuple(var(iname) for iname in non1_storage_axis_names)] # {{{ process substitutions on compute instruction @@ -764,7 +769,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, compute_insn = Assignment( id=compute_insn_id, assignee=assignee, - expression=compute_expression) + expression=compute_expression, + # forced_iname_deps determined below + ) # }}} @@ -784,6 +791,29 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # }}} + # {{{ determine inames for compute insn + + if precompute_outer_inames is None: + from loopy.kernel.tools import guess_iname_deps_based_on_var_use + precompute_outer_inames = ( + frozenset(non1_storage_axis_names) + | frozenset( + (expanding_usage_arg_deps | value_inames) + - sweep_inames_set) + | guess_iname_deps_based_on_var_use(kernel, compute_insn)) + else: + if not isinstance(precompute_outer_inames, frozenset): + raise TypeError("precompute_outer_inames must be a frozenset") + + kernel = kernel.copy( + instructions=[ + insn.copy(forced_iname_deps=precompute_outer_inames) + if insn.id == compute_insn_id + else insn + for insn in kernel.instructions]) + + # }}} + # {{{ set up temp variable import loopy as lp diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 63e70f8e386bfc6ecdddb4b8661e9f975206725e..27e98383ee7211643aecdafebc9bf63988c55e0c 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -391,11 +391,21 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, for def_id, subst_name in six.iteritems(tts.definition_insn_id_to_subst_name): def_insn = kernel.id_to_insn[def_id] - (_, indices), = def_insn.assignees_and_indices() + from loopy.kernel.data import Assignment + assert isinstance(def_insn, Assignment) + + from pymbolic.primitives import Variable, Subscript + if isinstance(def_insn.assignee, Subscript): + indices = def_insn.assignee.index_tuple + elif isinstance(def_insn.assignee, Variable): + indices = () + else: + raise LoopyError( + "Unrecognized LHS type: %s" + % type(def_insn.assignee).__name__) arguments = [] - from pymbolic.primitives import Variable for i in indices: if not isinstance(i, Variable): raise LoopyError("In defining instruction '%s': " @@ -460,6 +470,9 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, # {{{ expand_subst def expand_subst(kernel, within=None): + if not kernel.substitutions: + return kernel + logger.debug("%s: expand subst" % kernel.name) from loopy.symbolic import RuleAwareSubstitutionRuleExpander diff --git a/loopy/version.py b/loopy/version.py index cfaa4b9a01d4eeb57b4cdf1b64a114e3b8c3bb33..0fcb9c04d1439378a56f4f47cfd6b1a653a1b5c1 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v33-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v36-islpy%s" % _islpy_version diff --git a/test/test_apps.py b/test/test_apps.py new file mode 100644 index 0000000000000000000000000000000000000000..6dfc6dc0c5edc22e8ccc8fdb791f4f7013901de5 --- /dev/null +++ b/test/test_apps.py @@ -0,0 +1,416 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import sys +import numpy as np +import loopy as lp +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa +import pytest + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +__all__ = [ + "pytest_generate_tests", + "cl" # 'cl.create_some_context' + ] + + +# {{{ convolutions + +def test_convolution(ctx_factory): + ctx = ctx_factory() + + dtype = np.float32 + + knl = lp.make_kernel( + "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \ + -f_w <= f_x,f_y <= f_w \ + and 0 <= im_x < im_w and 0 <= im_y < im_h \ + and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \ + }", + """ + out[iimg, ifeat, im_x, im_y] = sum((f_x, f_y, icolor), \ + img[iimg, f_w+im_x-f_x, f_w+im_y-f_y, icolor] \ + * f[ifeat, f_w+f_x, f_w+f_y, icolor]) + """, + [ + lp.GlobalArg("f", dtype, shape=lp.auto), + lp.GlobalArg("img", dtype, shape=lp.auto), + lp.GlobalArg("out", dtype, shape=lp.auto), + "..." + ], + assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0", + options="annotate_inames") + + f_w = 3 + + knl = lp.fix_parameters(knl, f_w=f_w, ncolors=3) + + ref_knl = knl + + def variant_0(knl): + #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0") + knl = lp.set_loop_priority(knl, "iimg,im_x,im_y,ifeat,f_x,f_y") + return knl + + def variant_1(knl): + knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0") + knl = lp.set_loop_priority(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y") + return knl + + def variant_2(knl): + knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0") + knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1") + knl = lp.tag_inames(knl, dict(ifeat="g.2")) + knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]") + knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y") + return knl + + for variant in [ + #variant_0, + #variant_1, + variant_2 + ]: + lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), + parameters=dict( + im_w=128, im_h=128, f_w=f_w, + nfeats=3, nimgs=3 + )) + + +def test_convolution_with_nonzero_base(ctx_factory): + # This is kept alive as a test for domains that don't start at zero. + # These are a bad idea for split_iname, which places its origin at zero + # and therefore produces a first block that is odd-sized. + # + # Therefore, for real tests, check test_convolution further up. + + ctx = ctx_factory() + + dtype = np.float32 + + knl = lp.make_kernel( + "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \ + -f_w <= f_x,f_y <= f_w \ + and f_w <= im_x < im_w-f_w and f_w <= im_y < im_h-f_w \ + and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \ + }", + """ + out[iimg, ifeat, im_x-f_w, im_y-f_w] = sum((f_x, f_y, icolor), \ + img[iimg, im_x-f_x, im_y-f_y, icolor] \ + * f[ifeat, f_w+f_x, f_w+f_y, icolor]) + """, + [ + lp.GlobalArg("f", dtype, shape=lp.auto), + lp.GlobalArg("img", dtype, shape=lp.auto), + lp.GlobalArg("out", dtype, shape=lp.auto), + "..." + ], + assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0", + options="annotate_inames") + + knl = lp.fix_parameters(knl, ncolors=3) + + ref_knl = knl + + f_w = 3 + + def variant_0(knl): + #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0") + knl = lp.set_loop_priority(knl, "iimg,im_x,im_y,ifeat,f_x,f_y") + return knl + + def variant_1(knl): + knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0") + knl = lp.set_loop_priority(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y") + return knl + + for variant in [ + variant_0, + variant_1, + ]: + lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), + parameters=dict( + im_w=128, im_h=128, f_w=f_w, + nfeats=12, nimgs=17 + )) + +# }}} + + +def test_rob_stroud_bernstein(ctx_factory): + ctx = ctx_factory() + + # NOTE: tmp would have to be zero-filled beforehand + + knl = lp.make_kernel( + "{[el, i2, alpha1,alpha2]: \ + 0 <= el < nels and \ + 0 <= i2 < nqp1d and \ + 0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }", + """ + for el,i2 + <> xi = qpts[1, i2] + <> s = 1-xi + <> r = xi/s + <> aind = 0 {id=aind_init} + + for alpha1 + <> w = s**(deg-alpha1) {id=init_w} + + for alpha2 + tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \ + {id=write_tmp} + w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \ + {id=update_w,dep=init_w:write_tmp} + aind = aind + 1 \ + {id=aind_incr,dep=aind_init:write_tmp:update_w} + end + end + end + """, + [ + # Must declare coeffs to have "no" shape, to keep loopy + # from trying to figure it out the shape automatically. + + lp.GlobalArg("coeffs", None, shape=None), + "..." + ], + assumptions="deg>=0 and nels>=1" + ) + + knl = lp.fix_parameters(knl, nqp1d=7, deg=4) + knl = lp.split_iname(knl, "el", 16, inner_tag="l.0") + knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp", + slabs=(0, 1)) + knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr")) + + print(lp.CompiledKernel(ctx, knl).get_highlighted_code( + dict( + qpts=np.float32, + coeffs=np.float32, + tmp=np.float32, + ))) + + +def test_rob_stroud_bernstein_full(ctx_factory): + #logging.basicConfig(level=logging.DEBUG) + ctx = ctx_factory() + + # NOTE: result would have to be zero-filled beforehand + + knl = lp.make_kernel( + "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \ + 0 <= el < nels and \ + 0 <= i2 < nqp1d and \ + 0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 and\ + \ + 0 <= i1_2 < nqp1d and \ + 0 <= alpha1_2 <= deg and \ + 0 <= i2_2 < nqp1d \ + }", + """ + for el + for i2 + <> xi = qpts[1, i2] + <> s = 1-xi + <> r = xi/s + <> aind = 0 {id=aind_init} + + for alpha1 + <> w = s**(deg-alpha1) {id=init_w} + + <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \ + {id=write_tmp} + for alpha2 + w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \ + {id=update_w,dep=init_w:write_tmp} + aind = aind + 1 \ + {id=aind_incr,dep=aind_init:write_tmp:update_w} + end + end + end + + for i1_2 + <> xi2 = qpts[0, i1_2] {dep=aind_incr} + <> s2 = 1-xi2 + <> r2 = xi2/s2 + <> w2 = s2**deg + + for alpha1_2 + for i2_2 + result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \ + w2 * tmp[alpha1_2, i2_2] + end + + w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2) + end + end + end + """, + [ + # Must declare coeffs to have "no" shape, to keep loopy + # from trying to figure it out the shape automatically. + + lp.GlobalArg("coeffs", None, shape=None), + "..." + ], + assumptions="deg>=0 and nels>=1" + ) + + knl = lp.fix_parameters(knl, nqp1d=7, deg=4) + + if 0: + knl = lp.split_iname(knl, "el", 16, inner_tag="l.0") + knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp", + slabs=(0, 1)) + knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr")) + + from pickle import dumps, loads + knl = loads(dumps(knl)) + + knl = lp.CompiledKernel(ctx, knl).get_highlighted_code( + dict( + qpts=np.float32, + tmp=np.float32, + coeffs=np.float32, + result=np.float32, + )) + print(knl) + + +def test_stencil(ctx_factory): + ctx = ctx_factory() + + # n=32 causes corner case behavior in size calculations for temprorary (a + # non-unifiable, two-constant-segments PwAff as the base index) + + n = 256 + knl = lp.make_kernel( + "{[i,j]: 0<= i,j < %d}" % n, + [ + "a_offset(ii, jj) := a[ii+1, jj+1]", + "z[i,j] = -2*a_offset(i,j)" + " + a_offset(i,j-1)" + " + a_offset(i,j+1)" + " + a_offset(i-1,j)" + " + a_offset(i+1,j)" + ], + [ + lp.GlobalArg("a", np.float32, shape=(n+2, n+2,)), + lp.GlobalArg("z", np.float32, shape=(n+2, n+2,)) + ]) + + ref_knl = knl + + def variant_1(knl): + knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") + knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") + knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"]) + knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"]) + return knl + + def variant_2(knl): + knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") + knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") + knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"], + fetch_bounding_box=True) + knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"]) + return knl + + for variant in [ + #variant_1, + variant_2, + ]: + lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), + print_ref_code=False, + op_count=[n*n], op_label=["cells"]) + + +def test_stencil_with_overfetch(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i,j]: 0<= i,j < n}", + [ + "a_offset(ii, jj) := a[ii+2, jj+2]", + "z[i,j] = -2*a_offset(i,j)" + " + a_offset(i,j-1)" + " + a_offset(i,j+1)" + " + a_offset(i-1,j)" + " + a_offset(i+1,j)" + + " + a_offset(i,j-2)" + " + a_offset(i,j+2)" + " + a_offset(i-2,j)" + " + a_offset(i+2,j)" + ], + assumptions="n>=1") + + if ctx.devices[0].platform.name == "Portable Computing Language": + # https://github.com/pocl/pocl/issues/205 + pytest.skip("takes very long to compile on pocl") + + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) + + ref_knl = knl + + def variant_overfetch(knl): + knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1", + slabs=(1, 1)) + knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0", + slabs=(1, 1)) + knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"], + fetch_bounding_box=True) + knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"]) + return knl + + for variant in [variant_overfetch]: + n = 200 + lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), + print_ref_code=False, + op_count=[n*n], parameters=dict(n=n), op_label=["cells"]) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_dg.py b/test/test_dg.py index fafef86c35211183ebdaeb75acf2b664a36586a0..d65c68ed4c729582d511064d6495535efcf7a9a4 100644 --- a/test/test_dg.py +++ b/test/test_dg.py @@ -72,8 +72,9 @@ def test_dg_volume(ctx_factory): order=order), lp.ValueArg("K", np.int32, approximately=1000), ], - name="dg_volume", assumptions="K>=1", - defines=dict(Np=Np)) + name="dg_volume", assumptions="K>=1") + + knl = lp.fix_parameters(knl, Np=Np) seq_knl = knl @@ -218,9 +219,10 @@ def no_test_dg_surface(ctx_factory): lp.GlobalArg("LIFT", dtype, shape="Np, NfpNfaces", order="C"), lp.ValueArg("K", np.int32, approximately=1000), ], - name="dg_surface", assumptions="K>=1", - defines=dict(Np=Np, Nfp=Nfp, NfpNfaces=Nfaces*Nfp, nsurf_dofs=K*Nfp), - ) + name="dg_surface", assumptions="K>=1") + + knl = lp.fix_parameters(knl, + Np=Np, Nfp=Nfp, NfpNfaces=Nfaces*Nfp, nsurf_dofs=K*Nfp) seq_knl = knl diff --git a/test/test_domain.py b/test/test_domain.py new file mode 100644 index 0000000000000000000000000000000000000000..0631a08796eeda73376afcd1cd03a737a33241f6 --- /dev/null +++ b/test/test_domain.py @@ -0,0 +1,375 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import six +from six.moves import range + +import sys +import numpy as np +import loopy as lp +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa +import pytest + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +__all__ = [ + "pytest_generate_tests", + "cl" # 'cl.create_some_context' + ] + + +def test_assume(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + "a[i] = a[i] + 1", + [lp.GlobalArg("a", np.float32, shape="n"), "..."]) + + knl = lp.split_iname(knl, "i", 16) + knl = lp.set_loop_priority(knl, "i_outer,i_inner") + knl = lp.assume(knl, "n mod 16 = 0") + knl = lp.assume(knl, "n > 10") + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + kernel_gen = lp.generate_loop_schedules(knl) + + for gen_knl in kernel_gen: + print(gen_knl) + compiled = lp.CompiledKernel(ctx, gen_knl) + print(compiled.get_code()) + assert "if" not in compiled.get_code() + + +def test_divisibility_assumption(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "[n] -> {[i]: 0<=i<n}", + [ + "b[i] = 2*a[i]" + ], + [ + lp.GlobalArg("a", np.float32, shape=("n",)), + lp.GlobalArg("b", np.float32, shape=("n",)), + lp.ValueArg("n", np.int32), + ], + assumptions="n>=1 and (exists zz: n = 16*zz)") + + ref_knl = knl + + knl = lp.split_iname(knl, "i", 16) + + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + for k in lp.generate_loop_schedules(knl): + code = lp.generate_code(k) + assert "if" not in code + + lp.auto_test_vs_ref(ref_knl, ctx, knl, + parameters={"n": 16**3}) + + +def test_eq_constraint(ctx_factory): + logging.basicConfig(level=logging.INFO) + + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i,j]: 0<= i,j < 32}", + [ + "a[i] = b[i]" + ], + [ + lp.GlobalArg("a", np.float32, shape=(1000,)), + lp.GlobalArg("b", np.float32, shape=(1000,)) + ]) + + knl = lp.split_iname(knl, "i", 16, outer_tag="g.0") + knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0") + + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + kernel_gen = lp.generate_loop_schedules(knl) + + for knl in kernel_gen: + print(lp.generate_code(knl)) + + +def test_dependent_loop_bounds(ctx_factory): + dtype = np.dtype(np.float32) + ctx = ctx_factory() + + knl = lp.make_kernel( + [ + "{[i]: 0<=i<n}", + "{[jj]: 0<=jj<row_len}", + ], + [ + "<> row_len = a_rowstarts[i+1] - a_rowstarts[i]", + "a_sum[i] = sum(jj, a_values[[a_rowstarts[i]+jj]])", + ], + [ + lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto), + lp.GlobalArg("a_indices", np.int32, shape=lp.auto), + lp.GlobalArg("a_values", dtype), + lp.GlobalArg("a_sum", dtype, shape=lp.auto), + lp.ValueArg("n", np.int32), + ], + assumptions="n>=1 and row_len>=1") + + cknl = lp.CompiledKernel(ctx, knl) + print("---------------------------------------------------") + print(cknl.get_highlighted_code()) + print("---------------------------------------------------") + + +def test_dependent_loop_bounds_2(ctx_factory): + dtype = np.dtype(np.float32) + ctx = ctx_factory() + + knl = lp.make_kernel( + [ + "{[i]: 0<=i<n}", + "{[jj]: 0<=jj<row_len}", + ], + [ + "<> row_start = a_rowstarts[i]", + "<> row_len = a_rowstarts[i+1] - row_start", + "ax[i] = sum(jj, a_values[[row_start+jj]])", + ], + [ + lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto), + lp.GlobalArg("a_indices", np.int32, shape=lp.auto), + lp.GlobalArg("a_values", dtype, strides=(1,)), + lp.GlobalArg("ax", dtype, shape=lp.auto), + lp.ValueArg("n", np.int32), + ], + assumptions="n>=1 and row_len>=1") + + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", + inner_tag="l.0") + cknl = lp.CompiledKernel(ctx, knl) + print("---------------------------------------------------") + print(cknl.get_highlighted_code()) + print("---------------------------------------------------") + + +def test_dependent_loop_bounds_3(ctx_factory): + # The point of this test is that it shows a dependency between + # domains that is exclusively mediated by the row_len temporary. + # It also makes sure that row_len gets read before any + # conditionals use it. + + dtype = np.dtype(np.float32) + ctx = ctx_factory() + + knl = lp.make_kernel( + [ + "{[i]: 0<=i<n}", + "{[jj]: 0<=jj<row_len}", + ], + [ + "<> row_len = a_row_lengths[i]", + "a[i,jj] = 1", + ], + [ + lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto), + lp.GlobalArg("a", dtype, shape=("n,n"), order="C"), + lp.ValueArg("n", np.int32), + ]) + + assert knl.parents_per_domain()[1] == 0 + + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", + inner_tag="l.0") + + cknl = lp.CompiledKernel(ctx, knl) + print("---------------------------------------------------") + print(cknl.get_highlighted_code()) + print("---------------------------------------------------") + + knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1", + inner_tag="l.1") + + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + + import pytest + with pytest.raises(RuntimeError): + list(lp.generate_loop_schedules(knl_bad)) + + +def test_independent_multi_domain(ctx_factory): + dtype = np.dtype(np.float32) + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + [ + "{[i]: 0<=i<n}", + "{[j]: 0<=j<n}", + ], + [ + "a[i] = 1", + "b[j] = 2", + ], + [ + lp.GlobalArg("a", dtype, shape=("n"), order="C"), + lp.GlobalArg("b", dtype, shape=("n"), order="C"), + lp.ValueArg("n", np.int32), + ]) + + knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", + inner_tag="l.0") + knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", + inner_tag="l.0") + assert knl.parents_per_domain() == 2*[None] + + n = 50 + cknl = lp.CompiledKernel(ctx, knl) + evt, (a, b) = cknl(queue, n=n, out_host=True) + + assert a.shape == (50,) + assert b.shape == (50,) + assert (a == 1).all() + assert (b == 2).all() + + +def test_equality_constraints(ctx_factory): + dtype = np.float32 + ctx = ctx_factory() + + order = "C" + + n = 10 + + knl = lp.make_kernel([ + "[n] -> {[i,j]: 0<=i,j<n }", + "{[k]: k =i+5 and k < n}", + ], + [ + "a[i,j] = 5 {id=set_all}", + "b[i,k] = 22 {dep=set_all}", + ], + [ + lp.GlobalArg("a,b", dtype, shape="n, n", order=order), + lp.ValueArg("n", np.int32, approximately=1000), + ], + name="equality_constraints", assumptions="n>=1") + + seq_knl = knl + + knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") + knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") + #print(knl) + #print(knl.domains[0].detect_equalities()) + + lp.auto_test_vs_ref(seq_knl, ctx, knl, + parameters=dict(n=n), print_ref_code=True) + + +def test_stride(ctx_factory): + dtype = np.float32 + ctx = ctx_factory() + + order = "C" + + n = 10 + + knl = lp.make_kernel([ + "{[i]: 0<=i<n and (exists l: i = 2*l)}", + ], + [ + "a[i] = 5", + ], + [ + lp.GlobalArg("a", dtype, shape="n", order=order), + lp.ValueArg("n", np.int32, approximately=1000), + ], + assumptions="n>=1") + + seq_knl = knl + + lp.auto_test_vs_ref(seq_knl, ctx, knl, + parameters=dict(n=n)) + + +def test_domain_dependency_via_existentially_quantified_variable(ctx_factory): + dtype = np.float32 + ctx = ctx_factory() + + order = "C" + + n = 10 + + knl = lp.make_kernel([ + "{[i]: 0<=i<n }", + "{[k]: k=i and (exists l: k = 2*l) }", + ], + [ + "a[i] = 5 {id=set}", + "b[k] = 6 {dep=set}", + ], + [ + lp.GlobalArg("a,b", dtype, shape="n", order=order), + lp.ValueArg("n", np.int32, approximately=1000), + ], + assumptions="n>=1") + + seq_knl = knl + + lp.auto_test_vs_ref(seq_knl, ctx, knl, + parameters=dict(n=n)) + + +def test_triangle_domain(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i,j]: 0<=i,j<n and i <= j}", + "a[i,j] = 17", + assumptions="n>=1") + + print(knl) + print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_linalg.py b/test/test_linalg.py index 0e0b59089fe1d3a1c1310bd0834a29ca751b8df0..b7a1e059cb4499fe136f962b5d5fdcd539fc05f1 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -285,7 +285,7 @@ def test_rank_one(ctx_factory): def variant_1(knl): knl = lp.add_prefetch(knl, "a") knl = lp.add_prefetch(knl, "b") - knl = knl.set_loop_priority(knl, ["i", "j"]) + knl = lp.set_loop_priority(knl, ["i", "j"]) return knl def variant_2(knl): @@ -294,7 +294,6 @@ def test_rank_one(ctx_factory): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - knl = knl.set_loop_priority(knl, ["i", "j"]) knl = lp.add_prefetch(knl, "a") knl = lp.add_prefetch(knl, "b") return knl @@ -331,8 +330,7 @@ def test_rank_one(ctx_factory): seq_knl = knl - #for variant in [variant_1, variant_2, variant_3, variant_4]: - for variant in [variant_4]: + for variant in [variant_1, variant_2, variant_3, variant_4]: lp.auto_test_vs_ref(seq_knl, ctx, variant(knl), op_count=[np.dtype(dtype).itemsize*n**2/1e9], op_label=["GBytes"], parameters={"n": n}) @@ -604,7 +602,7 @@ def test_small_batched_matvec(ctx_factory): Np = 36 # noqa knl = lp.make_kernel( - "[K] -> {[i,j,k]: 0<=k<K and 0<= i,j < %d}" % Np, + "{[i,j,k]: 0<=k<K and 0<= i,j < %d}" % Np, [ "result[k, i] = sum(j, d[i, j]*f[k, j])" ], diff --git a/test/test_loopy.py b/test/test_loopy.py index 739a99b8bcd43d8940e8ccbb5ce460d1c6c2b44e..26f4f2faf3b83a24932cad0ac3941696ed3e2636 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -79,23 +79,6 @@ def test_complicated_subst(ctx_factory): assert substs_with_letter == how_many -def test_extract_subst(ctx_factory): - knl = lp.make_kernel( - "{[i]: 0<=i<n}", - """ - a[i] = 23*b[i]**2 + 25*b[i]**2 - """) - - knl = lp.extract_subst(knl, "bsquare", "alpha*b[i]**2", "alpha") - - print(knl) - - from loopy.symbolic import parse - - insn, = knl.instructions - assert insn.expression == parse("bsquare(23) + bsquare(25)") - - def test_type_inference_no_artificial_doubles(ctx_factory): ctx = ctx_factory() @@ -142,28 +125,6 @@ def test_sized_and_complex_literals(ctx_factory): lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(n=5)) -def test_assume(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i]: 0<=i<n}", - "a[i] = a[i] + 1", - [lp.GlobalArg("a", np.float32, shape="n"), "..."]) - - knl = lp.split_iname(knl, "i", 16) - knl = lp.set_loop_priority(knl, "i_outer,i_inner") - knl = lp.assume(knl, "n mod 16 = 0") - knl = lp.assume(knl, "n > 10") - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - print(gen_knl) - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) - assert "if" not in compiled.get_code() - - def test_simple_side_effect(ctx_factory): ctx = ctx_factory() @@ -184,22 +145,6 @@ def test_simple_side_effect(ctx_factory): print(compiled.get_code()) -def test_nonsense_reduction(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i]: 0<=i<100}", - """ - a[i] = sum(i, 2) - """, - [lp.GlobalArg("a", np.float32, shape=(100,))] - ) - - import pytest - with pytest.raises(RuntimeError): - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - - def test_owed_barriers(ctx_factory): ctx = ctx_factory() @@ -243,56 +188,6 @@ def test_wg_too_small(ctx_factory): lp.CompiledKernel(ctx, gen_knl).get_code() -def test_join_inames(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<16}", - [ - "b[i,j] = 2*a[i,j]" - ], - [ - lp.GlobalArg("a", np.float32, shape=(16, 16,)), - lp.GlobalArg("b", np.float32, shape=(16, 16,)) - ], - ) - - ref_knl = knl - - knl = lp.add_prefetch(knl, "a", sweep_inames=["i", "j"]) - knl = lp.join_inames(knl, ["a_dim_0", "a_dim_1"]) - - lp.auto_test_vs_ref(ref_knl, ctx, knl, print_ref_code=True) - - -def test_divisibility_assumption(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "[n] -> {[i]: 0<=i<n}", - [ - "b[i] = 2*a[i]" - ], - [ - lp.GlobalArg("a", np.float32, shape=("n",)), - lp.GlobalArg("b", np.float32, shape=("n",)), - lp.ValueArg("n", np.int32), - ], - assumptions="n>=1 and (exists zz: n = 16*zz)") - - ref_knl = knl - - knl = lp.split_iname(knl, "i", 16) - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - code = lp.generate_code(k) - assert "if" not in code - - lp.auto_test_vs_ref(ref_knl, ctx, knl, - parameters={"n": 16**3}) - - def test_multi_cse(ctx_factory): ctx = ctx_factory() @@ -315,149 +210,6 @@ def test_multi_cse(ctx_factory): print(compiled.get_code()) -def test_stencil(ctx_factory): - ctx = ctx_factory() - - # n=32 causes corner case behavior in size calculations for temprorary (a - # non-unifiable, two-constant-segments PwAff as the base index) - - n = 256 - knl = lp.make_kernel( - "{[i,j]: 0<= i,j < %d}" % n, - [ - "a_offset(ii, jj) := a[ii+1, jj+1]", - "z[i,j] = -2*a_offset(i,j)" - " + a_offset(i,j-1)" - " + a_offset(i,j+1)" - " + a_offset(i-1,j)" - " + a_offset(i+1,j)" - ], - [ - lp.GlobalArg("a", np.float32, shape=(n+2, n+2,)), - lp.GlobalArg("z", np.float32, shape=(n+2, n+2,)) - ]) - - ref_knl = knl - - def variant_1(knl): - knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"]) - knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"]) - return knl - - def variant_2(knl): - knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"], - fetch_bounding_box=True) - knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"]) - return knl - - for variant in [ - #variant_1, - variant_2, - ]: - lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), - print_ref_code=False, - op_count=[n*n], op_label=["cells"]) - - -def test_stencil_with_overfetch(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<= i,j < n}", - [ - "a_offset(ii, jj) := a[ii+2, jj+2]", - "z[i,j] = -2*a_offset(i,j)" - " + a_offset(i,j-1)" - " + a_offset(i,j+1)" - " + a_offset(i-1,j)" - " + a_offset(i+1,j)" - - " + a_offset(i,j-2)" - " + a_offset(i,j+2)" - " + a_offset(i-2,j)" - " + a_offset(i+2,j)" - ], - assumptions="n>=1") - - if ctx.devices[0].platform.name == "Portable Computing Language": - # https://github.com/pocl/pocl/issues/205 - pytest.skip("takes very long to compile on pocl") - - knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) - - ref_knl = knl - - def variant_overfetch(knl): - knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1", - slabs=(1, 1)) - knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0", - slabs=(1, 1)) - knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"], - fetch_bounding_box=True) - knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"]) - return knl - - for variant in [variant_overfetch]: - n = 200 - lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), - print_ref_code=False, - op_count=[n*n], parameters=dict(n=n), op_label=["cells"]) - - -def test_eq_constraint(ctx_factory): - logging.basicConfig(level=logging.INFO) - - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<= i,j < 32}", - [ - "a[i] = b[i]" - ], - [ - lp.GlobalArg("a", np.float32, shape=(1000,)), - lp.GlobalArg("b", np.float32, shape=(1000,)) - ]) - - knl = lp.split_iname(knl, "i", 16, outer_tag="g.0") - knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0") - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for knl in kernel_gen: - print(lp.generate_code(knl)) - - -def test_argmax(ctx_factory): - logging.basicConfig(level=logging.INFO) - - dtype = np.dtype(np.float32) - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - n = 10000 - - knl = lp.make_kernel( - "{[i]: 0<=i<%d}" % n, - """ - max_val, max_idx = argmax(i, fabs(a[i])) - """) - - knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) - print(lp.preprocess_kernel(knl)) - knl = lp.set_options(knl, write_cl=True, highlight_cl=True) - - a = np.random.randn(10000).astype(dtype) - evt, (max_idx, max_val) = knl(queue, a=a, out_host=True) - assert max_val == np.max(np.abs(a)) - assert max_idx == np.where(np.abs(a) == max_val)[-1] - - # {{{ code generator fuzzing def make_random_value(): @@ -571,552 +323,148 @@ def test_fuzz_code_generator(ctx_factory): # }}} -def test_empty_reduction(ctx_factory): +def test_bare_data_dependency(ctx_factory): dtype = np.dtype(np.float32) ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( [ - "{[i]: 0<=i<20}", - "[i] -> {[j]: 0<=j<0}" + "[znirp] -> {[i]: 0<=i<znirp}", ], [ - "a[i] = sum(j, j)", + "<> znirp = n", + "a[i] = 1", ], [ - lp.GlobalArg("a", dtype, (20,)), + lp.GlobalArg("a", dtype, shape=("n"), order="C"), + lp.ValueArg("n", np.int32), ]) + cknl = lp.CompiledKernel(ctx, knl) + n = 20000 + evt, (a,) = cknl(queue, n=n, out_host=True) - evt, (a,) = cknl(queue) + assert a.shape == (n,) + assert (a == 1).all() - assert (a.get() == 0).all() +# {{{ test race detection -def test_nested_dependent_reduction(ctx_factory): - dtype = np.dtype(np.int32) +@pytest.mark.skipif("sys.version_info < (2,6)") +def test_ilp_write_race_detection_global(ctx_factory): ctx = ctx_factory() - queue = cl.CommandQueue(ctx) knl = lp.make_kernel( + "[n] -> {[i,j]: 0<=i,j<n }", [ - "{[i]: 0<=i<n}", - "{[j]: 0<=j<i+sumlen}" + "a[i] = 5+i+j", ], [ - "<> sumlen = l[i]", - "a[i] = sum(j, j)", + lp.GlobalArg("a", np.float32), + lp.ValueArg("n", np.int32, approximately=1000), ], - [ - lp.ValueArg("n", np.int32), - lp.GlobalArg("a", dtype, ("n",)), - lp.GlobalArg("l", np.int32, ("n",)), - ]) + assumptions="n>=1") - cknl = lp.CompiledKernel(ctx, knl) + knl = lp.tag_inames(knl, dict(j="ilp")) - n = 330 - l = np.arange(n, dtype=np.int32) - evt, (a,) = cknl(queue, l=l, n=n, out_host=True) + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + + with lp.CacheMode(False): + from loopy.diagnostic import WriteRaceConditionWarning + from warnings import catch_warnings + with catch_warnings(record=True) as warn_list: + list(lp.generate_loop_schedules(knl)) - tgt_result = (2*l-1)*2*l/2 - assert (a == tgt_result).all() + assert any(isinstance(w.message, WriteRaceConditionWarning) + for w in warn_list) -def test_multi_nested_dependent_reduction(ctx_factory): - dtype = np.dtype(np.int32) +def test_ilp_write_race_avoidance_local(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( + "{[i,j]: 0<=i<16 and 0<=j<17 }", [ - "{[itgt]: 0 <= itgt < ntgts}", - "{[isrc_box]: 0 <= isrc_box < nboxes}", - "{[isrc]: 0 <= isrc < npart}" - ], - [ - "<> npart = nparticles_per_box[isrc_box]", - "a[itgt] = sum((isrc_box, isrc), 1)", - ], - [ - lp.ValueArg("n", np.int32), - lp.GlobalArg("a", dtype, ("n",)), - lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes",)), - lp.ValueArg("ntgts", np.int32), - lp.ValueArg("nboxes", np.int32), + "<> a[i] = 5+i+j", ], - assumptions="ntgts>=1") + []) - cknl = lp.CompiledKernel(ctx, knl) - print(cknl.get_code()) - # FIXME: Actually test functionality. + knl = lp.tag_inames(knl, dict(i="l.0", j="ilp")) + + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + for k in lp.generate_loop_schedules(knl): + assert k.temporary_variables["a"].shape == (16, 17) -def test_recursive_nested_dependent_reduction(ctx_factory): - dtype = np.dtype(np.int32) +def test_ilp_write_race_avoidance_private(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( + "{[j]: 0<=j<16 }", [ - "{[itgt]: 0 <= itgt < ntgts}", - "{[isrc_box]: 0 <= isrc_box < nboxes}", - "{[isrc]: 0 <= isrc < npart}" - ], - [ - "<> npart = nparticles_per_box[isrc_box]", - "<> boxsum = sum(isrc, isrc+isrc_box+itgt)", - "a[itgt] = sum(isrc_box, boxsum)", - ], - [ - lp.ValueArg("n", np.int32), - lp.GlobalArg("a", dtype, ("n",)), - lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes",)), - lp.ValueArg("ntgts", np.int32), - lp.ValueArg("nboxes", np.int32), + "<> a = 5+j", ], - assumptions="ntgts>=1") - - cknl = lp.CompiledKernel(ctx, knl) - print(cknl.get_code()) - # FIXME: Actually test functionality. - + []) -def test_dependent_loop_bounds(ctx_factory): - dtype = np.dtype(np.float32) - ctx = ctx_factory() + knl = lp.tag_inames(knl, dict(j="ilp")) - knl = lp.make_kernel( - [ - "{[i]: 0<=i<n}", - "{[jj]: 0<=jj<row_len}", - ], - [ - "<> row_len = a_rowstarts[i+1] - a_rowstarts[i]", - "a_sum[i] = sum(jj, a_values[[a_rowstarts[i]+jj]])", - ], - [ - lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto), - lp.GlobalArg("a_indices", np.int32, shape=lp.auto), - lp.GlobalArg("a_values", dtype), - lp.GlobalArg("a_sum", dtype, shape=lp.auto), - lp.ValueArg("n", np.int32), - ], - assumptions="n>=1 and row_len>=1") + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + for k in lp.generate_loop_schedules(knl): + assert k.temporary_variables["a"].shape == (16,) - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") +# }}} -def test_dependent_loop_bounds_2(ctx_factory): - dtype = np.dtype(np.float32) +def test_write_parameter(ctx_factory): + dtype = np.float32 ctx = ctx_factory() knl = lp.make_kernel( + "{[i,j]: 0<=i,j<n }", + """ + a = sum((i,j), i*j) + b = sum(i, sum(j, i*j)) + n = 15 + """, [ - "{[i]: 0<=i<n}", - "{[jj]: 0<=jj<row_len}", - ], - [ - "<> row_start = a_rowstarts[i]", - "<> row_len = a_rowstarts[i+1] - row_start", - "ax[i] = sum(jj, a_values[[row_start+jj]])", - ], - [ - lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto), - lp.GlobalArg("a_indices", np.int32, shape=lp.auto), - lp.GlobalArg("a_values", dtype, strides=(1,)), - lp.GlobalArg("ax", dtype, shape=lp.auto), - lp.ValueArg("n", np.int32), + lp.GlobalArg("a", dtype, shape=()), + lp.GlobalArg("b", dtype, shape=()), + lp.ValueArg("n", np.int32, approximately=1000), ], - assumptions="n>=1 and row_len>=1") + assumptions="n>=1") - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", - inner_tag="l.0") - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + import pytest + with pytest.raises(RuntimeError): + lp.CompiledKernel(ctx, knl).get_code() -def test_dependent_loop_bounds_3(ctx_factory): - # The point of this test is that it shows a dependency between - # domains that is exclusively mediated by the row_len temporary. - # It also makes sure that row_len gets read before any - # conditionals use it. +# {{{ arg guessing - dtype = np.dtype(np.float32) +def test_arg_shape_guessing(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( + "{[i,j]: 0<=i,j<n }", + """ + a = 1.5 + sum((i,j), i*j) + b[i, j] = i*j + c[i+j, j] = b[j,i] + """, [ - "{[i]: 0<=i<n}", - "{[jj]: 0<=jj<row_len}", - ], - [ - "<> row_len = a_row_lengths[i]", - "a[i,jj] = 1", + lp.GlobalArg("a", shape=lp.auto), + lp.GlobalArg("b", shape=lp.auto), + lp.GlobalArg("c", shape=lp.auto), + lp.ValueArg("n"), ], - [ - lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto), - lp.GlobalArg("a", dtype, shape=("n,n"), order="C"), - lp.ValueArg("n", np.int32), - ]) - - assert knl.parents_per_domain()[1] == 0 + assumptions="n>=1") - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", - inner_tag="l.0") + print(knl) + print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") - knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1", - inner_tag="l.1") - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - - import pytest - with pytest.raises(RuntimeError): - list(lp.generate_loop_schedules(knl_bad)) - - -def test_independent_multi_domain(ctx_factory): - dtype = np.dtype(np.float32) - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - knl = lp.make_kernel( - [ - "{[i]: 0<=i<n}", - "{[j]: 0<=j<n}", - ], - [ - "a[i] = 1", - "b[j] = 2", - ], - [ - lp.GlobalArg("a", dtype, shape=("n"), order="C"), - lp.GlobalArg("b", dtype, shape=("n"), order="C"), - lp.ValueArg("n", np.int32), - ]) - - knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", - inner_tag="l.0") - knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", - inner_tag="l.0") - assert knl.parents_per_domain() == 2*[None] - - n = 50 - cknl = lp.CompiledKernel(ctx, knl) - evt, (a, b) = cknl(queue, n=n, out_host=True) - - assert a.shape == (50,) - assert b.shape == (50,) - assert (a == 1).all() - assert (b == 2).all() - - -def test_bare_data_dependency(ctx_factory): - dtype = np.dtype(np.float32) - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - knl = lp.make_kernel( - [ - "[znirp] -> {[i]: 0<=i<znirp}", - ], - [ - "<> znirp = n", - "a[i] = 1", - ], - [ - lp.GlobalArg("a", dtype, shape=("n"), order="C"), - lp.ValueArg("n", np.int32), - ]) - - cknl = lp.CompiledKernel(ctx, knl) - n = 20000 - evt, (a,) = cknl(queue, n=n, out_host=True) - - assert a.shape == (n,) - assert (a == 1).all() - - -def test_equality_constraints(ctx_factory): - dtype = np.float32 - ctx = ctx_factory() - - order = "C" - - n = 10 - - knl = lp.make_kernel([ - "[n] -> {[i,j]: 0<=i,j<n }", - "{[k]: k =i+5 and k < n}", - ], - [ - "a[i,j] = 5 {id=set_all}", - "b[i,k] = 22 {dep=set_all}", - ], - [ - lp.GlobalArg("a,b", dtype, shape="n, n", order=order), - lp.ValueArg("n", np.int32, approximately=1000), - ], - name="equality_constraints", assumptions="n>=1") - - seq_knl = knl - - knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") - knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - #print(knl) - #print(knl.domains[0].detect_equalities()) - - lp.auto_test_vs_ref(seq_knl, ctx, knl, - parameters=dict(n=n), print_ref_code=True) - - -def test_stride(ctx_factory): - dtype = np.float32 - ctx = ctx_factory() - - order = "C" - - n = 10 - - knl = lp.make_kernel([ - "{[i]: 0<=i<n and (exists l: i = 2*l)}", - ], - [ - "a[i] = 5", - ], - [ - lp.GlobalArg("a", dtype, shape="n", order=order), - lp.ValueArg("n", np.int32, approximately=1000), - ], - assumptions="n>=1") - - seq_knl = knl - - lp.auto_test_vs_ref(seq_knl, ctx, knl, - parameters=dict(n=n)) - - -def test_domain_dependency_via_existentially_quantified_variable(ctx_factory): - dtype = np.float32 - ctx = ctx_factory() - - order = "C" - - n = 10 - - knl = lp.make_kernel([ - "{[i]: 0<=i<n }", - "{[k]: k=i and (exists l: k = 2*l) }", - ], - [ - "a[i] = 5 {id=set}", - "b[k] = 6 {dep=set}", - ], - [ - lp.GlobalArg("a,b", dtype, shape="n", order=order), - lp.ValueArg("n", np.int32, approximately=1000), - ], - assumptions="n>=1") - - seq_knl = knl - - lp.auto_test_vs_ref(seq_knl, ctx, knl, - parameters=dict(n=n)) - - -def test_double_sum(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - n = 20 - - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n }", - [ - "a = simul_reduce(sum, (i,j), i*j)", - "b = simul_reduce(sum, i, simul_reduce(sum, j, i*j))", - ], - assumptions="n>=1") - - evt, (a, b) = knl(queue, n=n) - - ref = sum(i*j for i in range(n) for j in range(n)) - assert a.get() == ref - assert b.get() == ref - - -@pytest.mark.parametrize(("op_name", "np_op"), [ - ("sum", np.sum), - ("product", np.prod), - ("min", np.min), - ("max", np.max), - ]) -def test_reduction_library(ctx_factory, op_name, np_op): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - knl = lp.make_kernel( - "{[i,j]: 0<=i<n and 0<=j<m }", - [ - "res[i] = reduce(%s, j, a[i,j])" % op_name, - ], - assumptions="n>=1") - - a = np.random.randn(20, 10) - evt, (res,) = knl(queue, a=a) - - assert np.allclose(res, np_op(a, axis=1)) - - -def test_double_sum_made_unique(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - n = 20 - - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n }", - [ - "a = sum((i,j), i*j)", - "b = sum(i, sum(j, i*j))", - ], - assumptions="n>=1") - - knl = lp.make_reduction_inames_unique(knl) - print(knl) - - evt, (a, b) = knl(queue, n=n) - - ref = sum(i*j for i in range(n) for j in range(n)) - assert a.get() == ref - assert b.get() == ref - - -# {{{ test race detection - -@pytest.mark.skipif("sys.version_info < (2,6)") -def test_ilp_write_race_detection_global(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "[n] -> {[i,j]: 0<=i,j<n }", - [ - "a[i] = 5+i+j", - ], - [ - lp.GlobalArg("a", np.float32), - lp.ValueArg("n", np.int32, approximately=1000), - ], - assumptions="n>=1") - - knl = lp.tag_inames(knl, dict(j="ilp")) - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - - with lp.CacheMode(False): - from loopy.diagnostic import WriteRaceConditionWarning - from warnings import catch_warnings - with catch_warnings(record=True) as warn_list: - list(lp.generate_loop_schedules(knl)) - - assert any(isinstance(w.message, WriteRaceConditionWarning) - for w in warn_list) - - -def test_ilp_write_race_avoidance_local(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<=i<16 and 0<=j<17 }", - [ - "<> a[i] = 5+i+j", - ], - []) - - knl = lp.tag_inames(knl, dict(i="l.0", j="ilp")) - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - assert k.temporary_variables["a"].shape == (16, 17) - - -def test_ilp_write_race_avoidance_private(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[j]: 0<=j<16 }", - [ - "<> a = 5+j", - ], - []) - - knl = lp.tag_inames(knl, dict(j="ilp")) - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - assert k.temporary_variables["a"].shape == (16,) - -# }}} - - -def test_write_parameter(ctx_factory): - dtype = np.float32 - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n }", - """ - a = sum((i,j), i*j) - b = sum(i, sum(j, i*j)) - n = 15 - """, - [ - lp.GlobalArg("a", dtype, shape=()), - lp.GlobalArg("b", dtype, shape=()), - lp.ValueArg("n", np.int32, approximately=1000), - ], - assumptions="n>=1") - - import pytest - with pytest.raises(RuntimeError): - lp.CompiledKernel(ctx, knl).get_code() - - -# {{{ arg guessing - -def test_arg_shape_guessing(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n }", - """ - a = 1.5 + sum((i,j), i*j) - b[i, j] = i*j - c[i+j, j] = b[j,i] - """, - [ - lp.GlobalArg("a", shape=lp.auto), - lp.GlobalArg("b", shape=lp.auto), - lp.GlobalArg("c", shape=lp.auto), - lp.ValueArg("n"), - ], - assumptions="n>=1") - - print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) - - -def test_arg_guessing(ctx_factory): - ctx = ctx_factory() +def test_arg_guessing(ctx_factory): + ctx = ctx_factory() knl = lp.make_kernel( "{[i,j]: 0<=i,j<n }", @@ -1169,18 +517,6 @@ def test_nonlinear_index(ctx_factory): print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) -def test_triangle_domain(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n and i <= j}", - "a[i,j] = 17", - assumptions="n>=1") - - print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) - - def test_offsets_and_slicing(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -1227,144 +563,21 @@ def test_vector_ilp_with_prefetch(ctx_factory): "out[i] = 2*a[i]", [ # Tests that comma'd arguments interoperate with - # argument guessing. - lp.GlobalArg("out,a", np.float32, shape=lp.auto), - "..." - ]) - - knl = lp.split_iname(knl, "i", 128, inner_tag="l.0") - knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") - knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"]) - - cknl = lp.CompiledKernel(ctx, knl) - cknl.cl_kernel_info() - - import re - code = cknl.get_code() - assert len(list(re.finditer("barrier", code))) == 1 - - -# {{{ convolutions - -def test_convolution(ctx_factory): - ctx = ctx_factory() - - dtype = np.float32 - - knl = lp.make_kernel( - "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \ - -f_w <= f_x,f_y <= f_w \ - and 0 <= im_x < im_w and 0 <= im_y < im_h \ - and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \ - }", - """ - out[iimg, ifeat, im_x, im_y] = sum((f_x, f_y, icolor), \ - img[iimg, f_w+im_x-f_x, f_w+im_y-f_y, icolor] \ - * f[ifeat, f_w+f_x, f_w+f_y, icolor]) - """, - [ - lp.GlobalArg("f", dtype, shape=lp.auto), - lp.GlobalArg("img", dtype, shape=lp.auto), - lp.GlobalArg("out", dtype, shape=lp.auto), - "..." - ], - assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0", - flags="annotate_inames", - defines=dict(ncolors=3)) - - f_w = 3 - - knl = lp.fix_parameters(knl, f_w=f_w) - - ref_knl = knl - - def variant_0(knl): - #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0") - knl = lp.set_loop_priority(knl, "iimg,im_x,im_y,ifeat,f_x,f_y") - return knl - - def variant_1(knl): - knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0") - knl = lp.set_loop_priority(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y") - return knl - - def variant_2(knl): - knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0") - knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.tag_inames(knl, dict(ifeat="g.2")) - knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]") - knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y") - return knl - - for variant in [ - variant_0, - variant_1, - variant_2 - ]: - lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), - parameters=dict( - im_w=128, im_h=128, f_w=f_w, - nfeats=3, nimgs=3 - )) - - -def test_convolution_with_nonzero_base(ctx_factory): - # This is kept alive as a test for domains that don't start at zero. - # These are a bad idea for split_iname, which places its origin at zero - # and therefore produces a first block that is odd-sized. - # - # Therefore, for real tests, check test_convolution further up. - - ctx = ctx_factory() - - dtype = np.float32 - - knl = lp.make_kernel( - "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \ - -f_w <= f_x,f_y <= f_w \ - and f_w <= im_x < im_w-f_w and f_w <= im_y < im_h-f_w \ - and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \ - }", - """ - out[iimg, ifeat, im_x-f_w, im_y-f_w] = sum((f_x, f_y, icolor), \ - img[iimg, im_x-f_x, im_y-f_y, icolor] \ - * f[ifeat, f_w+f_x, f_w+f_y, icolor]) - """, - [ - lp.GlobalArg("f", dtype, shape=lp.auto), - lp.GlobalArg("img", dtype, shape=lp.auto), - lp.GlobalArg("out", dtype, shape=lp.auto), - "..." - ], - assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0", - flags="annotate_inames", - defines=dict(ncolors=3)) - - ref_knl = knl - - f_w = 3 - - def variant_0(knl): - #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0") - knl = lp.set_loop_priority(knl, "iimg,im_x,im_y,ifeat,f_x,f_y") - return knl + # argument guessing. + lp.GlobalArg("out,a", np.float32, shape=lp.auto), + "..." + ]) - def variant_1(knl): - knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0") - knl = lp.set_loop_priority(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y") - return knl + knl = lp.split_iname(knl, "i", 128, inner_tag="l.0") + knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") + knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"]) - for variant in [ - variant_0, - variant_1, - ]: - lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), - parameters=dict( - im_w=128, im_h=128, f_w=f_w, - nfeats=12, nimgs=17 - )) + cknl = lp.CompiledKernel(ctx, knl) + cknl.cl_kernel_info() -# }}} + import re + code = cknl.get_code() + assert len(list(re.finditer("barrier", code))) == 1 def test_c_instruction(ctx_factory): @@ -1442,21 +655,6 @@ def test_inames_deps_from_write_subscript(ctx_factory): assert "i" in knl.insn_inames("myred") -def test_split_reduction(ctx_factory): - knl = lp.make_kernel( - "{[i,j,k]: 0<=i,j,k<n}", - """ - b = sum((i,j,k), a[i,j,k]) - """, - [ - lp.GlobalArg("box_source_starts,box_source_counts_nonchild,a", - None, shape=None), - "..."]) - - knl = lp.split_reduction_outward(knl, "j,k") - # FIXME: finish test - - def test_modulo_indexing(ctx_factory): ctx = ctx_factory() @@ -1478,132 +676,6 @@ def test_modulo_indexing(ctx_factory): ))) -def test_rob_stroud_bernstein(ctx_factory): - ctx = ctx_factory() - - # NOTE: tmp would have to be zero-filled beforehand - - knl = lp.make_kernel( - "{[el, i2, alpha1,alpha2]: \ - 0 <= el < nels and \ - 0 <= i2 < nqp1d and \ - 0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }", - """ - <> xi = qpts[1, i2] {inames=+el} - <> s = 1-xi - <> r = xi/s - <> aind = 0 {id=aind_init,inames=+i2:el} - - <> w = s**(deg-alpha1) {id=init_w} - - tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \ - {id=write_tmp,inames=+alpha2} - w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \ - {id=update_w,dep=init_w:write_tmp} - aind = aind + 1 \ - {id=aind_incr,\ - dep=aind_init:write_tmp:update_w, \ - inames=+el:i2:alpha1:alpha2} - """, - [ - # Must declare coeffs to have "no" shape, to keep loopy - # from trying to figure it out the shape automatically. - - lp.GlobalArg("coeffs", None, shape=None), - "..." - ], - assumptions="deg>=0 and nels>=1" - ) - - knl = lp.fix_parameters(knl, nqp1d=7, deg=4) - knl = lp.split_iname(knl, "el", 16, inner_tag="l.0") - knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp", - slabs=(0, 1)) - knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr")) - - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( - qpts=np.float32, - coeffs=np.float32, - tmp=np.float32, - ))) - - -def test_rob_stroud_bernstein_full(ctx_factory): - #logging.basicConfig(level=logging.DEBUG) - ctx = ctx_factory() - - # NOTE: result would have to be zero-filled beforehand - - knl = lp.make_kernel( - "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \ - 0 <= el < nels and \ - 0 <= i2 < nqp1d and \ - 0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 and\ - \ - 0 <= i1_2 < nqp1d and \ - 0 <= alpha1_2 <= deg and \ - 0 <= i2_2 < nqp1d \ - }", - """ - <> xi = qpts[1, i2] {inames=+el} - <> s = 1-xi - <> r = xi/s - <> aind = 0 {id=aind_init,inames=+i2:el} - - <> w = s**(deg-alpha1) {id=init_w} - - <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \ - {id=write_tmp,inames=+alpha2} - w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \ - {id=update_w,dep=init_w:write_tmp} - aind = aind + 1 \ - {id=aind_incr,\ - dep=aind_init:write_tmp:update_w, \ - inames=+el:i2:alpha1:alpha2} - - <> xi2 = qpts[0, i1_2] {dep=aind_incr,inames=+el} - <> s2 = 1-xi2 - <> r2 = xi2/s2 - <> w2 = s2**deg - - result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \ - w2 * tmp[alpha1_2, i2_2] \ - {inames=el:alpha1_2:i1_2:i2_2} - - w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2) - """, - [ - # Must declare coeffs to have "no" shape, to keep loopy - # from trying to figure it out the shape automatically. - - lp.GlobalArg("coeffs", None, shape=None), - "..." - ], - assumptions="deg>=0 and nels>=1" - ) - - knl = lp.fix_parameters(knl, nqp1d=7, deg=4) - - if 0: - knl = lp.split_iname(knl, "el", 16, inner_tag="l.0") - knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp", - slabs=(0, 1)) - knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr")) - - from pickle import dumps, loads - knl = loads(dumps(knl)) - - knl = lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( - qpts=np.float32, - tmp=np.float32, - coeffs=np.float32, - result=np.float32, - )) - print(knl) - - @pytest.mark.parametrize("vec_len", [2, 3, 4, 8, 16]) def test_vector_types(ctx_factory, vec_len): ctx = ctx_factory() @@ -1632,28 +704,6 @@ def test_vector_types(ctx_factory, vec_len): )) -def test_tag_data_axes(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{ [i,j,k]: 0<=i,j,k<n }", - "out[i,j,k] = 15") - - ref_knl = knl - - with pytest.raises(lp.LoopyError): - lp.tag_data_axes(knl, "out", "N1,N0,N5") - - with pytest.raises(lp.LoopyError): - lp.tag_data_axes(knl, "out", "N1,N0,c") - - knl = lp.tag_data_axes(knl, "out", "N1,N0,N2") - knl = lp.tag_inames(knl, dict(j="g.0", i="g.1")) - - lp.auto_test_vs_ref(ref_knl, ctx, knl, - parameters=dict(n=20)) - - def test_conditional(ctx_factory): #logging.basicConfig(level=logging.DEBUG) ctx = ctx_factory() @@ -1768,431 +818,62 @@ def test_multiple_writes_to_local_temporary(): "{[i,e]: 0<=i<5 and 0<=e<nelements}", """ <> temp[i, 0] = 17 - temp[i, 1] = 15 - """) - knl = lp.tag_inames(knl, dict(i="l.0")) - - knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - code, _ = lp.generate_code(k) - print(code) - - -def test_fd_demo(): - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n}", - "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \ - + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \ - + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]") - #assumptions="n mod 16=0") - knl = lp.split_iname(knl, - "i", 16, outer_tag="g.1", inner_tag="l.1") - knl = lp.split_iname(knl, - "j", 16, outer_tag="g.0", inner_tag="l.0") - knl = lp.add_prefetch(knl, "u", - ["i_inner", "j_inner"], - fetch_bounding_box=True) - - #n = 1000 - #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32) - - knl = lp.set_options(knl, write_cl=True) - knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32)) - code, inf = lp.generate_code(knl) - print(code) - - assert "double" not in code - - -def test_fd_1d(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i]: 0<=i<n}", - "result[i] = u[i+1]-u[i]") - - knl = lp.add_and_infer_dtypes(knl, {"u": np.float32}) - ref_knl = knl - - knl = lp.split_iname(knl, "i", 16) - knl = lp.extract_subst(knl, "u_acc", "u[j]", parameters="j") - knl = lp.precompute(knl, "u_acc", "i_inner", default_tag="for") - knl = lp.assume(knl, "n mod 16 = 0") - - lp.auto_test_vs_ref( - ref_knl, ctx, knl, - parameters=dict(n=2048)) - - -def test_make_copy_kernel(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - intermediate_format = "f,f,sep" - - a1 = np.random.randn(1024, 4, 3) - - cknl1 = lp.make_copy_kernel(intermediate_format) - - cknl1 = lp.fix_parameters(cknl1, n2=3) - - cknl1 = lp.set_options(cknl1, write_cl=True) - evt, a2 = cknl1(queue, input=a1) - - cknl2 = lp.make_copy_kernel("c,c,c", intermediate_format) - cknl2 = lp.fix_parameters(cknl2, n2=3) - - evt, a3 = cknl2(queue, input=a2) - - assert (a1 == a3).all() - - -def test_set_arg_order(): - knl = lp.make_kernel( - "{ [i,j]: 0<=i,j<n }", - "out[i,j] = a[i]*b[j]") - - knl = lp.set_argument_order(knl, "out,a,n,b") - - -def test_affine_map_inames(): - knl = lp.make_kernel( - "{[e, i,j,n]: 0<=e<E and 0<=i,j,n<N}", - "rhsQ[e, n+i, j] = rhsQ[e, n+i, j] - D[i, n]*x[i,j]") - - knl = lp.affine_map_inames(knl, - "i", "i0", - "i0 = n+i") - - print(knl) - - -def test_precompute_confusing_subst_arguments(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<=i<n and 0<=j<5}", - """ - D(i):=a[i+1]-a[i] - b[i,j] = D(j) - """) - - knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) - - ref_knl = knl - - knl = lp.tag_inames(knl, dict(j="g.1")) - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - - from loopy.symbolic import get_dependencies - assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression) - knl = lp.precompute(knl, "D") - - lp.auto_test_vs_ref( - ref_knl, ctx, knl, - parameters=dict(n=12345)) - - -def test_precompute_nested_subst(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<=i<n and 0<=j<5}", - """ - E:=a[i] - D:=E*E - b[i] = D - """) - - knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) - - ref_knl = knl - - knl = lp.tag_inames(knl, dict(j="g.1")) - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - - from loopy.symbolic import get_dependencies - assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression) - knl = lp.precompute(knl, "D", "i_inner") - - # There's only one surviving 'E' rule. - assert len([ - rule_name - for rule_name in knl.substitutions - if rule_name.startswith("E")]) == 1 - - # That rule should use the newly created prefetch inames, - # not the prior 'i_inner' - assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression) - - lp.auto_test_vs_ref( - ref_knl, ctx, knl, - parameters=dict(n=12345)) - - -def test_poisson(ctx_factory): - # Stolen from Peter Coogan and Rob Kirby for FEM assembly - ctx = ctx_factory() - - nbf = 5 - nqp = 5 - sdim = 3 - - knl = lp.make_kernel( - "{ [c,i,j,k,ell,ell2,ell3]: \ - 0 <= c < nels and \ - 0 <= i < nbf and \ - 0 <= j < nbf and \ - 0 <= k < nqp and \ - 0 <= ell,ell2 < sdim}", - """ - dpsi(bf,k0,dir) := \ - simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] ) - Ael[c,i,j] = \ - J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell)) - """, - assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0") - - print(knl) - - knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp) - - ref_knl = knl - - knl = lp.set_loop_priority(knl, ["c", "j", "i", "k"]) - - def variant_1(knl): - knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for') - knl = lp.set_loop_priority(knl, "c,i,j") - return knl - - def variant_2(knl): - knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for') - knl = lp.set_loop_priority(knl, "c,i,j") - return knl - - def add_types(knl): - return lp.add_and_infer_dtypes(knl, dict( - w=np.float32, - J=np.float32, - DPsi=np.float32, - DFinv=np.float32, - )) - - for variant in [ - #variant_1, - variant_2 - ]: - knl = variant(knl) - - lp.auto_test_vs_ref( - add_types(ref_knl), ctx, add_types(knl), - parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7)) - - -def test_auto_test_can_detect_problems(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n}", - """ - a[i,j] = 25 - """) - - knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) - - ref_knl = knl - - knl = lp.link_inames(knl, "i,j", "i0") - - from loopy.diagnostic import AutomaticTestFailure - with pytest.raises(AutomaticTestFailure): - lp.auto_test_vs_ref( - ref_knl, ctx, knl, - parameters=dict(n=123)) - - -def test_generate_c_snippet(): - from loopy.target.c import CTarget - - from pymbolic import var - I = var("I") # noqa - f = var("f") - df = var("df") - q_v = var("q_v") - eN = var("eN") # noqa - k = var("k") - u = var("u") - - from functools import partial - l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True) - - Instr = lp.Assignment # noqa - - knl = lp.make_kernel( - "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}", - [ - Instr(f[I], l_sum(k, q_v[k, I]*u)), - Instr(df[I], l_sum(k, q_v[k, I])), - ], - [ - lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"), - lp.GlobalArg("f,df", np.float64, shape="nSpace"), - lp.ValueArg("u", np.float64), - "...", - ], - target=CTarget(), - assumptions="nQuad>=1") - - if 0: # enable to play with prefetching - # (prefetch currently requires constant sizes) - knl = lp.fix_parameters(knl, nQuad=5, nSpace=3) - knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None) - - knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1)) - knl = lp.set_loop_priority(knl, "I,k_outer,k_inner") - - knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) - print(lp.generate_body(knl)) - - -def test_precompute_with_preexisting_inames(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}", - """ - result[e,i] = sum(j, D1[i,j]*u[e,j]) - result2[e,i] = sum(k, D2[i,k]*u[e,k]) - """) - - knl = lp.add_and_infer_dtypes(knl, { - "u": np.float32, - "D1": np.float32, - "D2": np.float32, - }) - - knl = lp.fix_parameters(knl, n=13) - - ref_knl = knl - - knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj") - knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj") - - knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for", - precompute_inames="ii,jj") - knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for", - precompute_inames="ii,jj") - - knl = lp.set_loop_priority(knl, "ii,jj,e,j,k") - - lp.auto_test_vs_ref( - ref_knl, ctx, knl, - parameters=dict(E=200)) - - -def test_precompute_with_preexisting_inames_fail(): - knl = lp.make_kernel( - "{[e,i,j,k]: 0<=e<E and 0<=i,j<n and 0<=k<2*n}", - """ - result[e,i] = sum(j, D1[i,j]*u[e,j]) - result2[e,i] = sum(k, D2[i,k]*u[e,k]) - """) - - knl = lp.add_and_infer_dtypes(knl, { - "u": np.float32, - "D1": np.float32, - "D2": np.float32, - }) - - knl = lp.fix_parameters(knl, n=13) - - knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj") - knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj") - - knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for", - precompute_inames="ii,jj") - with pytest.raises(lp.LoopyError): - lp.precompute(knl, "D2_subst", "i,k", default_tag="for", - precompute_inames="ii,jj") - - -def test_vectorize(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i]: 0<=i<n}", - """ - <> temp = 2*b[i] - a[i] = temp - """) - knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32)) - knl = lp.set_array_dim_names(knl, "a,b", "i") - knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4, - split_kwargs=dict(slabs=(0, 1))) - - knl = lp.tag_data_axes(knl, "a,b", "c,vec") - ref_knl = knl - ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"}) - - knl = lp.tag_inames(knl, {"i_inner": "vec"}) + temp[i, 1] = 15 + """) + knl = lp.tag_inames(knl, dict(i="l.0")) knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) - code, inf = lp.generate_code(knl) - - lp.auto_test_vs_ref( - ref_knl, ctx, knl, - parameters=dict(n=30)) + for k in lp.generate_loop_schedules(knl): + code, _ = lp.generate_code(k) + print(code) -def test_alias_temporaries(ctx_factory): +def test_make_copy_kernel(ctx_factory): ctx = ctx_factory() + queue = cl.CommandQueue(ctx) - knl = lp.make_kernel( - "{[i]: 0<=i<n}", - """ - times2(i) := 2*a[i] - times3(i) := 3*a[i] - times4(i) := 4*a[i] + intermediate_format = "f,f,sep" - x[i] = times2(i) - y[i] = times3(i) - z[i] = times4(i) - """) + a1 = np.random.randn(1024, 4, 3) - knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + cknl1 = lp.make_copy_kernel(intermediate_format) - ref_knl = knl + cknl1 = lp.fix_parameters(cknl1, n2=3) + + cknl1 = lp.set_options(cknl1, write_cl=True) + evt, a2 = cknl1(queue, input=a1) - knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") + cknl2 = lp.make_copy_kernel("c,c,c", intermediate_format) + cknl2 = lp.fix_parameters(cknl2, n2=3) - knl = lp.precompute(knl, "times2", "i_inner") - knl = lp.precompute(knl, "times3", "i_inner") - knl = lp.precompute(knl, "times4", "i_inner") + evt, a3 = cknl2(queue, input=a2) - knl = lp.alias_temporaries(knl, ["times2_0", "times3_0", "times4_0"]) + assert (a1 == a3).all() - lp.auto_test_vs_ref( - ref_knl, ctx, knl, - parameters=dict(n=30)) +def test_auto_test_can_detect_problems(ctx_factory): + ctx = ctx_factory() -def test_fusion(): - exp_kernel = lp.make_kernel( - ''' { [i]: 0<=i<n } ''', - ''' exp[i] = pow(E, z[i])''', - assumptions="n>0") + ref_knl = lp.make_kernel( + "{[i,j]: 0<=i,j<n}", + """ + a[i,j] = 25 + """) - sum_kernel = lp.make_kernel( - '{ [j]: 0<=j<n }', - 'out2 = sum(j, exp[j])', - assumptions='n>0') + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + """ + a[i,i] = 25 + """) - knl = lp.fuse_kernels([exp_kernel, sum_kernel]) + ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(a=np.float32)) + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) - print(knl) + from loopy.diagnostic import AutomaticTestFailure + with pytest.raises(AutomaticTestFailure): + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=123)) def test_sci_notation_literal(ctx_factory): @@ -2210,37 +891,6 @@ def test_sci_notation_literal(ctx_factory): assert (np.abs(out.get() - 1e-12) < 1e-20).all() -def test_rename_argument(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - kernel = lp.make_kernel( - '''{ [i]: 0<=i<n }''', - '''out[i] = a + 2''') - - kernel = lp.rename_argument(kernel, "a", "b") - - evt, (out,) = kernel(queue, b=np.float32(12), n=20) - - assert (np.abs(out.get() - 14) < 1e-8).all() - - -def test_to_batched(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - knl = lp.make_kernel( - ''' { [i,j]: 0<=i,j<n } ''', - ''' out[i] = sum(j, a[i,j]*x[j])''') - - bknl = lp.to_batched(knl, "nbatches", "out,x") - - a = np.random.randn(5, 5) - x = np.random.randn(7, 5) - - bknl(queue, a=a, x=x) - - def test_variable_size_temporary(): knl = lp.make_kernel( ''' { [i,j]: 0<=i,j<n } ''', @@ -2295,57 +945,6 @@ def test_indexof_vec(ctx_factory): #assert np.array_equal(out.ravel(order="C"), np.arange(25)) -def test_finite_difference_expr_subst(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - grid = np.linspace(0, 2*np.pi, 2048, endpoint=False) - h = grid[1] - grid[0] - u = cl.clmath.sin(cl.array.to_device(queue, grid)) - - fin_diff_knl = lp.make_kernel( - "{[i]: 1<=i<=n}", - "out[i] = -(f[i+1] - f[i-1])/h", - [lp.GlobalArg("out", shape="n+2"), "..."]) - - flux_knl = lp.make_kernel( - "{[j]: 1<=j<=n}", - "f[j] = u[j]**2/2", - [ - lp.GlobalArg("f", shape="n+2"), - lp.GlobalArg("u", shape="n+2"), - ]) - - fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl], - data_flow=[ - ("f", 1, 0) - ]) - - fused_knl = lp.set_options(fused_knl, write_cl=True) - evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1)) - - fused_knl = lp.assignment_to_subst(fused_knl, "f") - - fused_knl = lp.set_options(fused_knl, write_cl=True) - - # This is the real test here: The automatically generated - # shape expressions are '2+n' and the ones above are 'n+2'. - # Is loopy smart enough to understand that these are equal? - evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1)) - - fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i") - - gpu_knl = lp.split_iname( - fused0_knl, "inew", 128, outer_tag="g.0", inner_tag="l.0") - - precomp_knl = lp.precompute( - gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True) - - precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"}) - precomp_knl = lp.set_options(precomp_knl, return_dict=True) - evt, _ = precomp_knl(queue, u=u, h=h) - - def test_is_expression_equal(): from loopy.symbolic import is_expression_equal from pymbolic import var @@ -2359,92 +958,6 @@ def test_is_expression_equal(): assert is_expression_equal((x+y)**2, x**2 + 2*x*y + y**2) -def test_collect_common_factors(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{[i,j,k]: 0<=i,j<n}", - """ - <float32> out_tmp = 0 {id=out_init,inames=i} - out_tmp = out_tmp + alpha[i]*a[i,j]*b1[j] {id=out_up1,dep=out_init} - out_tmp = out_tmp + alpha[i]*a[j,i]*b2[j] {id=out_up2,dep=out_init} - out[i] = out_tmp {dep=out_up1:out_up2} - """) - knl = lp.add_and_infer_dtypes(knl, - dict(a=np.float32, alpha=np.float32, b1=np.float32, b2=np.float32)) - - ref_knl = knl - - knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", inner_tag="l.0") - knl = lp.collect_common_factors_on_increment(knl, "out_tmp") - - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=13)) - - -def test_ispc_target(occa_mode=False): - from loopy.target.ispc import ISPCTarget - - knl = lp.make_kernel( - "{ [i]: 0<=i<n }", - "out[i] = 2*a[i]", - [ - lp.GlobalArg("out,a", np.float32, shape=lp.auto), - "..." - ], - target=ISPCTarget(occa_mode=occa_mode)) - - knl = lp.split_iname(knl, "i", 8, inner_tag="l.0") - knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") - knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"]) - - codegen_result = lp.generate_code_v2( - lp.get_one_scheduled_kernel( - lp.preprocess_kernel(knl))) - - print(codegen_result.device_code()) - print(codegen_result.host_code()) - - -def test_cuda_target(): - from loopy.target.cuda import CudaTarget - - knl = lp.make_kernel( - "{ [i]: 0<=i<n }", - "out[i] = 2*a[i]", - [ - lp.GlobalArg("out,a", np.float32, shape=lp.auto), - "..." - ], - target=CudaTarget()) - - knl = lp.split_iname(knl, "i", 8, inner_tag="l.0") - knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") - knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"]) - - print( - lp.generate_code( - lp.get_one_scheduled_kernel( - lp.preprocess_kernel(knl)))[0]) - - -def test_chunk_iname(ctx_factory): - ctx = ctx_factory() - - knl = lp.make_kernel( - "{ [i]: 0<=i<n }", - "out[i] = 2*a[i]", - [ - lp.GlobalArg("out,a", np.float32, shape=lp.auto), - "..." - ], - assumptions="n>0") - - ref_knl = knl - knl = lp.chunk_iname(knl, "i", 3, inner_tag="l.0") - knl = lp.set_loop_priority(knl, "i_outer, i_inner") - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=130)) - - @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) def test_atomic(ctx_factory, dtype): ctx = ctx_factory() @@ -2476,23 +989,6 @@ def test_atomic(ctx_factory, dtype): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000)) -def test_clamp(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - n = 15 * 10**6 - x = cl.clrandom.rand(queue, n, dtype=np.float32) - - knl = lp.make_kernel( - "{ [i]: 0<=i<n }", - "out[i] = clamp(x[i], a, b)") - - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - knl = lp.set_options(knl, write_cl=True) - - evt, (out,) = knl(queue, x=x, a=np.float32(12), b=np.float32(15)) - - def test_forced_iname_deps_and_reduction(): # See https://github.com/inducer/loopy/issues/24 @@ -2528,42 +1024,6 @@ def test_forced_iname_deps_and_reduction(): print(k.stringify(with_dependencies=True)) -@pytest.mark.parametrize("tp", ["f32", "f64"]) -def test_random123(ctx_factory, tp): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - import pyopencl.version # noqa - if cl.version.VERSION < (2016, 2): - pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2") - - n = 150000 - - knl = lp.make_kernel( - "{ [i]: 0<=i<n }", - """ - <> key2 = make_uint2(i, 324830944) {inames=i} - <> key4 = make_uint4(i, 324830944, 234181, 2233) {inames=i} - <> ctr = make_uint4(0, 1, 2, 3) {inames=i} - <> real, ctr = philox4x32_TYPE(ctr, key2) - <> imag, ctr = threefry4x32_TYPE(ctr, key4) - - out[i, 0] = real.s0 + 1j * imag.s0 - out[i, 1] = real.s1 + 1j * imag.s1 - out[i, 2] = real.s2 + 1j * imag.s2 - out[i, 3] = real.s3 + 1j * imag.s3 - """.replace("TYPE", tp)) - - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - knl = lp.set_options(knl, write_cl=True) - - evt, (out,) = knl(queue, n=n) - - out = out.get() - assert (out < 1).all() - assert (0 <= out).all() - - def test_kernel_splitting(ctx_factory): ctx = ctx_factory() @@ -2704,6 +1164,77 @@ def test_global_temporary(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) +def test_assign_to_linear_subscript(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl1 = lp.make_kernel( + "{ [i]: 0<=i<n}", + "a[i,i] = 1") + knl2 = lp.make_kernel( + "{ [i]: 0<=i<n}", + "a[[i*n + i]] = 1", + [lp.GlobalArg("a", shape="n,n"), "..."]) + + a1 = cl.array.zeros(queue, (10, 10), np.float32) + knl1(queue, a=a1) + a2 = cl.array.zeros(queue, (10, 10), np.float32) + knl2(queue, a=a2) + + assert np.array_equal(a1.get(), a2.get()) + + +def test_finite_difference_expr_subst(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + grid = np.linspace(0, 2*np.pi, 2048, endpoint=False) + h = grid[1] - grid[0] + u = cl.clmath.sin(cl.array.to_device(queue, grid)) + + fin_diff_knl = lp.make_kernel( + "{[i]: 1<=i<=n}", + "out[i] = -(f[i+1] - f[i-1])/h", + [lp.GlobalArg("out", shape="n+2"), "..."]) + + flux_knl = lp.make_kernel( + "{[j]: 1<=j<=n}", + "f[j] = u[j]**2/2", + [ + lp.GlobalArg("f", shape="n+2"), + lp.GlobalArg("u", shape="n+2"), + ]) + + fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl], + data_flow=[ + ("f", 1, 0) + ]) + + fused_knl = lp.set_options(fused_knl, write_cl=True) + evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1)) + + fused_knl = lp.assignment_to_subst(fused_knl, "f") + + fused_knl = lp.set_options(fused_knl, write_cl=True) + + # This is the real test here: The automatically generated + # shape expressions are '2+n' and the ones above are 'n+2'. + # Is loopy smart enough to understand that these are equal? + evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1)) + + fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i") + + gpu_knl = lp.split_iname( + fused0_knl, "inew", 128, outer_tag="g.0", inner_tag="l.0") + + precomp_knl = lp.precompute( + gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True) + + precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"}) + precomp_knl = lp.set_options(precomp_knl, return_dict=True) + evt, _ = precomp_knl(queue, u=u, h=h) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 3eacbaa2850b12ab1130a0f4b02ac5698bc9fab9..389151f173cc835c0e01f2cbc3c640952cf3c575 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -93,7 +93,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # turn the first reads into subst rules local_prep_var_names = set() for insn in lp.find_instructions(hsv, "tag:local_prep"): - (assignee, _), = insn.assignees_and_indices() + assignee, = insn.assignee_var_names() local_prep_var_names.add(assignee) hsv = lp.assignment_to_subst(hsv, assignee) @@ -122,7 +122,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): ("rknl", rflux_insn, ("j", "n",), rtmps, ("jj", "ii",)), ("sknl", sflux_insn, ("i", "n",), stmps, ("ii", "jj",)), ]: - (flux_var, _), = insn.assignees_and_indices() + flux_var, = insn.assignee_var_names() print(insn) reader, = lp.find_instructions(hsv, diff --git a/test/test_reduction.py b/test/test_reduction.py new file mode 100644 index 0000000000000000000000000000000000000000..b751dd515b5978ba3c9ccedab06016d19e746546 --- /dev/null +++ b/test/test_reduction.py @@ -0,0 +1,496 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import sys +import numpy as np +import loopy as lp +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa +import pytest + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +__all__ = [ + "pytest_generate_tests", + "cl" # 'cl.create_some_context' + ] + + +def test_nonsense_reduction(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i]: 0<=i<100}", + """ + a[i] = sum(i, 2) + """, + [lp.GlobalArg("a", np.float32, shape=(100,))] + ) + + import pytest + with pytest.raises(RuntimeError): + knl = lp.preprocess_kernel(knl, ctx.devices[0]) + + +def test_empty_reduction(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + [ + "{[i]: 0<=i<20}", + "[i] -> {[j]: 0<=j<0}" + ], + "a[i] = sum(j, j)", + ) + + knl = lp.realize_reduction(knl) + print(knl) + + knl = lp.set_options(knl, write_cl=True) + evt, (a,) = knl(queue) + + assert (a.get() == 0).all() + + +def test_nested_dependent_reduction(ctx_factory): + dtype = np.dtype(np.int32) + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + [ + "{[i]: 0<=i<n}", + "{[j]: 0<=j<i+sumlen}" + ], + [ + "<> sumlen = l[i]", + "a[i] = sum(j, j)", + ], + [ + lp.ValueArg("n", np.int32), + lp.GlobalArg("a", dtype, ("n",)), + lp.GlobalArg("l", np.int32, ("n",)), + ]) + + cknl = lp.CompiledKernel(ctx, knl) + + n = 330 + l = np.arange(n, dtype=np.int32) + evt, (a,) = cknl(queue, l=l, n=n, out_host=True) + + tgt_result = (2*l-1)*2*l/2 + assert (a == tgt_result).all() + + +def test_multi_nested_dependent_reduction(ctx_factory): + dtype = np.dtype(np.int32) + ctx = ctx_factory() + + knl = lp.make_kernel( + [ + "{[itgt]: 0 <= itgt < ntgts}", + "{[isrc_box]: 0 <= isrc_box < nboxes}", + "{[isrc]: 0 <= isrc < npart}" + ], + [ + "<> npart = nparticles_per_box[isrc_box]", + "a[itgt] = sum((isrc_box, isrc), 1)", + ], + [ + lp.ValueArg("n", np.int32), + lp.GlobalArg("a", dtype, ("n",)), + lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes",)), + lp.ValueArg("ntgts", np.int32), + lp.ValueArg("nboxes", np.int32), + ], + assumptions="ntgts>=1") + + cknl = lp.CompiledKernel(ctx, knl) + print(cknl.get_code()) + # FIXME: Actually test functionality. + + +def test_recursive_nested_dependent_reduction(ctx_factory): + dtype = np.dtype(np.int32) + ctx = ctx_factory() + + knl = lp.make_kernel( + [ + "{[itgt]: 0 <= itgt < ntgts}", + "{[isrc_box]: 0 <= isrc_box < nboxes}", + "{[isrc]: 0 <= isrc < npart}" + ], + [ + "<> npart = nparticles_per_box[isrc_box]", + "<> boxsum = sum(isrc, isrc+isrc_box+itgt)", + "a[itgt] = sum(isrc_box, boxsum)", + ], + [ + lp.ValueArg("n", np.int32), + lp.GlobalArg("a", dtype, ("n",)), + lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes",)), + lp.ValueArg("ntgts", np.int32), + lp.ValueArg("nboxes", np.int32), + ], + assumptions="ntgts>=1") + + cknl = lp.CompiledKernel(ctx, knl) + print(cknl.get_code()) + # FIXME: Actually test functionality. + + +@pytest.mark.parametrize("size", [128, 5, 113, 67]) +def test_local_parallel_reduction(ctx_factory, size): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i, j]: 0 <= i < n and 0 <= j < 5}", + """ + z[j] = sum(i, i+j) + """) + + knl = lp.fix_parameters(knl, n=size) + + ref_knl = knl + + def variant0(knl): + return lp.tag_inames(knl, "i:l.0") + + def variant1(knl): + return lp.tag_inames(knl, "i:l.0,j:l.1") + + def variant2(knl): + return lp.tag_inames(knl, "i:l.0,j:g.0") + + for variant in [ + variant0, + variant1, + variant2 + ]: + knl = variant(ref_knl) + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +# FIXME: Make me a test +@pytest.mark.parametrize("size", [10000]) +def no_test_global_parallel_reduction(ctx_factory, size): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i]: 0 <= i < n }", + """ + <> key = make_uint2(i, 324830944) {inames=i} + <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr} + <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr} + z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3) + """) + + # ref_knl = knl + + gsize = 128 + knl = lp.split_iname(knl, "i", gsize * 20) + knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0") + knl = lp.split_reduction_inward(knl, "i_inner_inner") + knl = lp.split_reduction_inward(knl, "i_inner_outer") + from loopy.transform.data import reduction_arg_to_subst_rule + knl = reduction_arg_to_subst_rule(knl, "i_outer") + knl = lp.precompute(knl, "red_i_outer_arg", "i_outer") + print(knl) + 1/0 + knl = lp.realize_reduction(knl) + + evt, (z,) = knl(queue, n=size) + + #lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +@pytest.mark.parametrize("size", [10000]) +def test_global_parallel_reduction_simpler(ctx_factory, size): + ctx = ctx_factory() + + pytest.xfail("very sensitive to kernel ordering, fails unused hw-axis check") + + knl = lp.make_kernel( + "{[l,g,j]: 0 <= l < nl and 0 <= g,j < ng}", + """ + <> key = make_uint2(l+nl*g, 1234) {inames=l:g} + <> ctr = make_uint4(0, 1, 2, 3) {inames=l:g,id=init_ctr} + <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr} + + <> tmp[g] = sum(l, vals.s0 + 1j*vals.s1 + vals.s2 + 1j*vals.s3) + + result = sum(j, tmp[j]) + """) + + ng = 50 + knl = lp.fix_parameters(knl, ng=ng) + + knl = lp.set_options(knl, write_cl=True) + + ref_knl = knl + + knl = lp.split_iname(knl, "l", 128, inner_tag="l.0") + knl = lp.split_reduction_outward(knl, "l_inner") + knl = lp.tag_inames(knl, "g:g.0,j:l.0") + + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nl": size}) + + +def test_argmax(ctx_factory): + logging.basicConfig(level=logging.INFO) + + dtype = np.dtype(np.float32) + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 10000 + + knl = lp.make_kernel( + "{[i]: 0<=i<%d}" % n, + """ + max_val, max_idx = argmax(i, fabs(a[i])) + """) + + knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + print(lp.preprocess_kernel(knl)) + knl = lp.set_options(knl, write_cl=True, highlight_cl=True) + + a = np.random.randn(10000).astype(dtype) + evt, (max_idx, max_val) = knl(queue, a=a, out_host=True) + assert max_val == np.max(np.abs(a)) + assert max_idx == np.where(np.abs(a) == max_val)[-1] + + +def test_simul_reduce(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 20 + + knl = lp.make_kernel( + "{[i,j]: 0<=i,j<n }", + [ + "a = simul_reduce(sum, (i,j), i*j)", + "b = simul_reduce(sum, i, simul_reduce(sum, j, i*j))", + ], + assumptions="n>=1") + + evt, (a, b) = knl(queue, n=n) + + ref = sum(i*j for i in range(n) for j in range(n)) + assert a.get() == ref + assert b.get() == ref + + +@pytest.mark.parametrize(("op_name", "np_op"), [ + ("sum", np.sum), + ("product", np.prod), + ("min", np.min), + ("max", np.max), + ]) +def test_reduction_library(ctx_factory, op_name, np_op): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i,j]: 0<=i<n and 0<=j<m }", + [ + "res[i] = reduce(%s, j, a[i,j])" % op_name, + ], + assumptions="n>=1") + + a = np.random.randn(20, 10) + evt, (res,) = knl(queue, a=a) + + assert np.allclose(res, np_op(a, axis=1)) + + +def test_split_reduction(ctx_factory): + knl = lp.make_kernel( + "{[i,j,k]: 0<=i,j,k<n}", + """ + b = sum((i,j,k), a[i,j,k]) + """, + [ + lp.GlobalArg("box_source_starts,box_source_counts_nonchild,a", + None, shape=None), + "..."]) + + knl = lp.split_reduction_outward(knl, "j,k") + # FIXME: finish test + + +def test_double_sum_made_unique(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 20 + + knl = lp.make_kernel( + "{[i,j]: 0<=i,j<n }", + [ + "a = sum((i,j), i*j)", + "b = sum(i, sum(j, i*j))", + ], + assumptions="n>=1") + + knl = lp.make_reduction_inames_unique(knl) + print(knl) + + evt, (a, b) = knl(queue, n=n) + + ref = sum(i*j for i in range(n) for j in range(n)) + assert a.get() == ref + assert b.get() == ref + + +def test_fd_demo(): + knl = lp.make_kernel( + "{[i,j]: 0<=i,j<n}", + "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \ + + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \ + + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]") + #assumptions="n mod 16=0") + knl = lp.split_iname(knl, + "i", 16, outer_tag="g.1", inner_tag="l.1") + knl = lp.split_iname(knl, + "j", 16, outer_tag="g.0", inner_tag="l.0") + knl = lp.add_prefetch(knl, "u", + ["i_inner", "j_inner"], + fetch_bounding_box=True) + + #n = 1000 + #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32) + + knl = lp.set_options(knl, write_cl=True) + knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32)) + code, inf = lp.generate_code(knl) + print(code) + + assert "double" not in code + + +def test_fd_1d(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + "result[i] = u[i+1]-u[i]") + + knl = lp.add_and_infer_dtypes(knl, {"u": np.float32}) + ref_knl = knl + + knl = lp.split_iname(knl, "i", 16) + knl = lp.extract_subst(knl, "u_acc", "u[j]", parameters="j") + knl = lp.precompute(knl, "u_acc", "i_inner", default_tag="for") + knl = lp.assume(knl, "n mod 16 = 0") + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=2048)) + + +def test_poisson_fem(ctx_factory): + # Stolen from Peter Coogan and Rob Kirby for FEM assembly + ctx = ctx_factory() + + nbf = 5 + nqp = 5 + sdim = 3 + + knl = lp.make_kernel( + "{ [c,i,j,k,ell,ell2,ell3]: \ + 0 <= c < nels and \ + 0 <= i < nbf and \ + 0 <= j < nbf and \ + 0 <= k < nqp and \ + 0 <= ell,ell2 < sdim}", + """ + dpsi(bf,k0,dir) := \ + simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] ) + Ael[c,i,j] = \ + J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell)) + """, + assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0") + + print(knl) + + knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp) + + ref_knl = knl + + knl = lp.set_loop_priority(knl, ["c", "j", "i", "k"]) + + def variant_1(knl): + knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for') + knl = lp.set_loop_priority(knl, "c,i,j") + return knl + + def variant_2(knl): + knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for') + knl = lp.set_loop_priority(knl, "c,i,j") + return knl + + def add_types(knl): + return lp.add_and_infer_dtypes(knl, dict( + w=np.float32, + J=np.float32, + DPsi=np.float32, + DFinv=np.float32, + )) + + for variant in [ + #variant_1, + variant_2 + ]: + knl = variant(knl) + + lp.auto_test_vs_ref( + add_types(ref_knl), ctx, add_types(knl), + parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7)) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_target.py b/test/test_target.py new file mode 100644 index 0000000000000000000000000000000000000000..67204d9c1fbc02624eebf230b87d33c114c21ef1 --- /dev/null +++ b/test/test_target.py @@ -0,0 +1,203 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import sys +import numpy as np +import loopy as lp +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa +import pytest + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +__all__ = [ + "pytest_generate_tests", + "cl" # 'cl.create_some_context' + ] + + +def test_ispc_target(occa_mode=False): + from loopy.target.ispc import ISPCTarget + + knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + "out[i] = 2*a[i]", + [ + lp.GlobalArg("out,a", np.float32, shape=lp.auto), + "..." + ], + target=ISPCTarget(occa_mode=occa_mode)) + + knl = lp.split_iname(knl, "i", 8, inner_tag="l.0") + knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") + knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"]) + + codegen_result = lp.generate_code_v2( + lp.get_one_scheduled_kernel( + lp.preprocess_kernel(knl))) + + print(codegen_result.device_code()) + print(codegen_result.host_code()) + + +def test_cuda_target(): + from loopy.target.cuda import CudaTarget + + knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + "out[i] = 2*a[i]", + [ + lp.GlobalArg("out,a", np.float32, shape=lp.auto), + "..." + ], + target=CudaTarget()) + + knl = lp.split_iname(knl, "i", 8, inner_tag="l.0") + knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") + knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"]) + + print( + lp.generate_code( + lp.get_one_scheduled_kernel( + lp.preprocess_kernel(knl)))[0]) + + +def test_generate_c_snippet(): + from loopy.target.c import CTarget + + from pymbolic import var + I = var("I") # noqa + f = var("f") + df = var("df") + q_v = var("q_v") + eN = var("eN") # noqa + k = var("k") + u = var("u") + + from functools import partial + l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True) + + Instr = lp.Assignment # noqa + + knl = lp.make_kernel( + "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}", + [ + Instr(f[I], l_sum(k, q_v[k, I]*u)), + Instr(df[I], l_sum(k, q_v[k, I])), + ], + [ + lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"), + lp.GlobalArg("f,df", np.float64, shape="nSpace"), + lp.ValueArg("u", np.float64), + "...", + ], + target=CTarget(), + assumptions="nQuad>=1") + + if 0: # enable to play with prefetching + # (prefetch currently requires constant sizes) + knl = lp.fix_parameters(knl, nQuad=5, nSpace=3) + knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None) + + knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1)) + knl = lp.set_loop_priority(knl, "I,k_outer,k_inner") + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + print(lp.generate_body(knl)) + + +@pytest.mark.parametrize("tp", ["f32", "f64"]) +def test_random123(ctx_factory, tp): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + import pyopencl.version # noqa + if cl.version.VERSION < (2016, 2): + pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2") + + n = 150000 + + knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + """ + <> key2 = make_uint2(i, 324830944) {inames=i} + <> key4 = make_uint4(i, 324830944, 234181, 2233) {inames=i} + <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr} + <> real, ctr = philox4x32_TYPE(ctr, key2) {dep=init_ctr} + <> imag, ctr = threefry4x32_TYPE(ctr, key4) {dep=init_ctr} + + out[i, 0] = real.s0 + 1j * imag.s0 + out[i, 1] = real.s1 + 1j * imag.s1 + out[i, 2] = real.s2 + 1j * imag.s2 + out[i, 3] = real.s3 + 1j * imag.s3 + """.replace("TYPE", tp)) + + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + knl = lp.set_options(knl, write_cl=True) + + evt, (out,) = knl(queue, n=n) + + out = out.get() + assert (out < 1).all() + assert (0 <= out).all() + + +def test_clamp(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 15 * 10**6 + x = cl.clrandom.rand(queue, n, dtype=np.float32) + + knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + "out[i] = clamp(x[i], a, b)") + + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + knl = lp.set_options(knl, write_cl=True) + + evt, (out,) = knl(queue, x=x, a=np.float32(12), b=np.float32(15)) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_transform.py b/test/test_transform.py new file mode 100644 index 0000000000000000000000000000000000000000..cf9564274d12b2889d5010bbd807c4bd94fa149f --- /dev/null +++ b/test/test_transform.py @@ -0,0 +1,412 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import sys +import numpy as np +import loopy as lp +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa +import pytest + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +__all__ = [ + "pytest_generate_tests", + "cl" # 'cl.create_some_context' + ] + + +def test_chunk_iname(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + "out[i] = 2*a[i]", + [ + lp.GlobalArg("out,a", np.float32, shape=lp.auto), + "..." + ], + assumptions="n>0") + + ref_knl = knl + knl = lp.chunk_iname(knl, "i", 3, inner_tag="l.0") + knl = lp.set_loop_priority(knl, "i_outer, i_inner") + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=130)) + + +def test_collect_common_factors(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i,j,k]: 0<=i,j<n}", + """ + <float32> out_tmp = 0 {id=out_init,inames=i} + out_tmp = out_tmp + alpha[i]*a[i,j]*b1[j] {id=out_up1,dep=out_init} + out_tmp = out_tmp + alpha[i]*a[j,i]*b2[j] {id=out_up2,dep=out_init} + out[i] = out_tmp {dep=out_up1:out_up2} + """) + knl = lp.add_and_infer_dtypes(knl, + dict(a=np.float32, alpha=np.float32, b1=np.float32, b2=np.float32)) + + ref_knl = knl + + knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", inner_tag="l.0") + knl = lp.collect_common_factors_on_increment(knl, "out_tmp") + + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=13)) + + +def test_to_batched(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + ''' { [i,j]: 0<=i,j<n } ''', + ''' out[i] = sum(j, a[i,j]*x[j])''') + + bknl = lp.to_batched(knl, "nbatches", "out,x") + + a = np.random.randn(5, 5) + x = np.random.randn(7, 5) + + bknl(queue, a=a, x=x) + + +def test_rename_argument(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + kernel = lp.make_kernel( + '''{ [i]: 0<=i<n }''', + '''out[i] = a + 2''') + + kernel = lp.rename_argument(kernel, "a", "b") + + evt, (out,) = kernel(queue, b=np.float32(12), n=20) + + assert (np.abs(out.get() - 14) < 1e-8).all() + + +def test_fusion(): + exp_kernel = lp.make_kernel( + ''' { [i]: 0<=i<n } ''', + ''' exp[i] = pow(E, z[i])''', + assumptions="n>0") + + sum_kernel = lp.make_kernel( + '{ [j]: 0<=j<n }', + 'out2 = sum(j, exp[j])', + assumptions='n>0') + + knl = lp.fuse_kernels([exp_kernel, sum_kernel]) + + print(knl) + + +def test_alias_temporaries(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + """ + times2(i) := 2*a[i] + times3(i) := 3*a[i] + times4(i) := 4*a[i] + + x[i] = times2(i) + y[i] = times3(i) + z[i] = times4(i) + """) + + knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + + ref_knl = knl + + knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") + + knl = lp.precompute(knl, "times2", "i_inner") + knl = lp.precompute(knl, "times3", "i_inner") + knl = lp.precompute(knl, "times4", "i_inner") + + knl = lp.alias_temporaries(knl, ["times2_0", "times3_0", "times4_0"]) + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=30)) + + +def test_vectorize(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + """ + <> temp = 2*b[i] + a[i] = temp + """) + knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32)) + knl = lp.set_array_dim_names(knl, "a,b", "i") + knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4, + split_kwargs=dict(slabs=(0, 1))) + + knl = lp.tag_data_axes(knl, "a,b", "c,vec") + ref_knl = knl + ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"}) + + knl = lp.tag_inames(knl, {"i_inner": "vec"}) + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + code, inf = lp.generate_code(knl) + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=30)) + + +def test_extract_subst(ctx_factory): + knl = lp.make_kernel( + "{[i]: 0<=i<n}", + """ + a[i] = 23*b[i]**2 + 25*b[i]**2 + """) + + knl = lp.extract_subst(knl, "bsquare", "alpha*b[i]**2", "alpha") + + print(knl) + + from loopy.symbolic import parse + + insn, = knl.instructions + assert insn.expression == parse("bsquare(23) + bsquare(25)") + + +def test_join_inames(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i,j]: 0<=i,j<16}", + [ + "b[i,j] = 2*a[i,j]" + ], + [ + lp.GlobalArg("a", np.float32, shape=(16, 16,)), + lp.GlobalArg("b", np.float32, shape=(16, 16,)) + ], + ) + + ref_knl = knl + + knl = lp.add_prefetch(knl, "a", sweep_inames=["i", "j"]) + knl = lp.join_inames(knl, ["a_dim_0", "a_dim_1"]) + + lp.auto_test_vs_ref(ref_knl, ctx, knl, print_ref_code=True) + + +def test_tag_data_axes(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{ [i,j,k]: 0<=i,j,k<n }", + "out[i,j,k] = 15") + + ref_knl = knl + + with pytest.raises(lp.LoopyError): + lp.tag_data_axes(knl, "out", "N1,N0,N5") + + with pytest.raises(lp.LoopyError): + lp.tag_data_axes(knl, "out", "N1,N0,c") + + knl = lp.tag_data_axes(knl, "out", "N1,N0,N2") + knl = lp.tag_inames(knl, dict(j="g.0", i="g.1")) + + lp.auto_test_vs_ref(ref_knl, ctx, knl, + parameters=dict(n=20)) + + +def test_set_arg_order(): + knl = lp.make_kernel( + "{ [i,j]: 0<=i,j<n }", + "out[i,j] = a[i]*b[j]") + + knl = lp.set_argument_order(knl, "out,a,n,b") + + +def test_affine_map_inames(): + knl = lp.make_kernel( + "{[e, i,j,n]: 0<=e<E and 0<=i,j,n<N}", + "rhsQ[e, n+i, j] = rhsQ[e, n+i, j] - D[i, n]*x[i,j]") + + knl = lp.affine_map_inames(knl, + "i", "i0", + "i0 = n+i") + + print(knl) + + +def test_precompute_confusing_subst_arguments(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i,j]: 0<=i<n and 0<=j<5}", + """ + D(i):=a[i+1]-a[i] + b[i,j] = D(j) + """) + + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) + + ref_knl = knl + + knl = lp.tag_inames(knl, dict(j="g.1")) + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + + from loopy.symbolic import get_dependencies + assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression) + knl = lp.precompute(knl, "D") + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=12345)) + + +def test_precompute_nested_subst(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i,j]: 0<=i<n and 0<=j<5}", + """ + E:=a[i] + D:=E*E + b[i] = D + """) + + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) + + ref_knl = knl + + knl = lp.tag_inames(knl, dict(j="g.1")) + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + + from loopy.symbolic import get_dependencies + assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression) + knl = lp.precompute(knl, "D", "i_inner") + + # There's only one surviving 'E' rule. + assert len([ + rule_name + for rule_name in knl.substitutions + if rule_name.startswith("E")]) == 1 + + # That rule should use the newly created prefetch inames, + # not the prior 'i_inner' + assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression) + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(n=12345)) + + +def test_precompute_with_preexisting_inames(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}", + """ + result[e,i] = sum(j, D1[i,j]*u[e,j]) + result2[e,i] = sum(k, D2[i,k]*u[e,k]) + """) + + knl = lp.add_and_infer_dtypes(knl, { + "u": np.float32, + "D1": np.float32, + "D2": np.float32, + }) + + knl = lp.fix_parameters(knl, n=13) + + ref_knl = knl + + knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj") + knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj") + + knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for", + precompute_inames="ii,jj") + knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for", + precompute_inames="ii,jj") + + knl = lp.set_loop_priority(knl, "ii,jj,e,j,k") + + lp.auto_test_vs_ref( + ref_knl, ctx, knl, + parameters=dict(E=200)) + + +def test_precompute_with_preexisting_inames_fail(): + knl = lp.make_kernel( + "{[e,i,j,k]: 0<=e<E and 0<=i,j<n and 0<=k<2*n}", + """ + result[e,i] = sum(j, D1[i,j]*u[e,j]) + result2[e,i] = sum(k, D2[i,k]*u[e,k]) + """) + + knl = lp.add_and_infer_dtypes(knl, { + "u": np.float32, + "D1": np.float32, + "D2": np.float32, + }) + + knl = lp.fix_parameters(knl, n=13) + + knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj") + knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj") + + knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for", + precompute_inames="ii,jj") + with pytest.raises(lp.LoopyError): + lp.precompute(knl, "D2_subst", "i,k", default_tag="for", + precompute_inames="ii,jj") + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) + +# vim: foldmethod=marker