diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index c490abb6ed1635c135fc77468f27cd833b1d57b2..6224d9709f5f796f84c3fd177125e0703d92d173 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -221,7 +221,7 @@ def generate_call_code(codegen_state, insn):
 
     if codegen_state.vectorization_info:
         if insn.atomicity:
-            raise Unvectorizable("function call")
+            raise Unvectorizable("atomic operation")
 
     # }}}
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 1e5c36fcf4bc5c8a1fa71f59d088d2612310f36a..4477f5bafc7dab867af63d5152f9cbdec12a0dda 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -440,7 +440,8 @@ class InstructionBase(ImmutableRecord):
 
         from loopy.tools import intern_frozenset_of_ids
 
-        self.id = intern(self.id)
+        if self.id is not None:
+            self.id = intern(self.id)
         self.depends_on = intern_frozenset_of_ids(self.depends_on)
         self.groups = intern_frozenset_of_ids(self.groups)
         self.conflicts_with_groups = (
@@ -993,6 +994,13 @@ class CallInstruction(MultiAssignmentBase):
         else:
             key_builder.rec(key_hash, getattr(self, field_name))
 
+    @property
+    def atomicity(self):
+        # Function calls cannot be atomic, and even the result assignment
+        # is troublesome, especially in the case of multiple results. Avoid
+        # the issue altogether by disallowing atomicity.
+        return ()
+
 # }}}
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 7545070261a9be7a8b5e3e5dde4fdce3e815f44a..85a8da936469e97577af742bf39286acb0188206 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -527,7 +527,9 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False):
                 lines.append("%s -> %s" % (insn_2, insn_1))
 
     if iname_cluster:
-        from loopy.schedule import EnterLoop, LeaveLoop, RunInstruction, Barrier
+        from loopy.schedule import (
+                EnterLoop, LeaveLoop, RunInstruction, Barrier,
+                CallKernel, ReturnFromKernel)
 
         for sched_item in kernel.schedule:
             if isinstance(sched_item, EnterLoop):
@@ -537,7 +539,7 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False):
                 lines.append("}")
             elif isinstance(sched_item, RunInstruction):
                 lines.append(sched_item.insn_id)
-            elif isinstance(sched_item, Barrier):
+            elif isinstance(sched_item, (CallKernel, ReturnFromKernel, Barrier)):
                 pass
             else:
                 raise LoopyError("schedule item not understood: %r" % sched_item)
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 6b5488a20bc9d714fb5fde908b559ddebf4b9591..db7792cce55c9b7851850c3059e821fc574c3270 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -738,52 +738,43 @@ def find_idempotence(kernel):
             (insn.id, insn.read_dependency_names() & var_names)
             for insn in kernel.instructions)
 
-    non_idempotently_updated_vars = set()
-
-    # FIXME: This can be made more efficient by simply starting
-    # from all written variables and not even considering
-    # instructions as the start of the first pass.
+    from collections import defaultdict
+    dep_graph = defaultdict(lambda: set())
 
-    new_insns = []
     for insn in kernel.instructions:
-        all_my_var_writers = set()
-        for var in reads_map[insn.id]:
-            var_writers = writer_map.get(var, set())
-            all_my_var_writers |= var_writers
+        dep_graph[insn.id] = set(writer_id
+                for var in reads_map[insn.id]
+                for writer_id in writer_map.get(var, set()))
 
-        # {{{ find dependency loops, flag boostability
+    # Find SCCs of dep_graph. These are used to check whether an instruction
+    # is part of a dependency cycle.
+    from loopy.tools import compute_sccs
 
-        while True:
-            last_all_my_var_writers = all_my_var_writers
+    sccs = dict((item, scc)
+            for scc in compute_sccs(dep_graph)
+            for item in scc)
 
-            for writer_insn_id in last_all_my_var_writers:
-                for var in reads_map[writer_insn_id]:
-                    all_my_var_writers = \
-                            all_my_var_writers | writer_map.get(var, set())
-
-            if last_all_my_var_writers == all_my_var_writers:
-                break
-
-        # }}}
+    non_idempotently_updated_vars = set()
 
-        boostable = insn.id not in all_my_var_writers
+    new_insns = []
+    for insn in kernel.instructions:
+        boostable = len(sccs[insn.id]) == 1 and insn.id not in dep_graph[insn.id]
 
         if not boostable:
             non_idempotently_updated_vars.update(
                     insn.assignee_var_names())
 
-        insn = insn.copy(boostable=boostable)
-
-        new_insns.append(insn)
+        new_insns.append(insn.copy(boostable=boostable))
 
     # {{{ remove boostability from insns that access non-idempotently updated vars
 
     new2_insns = []
     for insn in new_insns:
-        accessed_vars = insn.dependency_names()
-        boostable = insn.boostable and not bool(
-                non_idempotently_updated_vars & accessed_vars)
-        new2_insns.append(insn.copy(boostable=boostable))
+        if insn.boostable and bool(
+                non_idempotently_updated_vars & insn.dependency_names()):
+            new2_insns.append(insn.copy(boostable=False))
+        else:
+            new2_insns.append(insn)
 
     # }}}
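
Aside (editorial illustration, not part of the patch): the rewritten find_idempotence above replaces the per-instruction fixed-point search over writer sets with a single SCC computation on the instruction dependency graph. A minimal standalone sketch of the new boostability criterion; the instruction ids and edges below are invented:

    from loopy.tools import compute_sccs

    # insn id -> ids of the instructions that write the variables it reads
    dep_graph = {
        "init": set(),
        "update": {"init", "update"},  # reads its own output: a self-cycle
        "use": {"update"},
    }

    # Same SCC lookup construction as in find_idempotence.
    sccs = dict((item, scc)
            for scc in compute_sccs(dep_graph)
            for item in scc)

    for insn_id in dep_graph:
        # Boostable iff the instruction is neither part of a multi-instruction
        # SCC nor directly self-dependent -- the criterion in the hunk above.
        print(insn_id,
                len(sccs[insn_id]) == 1 and insn_id not in dep_graph[insn_id])
    # init True, update False, use True
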
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index bf51147b516f84e247c7ab94c103e4c202072a9b..22efee0f2c8730a87f813cfb2d0581e7bb8514f5 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -1640,7 +1640,6 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0):
                         barrier_kind_more_or_equally_global(
                             sub_sched_item.kind, kind)):
 
-                    seen_barrier()
                     last_barrier_index = j
                     if first_barrier_index is None:
                         first_barrier_index = j
@@ -1683,6 +1682,7 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0):
             if last_barrier_index is None:
                 candidates.update(insn_ids_from_schedule(subresult))
             else:
+                seen_barrier()
                 candidates.update(
                         insn_ids_from_schedule(
                             subresult[last_barrier_index+1:]))
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index 61e8e4f396126e17123c1bf775dbfeee2fe21f0d..a8f47adb991e331f8a473c4eb14b1ea634c7a3b1 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -669,8 +669,6 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
 
         import pyopencl as cl
 
-        logger.info("%s: opencl compilation start" % self.kernel.name)
-
         cl_program = (
                 cl.Program(self.context, dev_code)
                 .build(options=kernel.options.cl_build_options))
@@ -679,8 +677,6 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
         for dp in codegen_result.device_programs:
             setattr(cl_kernels, dp.name, getattr(cl_program, dp.name))
 
-        logger.info("%s: opencl compilation done" % self.kernel.name)
-
         return _CLKernelInfo(
                 kernel=kernel,
                 cl_kernels=cl_kernels,
diff --git a/loopy/tools.py b/loopy/tools.py
index ae370d5aaac9ff75f530e1d0951a2f904b686e42..01d0641fc25c11a092185125604613819a0293ca 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -281,6 +281,65 @@ def empty_aligned(shape, dtype, order='C', n=64):
 
 # }}}
 
+
+# {{{ compute SCCs with Tarjan's algorithm
+
+def compute_sccs(graph):
+    to_search = set(graph.keys())
+    visit_order = {}
+    scc_root = {}
+    sccs = []
+
+    while to_search:
+        top = next(iter(to_search))
+        call_stack = [(top, iter(graph[top]), None)]
+        visit_stack = []
+        visiting = set()
+
+        scc = []
+
+        while call_stack:
+            top, children, last_popped_child = call_stack.pop()
+
+            if top not in visiting:
+                # Unvisited: mark as visited, initialize SCC root.
+                count = len(visit_order)
+                visit_stack.append(top)
+                visit_order[top] = count
+                scc_root[top] = count
+                visiting.add(top)
+                to_search.discard(top)
+
+            # Returned from a recursion: update the SCC root.
+            if last_popped_child is not None:
+                scc_root[top] = min(
+                    scc_root[top],
+                    scc_root[last_popped_child])
+
+            for child in children:
+                if child not in visit_order:
+                    # Recurse.
+                    call_stack.append((top, children, child))
+                    call_stack.append((child, iter(graph[child]), None))
+                    break
+                if child in visiting:
+                    scc_root[top] = min(
+                        scc_root[top],
+                        visit_order[child])
+            else:
+                if scc_root[top] == visit_order[top]:
+                    scc = []
+                    while visit_stack[-1] != top:
+                        scc.append(visit_stack.pop())
+                    scc.append(visit_stack.pop())
+                    for item in scc:
+                        visiting.remove(item)
+                    sccs.append(scc)
+
+    return sccs
+
+# }}}
+
+
 def is_interned(s):
     return s is None or intern(s) is s
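
Aside (editorial illustration, not part of the patch): a quick usage sketch for compute_sccs; the graph below is invented. A graph maps each node to the set of its successors, and SCCs come back dependencies-first, i.e. every SCC reachable from a given SCC appears before it in the result:

    from loopy.tools import compute_sccs

    graph = {
        0: {1},
        1: {2},
        2: {0},  # 0 -> 1 -> 2 -> 0 forms a cycle
        3: {1},  # 3 hangs off the cycle
    }

    print(compute_sccs(graph))
    # Prints something like [[2, 1, 0], [3]]. The starting node and the node
    # order within an SCC are unspecified, but the cycle always precedes [3],
    # since [3] can reach it.
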
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index a31f011a0ce8e5403b54984eb45db0970a8370b0..99a16bfc23341dba3d28c71038681c31d3e00dba 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -112,32 +112,28 @@ class TypeInferenceMapper(CombineMapper):
                 0 <= len(dtype_set) <= 1
                 for dtype_set in dtype_sets)
 
-        if not all(
-                isinstance(dtype, NumpyType)
+        from pytools import is_single_valued
+
+        dtypes = [dtype
                 for dtype_set in dtype_sets
-                for dtype in dtype_set):
-            from pytools import is_single_valued, single_valued
-            if not is_single_valued(
-                    dtype
-                    for dtype_set in dtype_sets
-                    for dtype in dtype_set):
+                for dtype in dtype_set]
+
+        if not all(isinstance(dtype, NumpyType) for dtype in dtypes):
+            if not is_single_valued(dtypes):
                 raise TypeInferenceFailure(
                         "Nothing known about operations between '%s'"
-                        % ", ".join(str(dtype)
-                            for dtype_set in dtype_sets
-                            for dtype in dtype_set))
+                        % ", ".join(str(dtype) for dtype in dtypes))
 
-            return single_valued(dtype
-                    for dtype_set in dtype_sets
-                    for dtype in dtype_set)
+            return [dtypes[0]]
 
-        numpy_dtypes = [dtype.dtype
-                for dtype_set in dtype_sets
-                for dtype in dtype_set]
+        numpy_dtypes = [dtype.dtype for dtype in dtypes]
 
         if not numpy_dtypes:
             return []
 
+        if is_single_valued(numpy_dtypes):
+            return [dtypes[0]]
+
         result = numpy_dtypes.pop()
         while numpy_dtypes:
             other = numpy_dtypes.pop()
@@ -179,7 +175,6 @@ class TypeInferenceMapper(CombineMapper):
             else:
                 dtype_sets.append(dtype_set)
 
-        from pytools import all
         if all(dtype.is_integral()
                 for dtype_set in dtype_sets
                 for dtype in dtype_set):
@@ -462,6 +457,9 @@ def infer_unknown_types(kernel, expect_completion=False):
 
     logger.debug("%s: infer types" % kernel.name)
 
+    import time
+    start_time = time.time()
+
    def debug(s):
         logger.debug("%s: %s" % (kernel.name, s))
 
@@ -489,6 +487,27 @@
 
     # }}}
 
+    logger.debug("finding types for {count:d} names".format(
+        count=len(names_for_type_inference)))
+
+    writer_map = kernel.writer_map()
+
+    dep_graph = dict(
+            (written_var, set(
+                read_var
+                for insn_id in writer_map.get(written_var, [])
+                for read_var in kernel.id_to_insn[insn_id].read_dependency_names()
+                if read_var in names_for_type_inference))
+            for written_var in names_for_type_inference)
+
+    from loopy.tools import compute_sccs
+
+    # To speed up processing, we sort the variables by computing the SCCs of
+    # the type dependency graph. Each SCC represents a set of variables whose
+    # types mutually depend on one another. The SCCs are returned and
+    # processed in topological order.
+    sccs = compute_sccs(dep_graph)
+
     item_lookup = _DictUnionView([
             new_temp_vars,
             new_arg_dict
@@ -502,75 +521,89 @@
 
     from loopy.kernel.data import TemporaryVariable, KernelArgument
 
-    changed_during_last_queue_run = False
-    queue = names_for_type_inference[:]
-
-    failed_names = set()
-    while queue or changed_during_last_queue_run:
-        if not queue and changed_during_last_queue_run:
-            changed_during_last_queue_run = False
-            queue = names_for_type_inference[:]
-
-        name = queue.pop(0)
-        item = item_lookup[name]
-
-        debug("inferring type for %s %s" % (type(item).__name__, item.name))
-
-        result, symbols_with_unavailable_types = \
-                _infer_var_type(kernel, item.name, type_inf_mapper, subst_expander)
-
-        failed = not result
-        if not failed:
-            new_dtype, = result
-            debug("     success: %s" % new_dtype)
-            if new_dtype != item.dtype:
-                debug("     changed from: %s" % item.dtype)
-                changed_during_last_queue_run = True
+    for var_chain in sccs:
+        changed_during_last_queue_run = False
+        queue = var_chain[:]
+        failed_names = set()
+
+        while queue or changed_during_last_queue_run:
+            if not queue and changed_during_last_queue_run:
+                changed_during_last_queue_run = False
+                # Optimization: If there's a single variable in the SCC without
+                # a self-referential dependency, then the type is known after a
+                # single iteration (we don't need to look at the expressions
+                # again).
+                if len(var_chain) == 1:
+                    single_var, = var_chain
+                    if single_var not in dep_graph[single_var]:
+                        break
+                queue = var_chain[:]
+
+            name = queue.pop(0)
+            item = item_lookup[name]
+
+            debug("inferring type for %s %s" % (type(item).__name__, item.name))
+
+            result, symbols_with_unavailable_types = (
+                    _infer_var_type(
+                        kernel, item.name, type_inf_mapper, subst_expander))
+
+            failed = not result
+            if not failed:
+                new_dtype, = result
+                debug("     success: %s" % new_dtype)
+                if new_dtype != item.dtype:
+                    debug("     changed from: %s" % item.dtype)
+                    changed_during_last_queue_run = True
+
+                if isinstance(item, TemporaryVariable):
+                    new_temp_vars[name] = item.copy(dtype=new_dtype)
+                elif isinstance(item, KernelArgument):
+                    new_arg_dict[name] = item.copy(dtype=new_dtype)
+                else:
+                    raise LoopyError("unexpected item type in type inference")
+            else:
+                debug("     failure")
+
+            if failed:
+                if item.name in failed_names:
+                    # this item has failed before, give up.
+                    advice = ""
+                    if symbols_with_unavailable_types:
+                        advice += (
+                                " (need type of '%s'--check for missing arguments)"
+                                % ", ".join(symbols_with_unavailable_types))
+
+                    if expect_completion:
+                        raise LoopyError(
+                                "could not determine type of '%s'%s"
+                                % (item.name, advice))
+
+                    else:
+                        # We're done here.
+                        break
 
-            if isinstance(item, TemporaryVariable):
-                new_temp_vars[name] = item.copy(dtype=new_dtype)
-            elif isinstance(item, KernelArgument):
-                new_arg_dict[name] = item.copy(dtype=new_dtype)
-            else:
-                raise LoopyError("unexpected item type in type inference")
-        else:
-            debug("     failure")
-
-        if failed:
-            if item.name in failed_names:
-                # this item has failed before, give up.
-                advice = ""
-                if symbols_with_unavailable_types:
-                    advice += (
-                            " (need type of '%s'--check for missing arguments)"
-                            % ", ".join(symbols_with_unavailable_types))
-
-                if expect_completion:
-                    raise LoopyError(
-                            "could not determine type of '%s'%s"
-                            % (item.name, advice))
+                # remember that this item failed
+                failed_names.add(item.name)
 
-                else:
-                    # We're done here.
+                if set(queue) == failed_names:
+                    # We did what we could...
+                    print(queue, failed_names, item.name)
+                    assert not expect_completion
                     break
 
-            # remember that this item failed
-            failed_names.add(item.name)
-
-            if set(queue) == failed_names:
-                # We did what we could...
-                print(queue, failed_names, item.name)
-                assert not expect_completion
-                break
-
-            # can't infer type yet, put back into queue
-            queue.append(name)
-        else:
-            # we've made progress, reset failure markers
-            failed_names = set()
+                # can't infer type yet, put back into queue
+                queue.append(name)
+            else:
+                # we've made progress, reset failure markers
+                failed_names = set()
 
     # }}}
 
+    end_time = time.time()
+    logger.debug("type inference took {dur:.2f} seconds".format(
+        dur=end_time - start_time))
+
     return unexpanded_kernel.copy(
             temporary_variables=new_temp_vars,
             args=[new_arg_dict[arg.name] for arg in kernel.args],
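
Aside (editorial illustration, not generated by the patch): to make the SCC-driven ordering concrete, here is a hand-built version of the dep_graph that infer_unknown_types constructs for the kernel in the new test test_type_inference_with_type_dependencies below:

    from loopy.tools import compute_sccs

    # written variable -> variables whose types its own type depends on
    dep_graph = {
        "a": {"a"},       # a = a + 1
        "b": {"b", "c"},  # b = b + c + 1.0
        "c": {"b", "c"},  # c = b + c
        "d": {"b"},       # d = b + 2 + 1j
    }

    print(compute_sccs(dep_graph))
    # e.g. [['a'], ['c', 'b'], ['d']]: "b" and "c" mutually determine their
    # types and are iterated to a fixed point together, while "d" follows once
    # the type of "b" is known. The placement of ['a'] and the order within an
    # SCC may vary, but the "b"/"c" SCC always precedes ['d'].
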
diff --git a/test/test_loopy.py b/test/test_loopy.py
index e41d55b85e504bcd39db37bd888ddbedbf6122f4..48ccd8ee024325150f8686185678eeb64a7395dd 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -102,6 +102,28 @@ def test_type_inference_no_artificial_doubles(ctx_factory):
 
     assert "double" not in code
 
+
+def test_type_inference_with_type_dependencies():
+    knl = lp.make_kernel(
+            "{[i]: i=0}",
+            """
+            <>a = 99
+            a = a + 1
+            <>b = 0
+            <>c = 1
+            b = b + c + 1.0
+            c = b + c
+            <>d = b + 2 + 1j
+            """,
+            "...")
+    knl = lp.infer_unknown_types(knl)
+
+    from loopy.types import to_loopy_type
+    assert knl.temporary_variables["a"].dtype == to_loopy_type(np.int32)
+    assert knl.temporary_variables["b"].dtype == to_loopy_type(np.float32)
+    assert knl.temporary_variables["c"].dtype == to_loopy_type(np.float32)
+    assert knl.temporary_variables["d"].dtype == to_loopy_type(np.complex128)
+
+
 def test_sized_and_complex_literals(ctx_factory):
     ctx = ctx_factory()
 
@@ -1973,6 +1995,52 @@ def test_integer_reduction(ctx_factory):
 
     assert function(out)
 
+
+def assert_barrier_between(knl, id1, id2):
+    from loopy.schedule import RunInstruction, Barrier
+    watch_for_barrier = False
+    seen_barrier = False
+
+    for sched_item in knl.schedule:
+        if isinstance(sched_item, RunInstruction):
+            if sched_item.insn_id == id1:
+                watch_for_barrier = True
+            elif sched_item.insn_id == id2:
+                assert watch_for_barrier
+                assert seen_barrier
+                return
+        if isinstance(sched_item, Barrier):
+            if watch_for_barrier:
+                seen_barrier = True
+
+    raise RuntimeError("id2 was not seen")
+
+
+def test_barrier_insertion_near_top_of_loop():
+    knl = lp.make_kernel(
+        "{[i,j]: 0 <= i,j < 10 }",
+        """
+        for i
+         <>a[i] = i  {id=ainit}
+         for j
+          <>t = a[(i + 1) % 10]  {id=tcomp}
+          <>b[i,j] = a[i] + t  {id=bcomp1}
+          b[i,j] = b[i,j] + 1  {id=bcomp2}
+         end
+        end
+        """,
+        seq_dependencies=True)
+    knl = lp.tag_inames(knl, dict(i="l.0"))
+    knl = lp.set_temporary_scope(knl, "a", "local")
+    knl = lp.set_temporary_scope(knl, "b", "local")
+    knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
+
+    print(knl)
+
+    assert_barrier_between(knl, "ainit", "tcomp")
+    assert_barrier_between(knl, "tcomp", "bcomp1")
+    assert_barrier_between(knl, "bcomp1", "bcomp2")
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_misc.py b/test/test_misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..384c1326b75850f8c43c50914934f7dc5b097404
--- /dev/null
+++ b/test/test_misc.py
@@ -0,0 +1,79 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2016 Matt Wala"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import six  # noqa
+from six.moves import range
+
+import sys
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+def test_compute_sccs():
+    from loopy.tools import compute_sccs
+    import random
+
+    rng = random.Random(0)
+
+    def generate_random_graph(nnodes):
+        graph = dict((i, set()) for i in range(nnodes))
+        for i in range(nnodes):
+            for j in range(nnodes):
+                # Edge probability 2/n: Generates decently interesting inputs.
+                if rng.randint(0, nnodes - 1) <= 1:
+                    graph[i].add(j)
+        return graph
+
+    def verify_sccs(graph, sccs):
+        visited = set()
+
+        def visit(node):
+            if node in visited:
+                return []
+            else:
+                visited.add(node)
+                result = []
+                for child in graph[node]:
+                    result = result + visit(child)
+                return result + [node]
+
+        for scc in sccs:
+            result = visit(scc[0])
+            assert set(result) == set(scc), (set(result), set(scc))
+
+    for nnodes in range(10, 20):
+        for i in range(40):
+            graph = generate_random_graph(nnodes)
+            verify_sccs(graph, compute_sccs(graph))
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from py.test.cmdline import main
+        main([__file__])
+
+# vim: foldmethod=marker
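
Aside (editorial illustration, not part of the patch): if extra validation of compute_sccs is ever wanted, the randomized test above could additionally be cross-checked against an independent implementation. A sketch using networkx, which is purely illustrative and not a loopy dependency:

    import networkx as nx

    from loopy.tools import compute_sccs

    def cross_check_sccs(graph):
        # Rebuild the adjacency-dict graph as a networkx digraph.
        g = nx.DiGraph()
        g.add_nodes_from(graph)
        g.add_edges_from(
            (node, child)
            for node, children in graph.items()
            for child in children)

        expected = set(
            frozenset(scc) for scc in nx.strongly_connected_components(g))
        actual = set(frozenset(scc) for scc in compute_sccs(graph))

        # Compare as sets of node sets; SCC ordering legitimately differs.
        assert actual == expected

    cross_check_sccs({0: {1}, 1: {2}, 2: {0}, 3: {1}})
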