Compare revisions

72f1dd1f · 72f1dd1f · 72f1dd1f · 72f1dd1f · 72f1dd1f · 72f1dd1f
--- a/loopy/cli.py
+++ b/loopy/cli.py
@@ -39,13 +39,13 @@ def defines_to_python_code(defines_str):
    import re
    define_re = re.compile(r"^\#define\s+([a-zA-Z0-9_]+)\s+(.*)$")
    result = []
-    for l in defines_str.split("\n"):
-        if not l.strip():
+    for line in defines_str.split("\n"):
+        if not line.strip():
            continue

-        match = define_re.match(l)
+        match = define_re.match(line)
        if match is None:
-            raise RuntimeError("#define not understood: '%s'" % l)
+            raise RuntimeError("#define not understood: '%s'" % line)

        result.append(
                "%s = %s" % (match.group(1), to_python_literal(match.group(2))))

--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -388,7 +388,7 @@ def generate_code_v2(kernel):
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

-    if kernel.state != KernelState.SCHEDULED:
+    if kernel.state != KernelState.LINEARIZED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")


--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -59,6 +59,7 @@ def get_usable_inames_for_conditional(kernel, sched_index):
    from loopy.schedule import (
        find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within)
    from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase,
+                                   VectorizeTag,
                                   IlpBaseTag)

    result = find_active_inames_at(kernel, sched_index)
@@ -67,7 +68,7 @@ def get_usable_inames_for_conditional(kernel, sched_index):
    # Find our containing subkernel. Grab inames for all insns from there.
    within_subkernel = False

-    for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index+1]):
+    for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index]):
        from loopy.schedule import CallKernel, ReturnFromKernel
        if isinstance(sched_item, CallKernel):
            within_subkernel = True
@@ -92,11 +93,12 @@ def get_usable_inames_for_conditional(kernel, sched_index):
        #
        # - local indices may not be used in conditionals that cross barriers.
        #
-        # - ILP indices are not available in loop bounds, they only get defined
-        #   at the innermost level of nesting.
+        # - ILP indices and vector lane indices are not available in loop
+        #   bounds, they only get defined at the innermost level of nesting.

        if (
                kernel.iname_tags_of_type(iname, ConcurrentTag)
+                and not kernel.iname_tags_of_type(iname, VectorizeTag)
                and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase)
                    and crosses_barrier)
                and not kernel.iname_tags_of_type(iname, IlpBaseTag)

--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -24,7 +24,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """

-import six
 from loopy.codegen.result import merge_codegen_results, wrap_in_if
 import islpy as isl
 from loopy.schedule import (
@@ -33,30 +32,6 @@ from loopy.schedule import (
 from loopy.diagnostic import LoopyError


-def get_admissible_conditional_inames_for(codegen_state, sched_index):
-    """This function disallows conditionals on local-idx tagged
-    inames if there is a barrier nested somewhere within.
-    """
-
-    kernel = codegen_state.kernel
-
-    from loopy.kernel.data import (LocalIndexTag, HardwareConcurrentTag,
-                                   filter_iname_tags_by_type)
-
-    from loopy.schedule import find_active_inames_at, has_barrier_within
-    result = find_active_inames_at(kernel, sched_index)
-
-    has_barrier = has_barrier_within(kernel, sched_index)
-
-    for iname, tags in six.iteritems(kernel.iname_to_tags):
-        if (filter_iname_tags_by_type(tags, HardwareConcurrentTag)
-                and codegen_state.is_generating_device_code):
-            if not has_barrier or not filter_iname_tags_by_type(tags, LocalIndexTag):
-                result.add(iname)
-
-    return frozenset(result)
-
-
 def synthesize_idis_for_extra_args(kernel, schedule_index):
    """
    :returns: A list of :class:`loopy.codegen.ImplementedDataInfo`
@@ -222,14 +197,14 @@ def get_required_predicates(kernel, sched_index):
    return result


-def group_by(l, key, merge):
-    if not l:
-        return l
+def group_by(entry, key, merge):
+    if not entry:
+        return entry

    result = []
-    previous = l[0]
+    previous = entry[0]

-    for item in l[1:]:
+    for item in entry[1:]:
        if key(previous) == key(item):
            previous = merge(previous, item)

@@ -302,11 +277,13 @@ def build_loop_nest(codegen_state, schedule_index):
        """

    from loopy.schedule import find_used_inames_within
+    from loopy.codegen.bounds import get_usable_inames_for_conditional
+
    sched_index_info_entries = [
            ScheduleIndexInfo(
                schedule_indices=[i],
                admissible_cond_inames=(
-                    get_admissible_conditional_inames_for(codegen_state, i)),
+                    get_usable_inames_for_conditional(kernel, i)),
                required_predicates=get_required_predicates(kernel, i),
                used_inames_within=find_used_inames_within(kernel, i)
                )

--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -274,7 +274,7 @@ def generate_c_instruction_code(codegen_state, insn):
    if body:
        body.append(Line())

-    body.extend(Line(l) for l in insn.code.split("\n"))
+    body.extend(Line(line) for line in insn.code.split("\n"))

    return Block(body)


--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -231,7 +231,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
    kernel = codegen_state.kernel

    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
-                LocalIndexTag, GroupIndexTag)
+                LocalIndexTag, GroupIndexTag, VectorizeTag)

    from loopy.schedule import get_insn_ids_for_block_at
    insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index)
@@ -242,7 +242,8 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
            all_inames_by_insns |= kernel.insn_inames(insn_id)

        hw_inames_left = [iname for iname in all_inames_by_insns
-                if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)]
+                if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)
+                and not kernel.iname_tags_of_type(iname, VectorizeTag)]

    if not hw_inames_left:
        return next_func(codegen_state)

--- a/loopy/frontend/fortran/__init__.py
+++ b/loopy/frontend/fortran/__init__.py
@@ -86,17 +86,17 @@ def _extract_loopy_lines(source):
    loopy_lines = []

    in_loopy_code = False
-    for l in lines:
-        comment_match = comment_re.match(l)
+    for line in lines:
+        comment_match = comment_re.match(line)

        if comment_match is None:
            if in_loopy_code:
                raise LoopyError("non-comment source line in loopy block")

-            remaining_lines.append(l)
+            remaining_lines.append(line)

            # Preserves line numbers in loopy code, for debuggability
-            loopy_lines.append("# "+l)
+            loopy_lines.append("# "+line)
            continue

        cmt = comment_match.group(1)
@@ -108,7 +108,7 @@ def _extract_loopy_lines(source):
            in_loopy_code = True

            # Preserves line numbers in loopy code, for debuggability
-            loopy_lines.append("# "+l)
+            loopy_lines.append("# "+line)

        elif cmt_stripped == "$loopy end":
            if not in_loopy_code:
@@ -116,16 +116,16 @@ def _extract_loopy_lines(source):
            in_loopy_code = False

            # Preserves line numbers in loopy code, for debuggability
-            loopy_lines.append("# "+l)
+            loopy_lines.append("# "+line)

        elif in_loopy_code:
            loopy_lines.append(cmt)

        else:
-            remaining_lines.append(l)
+            remaining_lines.append(line)

            # Preserves line numbers in loopy code, for debuggability
-            loopy_lines.append("# "+l)
+            loopy_lines.append("# "+line)

    return "\n".join(remaining_lines), "\n".join(loopy_lines)


--- a/loopy/frontend/fortran/translator.py
+++ b/loopy/frontend/fortran/translator.py
@@ -339,11 +339,11 @@ class F2LoopyTranslator(FTreeWalkerBase):

        return []

-    map_Logical = map_type_decl
-    map_Integer = map_type_decl
-    map_Real = map_type_decl
-    map_Complex = map_type_decl
-    map_DoublePrecision = map_type_decl
+    map_Logical = map_type_decl  # noqa: N815
+    map_Integer = map_type_decl  # noqa: N815
+    map_Real = map_type_decl  # noqa: N815
+    map_Complex = map_type_decl  # noqa: N815
+    map_DoublePrecision = map_type_decl  # noqa: N815

    def map_Dimension(self, node):
        scope = self.scope_stack[-1]

--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -35,14 +35,13 @@ import islpy as isl
 from islpy import dim_type
 import re

-from pytools import UniqueNameGenerator, generate_unique_names
+from pytools import UniqueNameGenerator, generate_unique_names, natsorted

 from loopy.library.function import (
        default_function_mangler,
        single_arg_function_mangler)

 from loopy.diagnostic import CannotBranchDomainTree, LoopyError
-from loopy.tools import natsorted
 from loopy.diagnostic import StaticValueFindingError
 from loopy.kernel.data import filter_iname_tags_by_type
 from warnings import warn
@@ -99,10 +98,25 @@ class _UniqueVarNameGenerator(UniqueNameGenerator):

 # {{{ loop kernel object

+class _deprecated_KernelState_SCHEDULED(object):  # noqa
+    def __init__(self, f):
+        self.f = f
+
+    def __get__(self, obj, klass):
+        warn(
+            "'KernelState.SCHEDULED' is deprecated. "
+            "Use 'KernelState.LINEARIZED'.",
+            DeprecationWarning, stacklevel=2)
+        return self.f()
+
 class KernelState:  # noqa
    INITIAL = 0
    PREPROCESSED = 1
-    SCHEDULED = 2
+    LINEARIZED = 2
+
+    @_deprecated_KernelState_SCHEDULED
+    def SCHEDULED():  # pylint:disable=no-method-argument
+        return KernelState.LINEARIZED

 # {{{ kernel_state, KernelState compataibility

@@ -228,7 +242,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):

    # {{{ constructor

-    def __init__(self, domains, instructions, args=None, schedule=None,
+    def __init__(self, domains, instructions, args=None,
+            schedule=None,
+            linearization=None,
            name="loopy_kernel",
            preambles=None,
            preamble_generators=None,
@@ -333,10 +349,27 @@ class LoopKernel(ImmutableRecordWithoutPickling):
        if state not in [
                KernelState.INITIAL,
                KernelState.PREPROCESSED,
-                KernelState.SCHEDULED,
+                KernelState.LINEARIZED,
                ]:
            raise ValueError("invalid value for 'state'")

+        # `linearization` is replacing `schedule`, but we're not changing
+        # this under the hood yet, so for now, store it inside `schedule`
+        # and raise deprecation warning anyway
+        if schedule is not None:
+            if linearization is not None:
+                # these should not both be present
+                raise ValueError(
+                    "received both `schedule` and `linearization` args, "
+                    "'LoopKernel.schedule' is deprecated. "
+                    "Use 'LoopKernel.linearization'.")
+            warn(
+                "'LoopKernel.schedule' is deprecated. "
+                "Use 'LoopKernel.linearization'.",
+                DeprecationWarning, stacklevel=2)
+        elif linearization is not None:
+            schedule = linearization
+
        from collections import defaultdict
        assert not isinstance(iname_to_tags, defaultdict)

@@ -1345,7 +1378,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
        if "schedule" in what and kernel.schedule is not None:
            lines.extend(sep)
            if show_labels:
-                lines.append("SCHEDULE:")
+                lines.append("LINEARIZATION:")
            from loopy.schedule import dump_schedule
            lines.append(dump_schedule(kernel, kernel.schedule))

@@ -1395,6 +1428,14 @@ class LoopKernel(ImmutableRecordWithoutPickling):

    # }}}

+    # {{{ handle linearization variable that doesn't yet exist
+
+    @property
+    def linearization(self):
+        return self.schedule
+
+    # }}}
+
    # {{{ direct execution

    def __call__(self, *args, **kwargs):

--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -186,7 +186,7 @@ class LoopedIlpTag(IlpBaseTag):
 # }}}


-class VectorizeTag(UniqueTag):
+class VectorizeTag(UniqueTag, HardwareConcurrentTag):
    def __str__(self):
        return "vec"


--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -66,7 +66,8 @@ class InstructionBase(ImmutableRecord):
    .. attribute:: depends_on_is_final

        A :class:`bool` determining whether :attr:`depends_on` constitutes
-        the *entire* list of iname dependencies.
+        the *entire* list of iname dependencies. If *not* marked final,
+        various semi-broken heuristics will try to add further dependencies.

        Defaults to *False*.

@@ -344,10 +345,13 @@ class InstructionBase(ImmutableRecord):
        """
        raise NotImplementedError

-    def with_transformed_expressions(self, f, *args):
+    def with_transformed_expressions(self, f, assignee_f=None):
        """Return a new copy of *self* where *f* has been applied to every
        expression occurring in *self*. *args* will be passed as extra
        arguments (in addition to the expression) to *f*.
+
+        If *assignee_f* is passed, then left-hand sides of assignments are
+        passed to it. If it is not given, it defaults to the same as *f*.
        """
        raise NotImplementedError

@@ -959,12 +963,15 @@ class Assignment(MultiAssignmentBase):
    def assignee_subscript_deps(self):
        return (_get_assignee_subscript_deps(self.assignee),)

-    def with_transformed_expressions(self, f, *args):
+    def with_transformed_expressions(self, f, assignee_f=None):
+        if assignee_f is None:
+            assignee_f = f
+
        return self.copy(
-                assignee=f(self.assignee, *args),
-                expression=f(self.expression, *args),
+                assignee=assignee_f(self.assignee),
+                expression=f(self.expression),
                predicates=frozenset(
-                    f(pred, *args) for pred in self.predicates))
+                    f(pred) for pred in self.predicates))

    # }}}

@@ -1114,12 +1121,15 @@ class CallInstruction(MultiAssignmentBase):
                _get_assignee_subscript_deps(a)
                for a in self.assignees)

-    def with_transformed_expressions(self, f, *args):
+    def with_transformed_expressions(self, f, assignee_f=None):
+        if assignee_f is None:
+            assignee_f = f
+
        return self.copy(
-                assignees=f(self.assignees, *args),
-                expression=f(self.expression, *args),
+                assignees=assignee_f(self.assignees),
+                expression=f(self.expression),
                predicates=frozenset(
-                    f(pred, *args) for pred in self.predicates))
+                    f(pred) for pred in self.predicates))

    # }}}

@@ -1315,14 +1325,17 @@ class CInstruction(InstructionBase):
                _get_assignee_subscript_deps(a)
                for a in self.assignees)

-    def with_transformed_expressions(self, f, *args):
+    def with_transformed_expressions(self, f, assignee_f=None):
+        if assignee_f is None:
+            assignee_f = f
+
        return self.copy(
                iname_exprs=[
-                    (name, f(expr, *args))
+                    (name, f(expr))
                    for name, expr in self.iname_exprs],
-                assignees=[f(a, *args) for a in self.assignees],
+                assignees=[assignee_f(a) for a in self.assignees],
                predicates=frozenset(
-                    f(pred, *args) for pred in self.predicates))
+                    f(pred) for pred in self.predicates))

    # }}}

@@ -1357,7 +1370,7 @@ class _DataObliviousInstruction(InstructionBase):
    def assignee_subscript_deps(self):
        return frozenset()

-    def with_transformed_expressions(self, f, *args):
+    def with_transformed_expressions(self, f, assignee_f=None):
        return self.copy(
                predicates=frozenset(
                    f(pred) for pred in self.predicates))

--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -34,8 +34,7 @@ import numpy as np
 import islpy as isl
 from islpy import dim_type
 from loopy.diagnostic import LoopyError, warn_with_kernel
-from pytools import memoize_on_first_arg
-from loopy.tools import natsorted
+from pytools import memoize_on_first_arg, natsorted

 import logging
 logger = logging.getLogger(__name__)
@@ -1381,7 +1380,7 @@ def draw_dependencies_as_unicode_arrows(
                .replace(style.RESET_ALL, ""))
        return len(s)

-    def truncate_without_color_escapes(s, l):
+    def truncate_without_color_escapes(s, length):
        # FIXME: This is a bit dumb--it removes color escapes when truncation
        # is needed.

@@ -1389,7 +1388,7 @@ def draw_dependencies_as_unicode_arrows(
                .replace(fore.RED, "")
                .replace(style.RESET_ALL, ""))

-        return s[:l] + u"…"
+        return s[:length] + u"…"

    def conform_to_uniform_length(s):
        len_s = len_without_color_escapes(s)
@@ -1428,6 +1427,8 @@ def stringify_instruction_list(kernel):

    def insert_insn_into_order(insn):
        if insn.id in printed_insn_ids:
+            # Note: dependency cycles are deliberately ignored so that printing
+            # succeeds.
            return
        printed_insn_ids.add(insn.id)

@@ -1511,7 +1512,7 @@ def stringify_instruction_list(kernel):
                    ", ".join("%s=%s" % (name, expr)
                        for name, expr in insn.iname_exprs))

-            trailing = [l for l in insn.code.split("\n")]
+            trailing = insn.code.split("\n")
        elif isinstance(insn, lp.BarrierInstruction):
            lhs = ""
            rhs = "... %sbarrier" % insn.synchronization_kind[0]
@@ -1583,6 +1584,13 @@ def stringify_instruction_list(kernel):

 # {{{ global barrier order finding

+def _is_global_barrier(kernel, insn_id):
+    insn = kernel.id_to_insn[insn_id]
+    from loopy.kernel.instruction import BarrierInstruction
+    return isinstance(insn, BarrierInstruction) and \
+        insn.synchronization_kind == "global"
+
+
 @memoize_on_first_arg
 def get_global_barrier_order(kernel):
    """Return a :class:`tuple` of the listing the ids of global barrier instructions
@@ -1590,49 +1598,27 @@ def get_global_barrier_order(kernel):

    See also :class:`loopy.instruction.BarrierInstruction`.
    """
-    barriers = []
-    visiting = set()
-    visited = set()
-
-    unvisited = set(insn.id for insn in kernel.instructions)
-
-    def is_barrier(my_insn_id):
-        insn = kernel.id_to_insn[my_insn_id]
-        from loopy.kernel.instruction import BarrierInstruction
-        return isinstance(insn, BarrierInstruction) and \
-            insn.synchronization_kind == "global"
-
-    while unvisited:
-        stack = [unvisited.pop()]
-
-        while stack:
-            top = stack[-1]
-
-            if top in visiting:
-                visiting.remove(top)
-                if is_barrier(top):
-                    barriers.append(top)
+    dep_graph = {insn.id: set() for insn in kernel.instructions}
+    for insn in kernel.instructions:
+        for dep in insn.depends_on:
+            dep_graph[dep].add(insn.id)

-            if top in visited:
-                stack.pop()
-                continue
+    from pytools.graph import compute_topological_order
+    order = compute_topological_order(dep_graph)

-            visited.add(top)
-            visiting.add(top)
+    barriers = [
+            insn_id for insn_id in order
+            if _is_global_barrier(kernel, insn_id)]

-            for child in kernel.id_to_insn[top].depends_on:
-                # Check for no cycles.
-                assert child not in visiting
-                stack.append(child)
+    del order

    # Ensure this is the only possible order.
    #
    # We do this by looking at the barriers in order.
    # We check for each adjacent pair (a,b) in the order if a < b,
    # i.e. if a is reachable by a chain of dependencies from b.
-
-    visiting.clear()
-    visited.clear()
+    visited = set()
+    visiting = set()

    for prev_barrier, barrier in zip(barriers, barriers[1:]):
        # Check if prev_barrier is reachable from barrier.
@@ -1690,12 +1676,6 @@ def find_most_recent_global_barrier(kernel, insn_id):
    if len(insn.depends_on) == 0:
        return None

-    def is_barrier(my_insn_id):
-        insn = kernel.id_to_insn[my_insn_id]
-        from loopy.kernel.instruction import BarrierInstruction
-        return isinstance(insn, BarrierInstruction) and \
-            insn.synchronization_kind == "global"
-
    global_barrier_to_ordinal = dict(
            (b, i) for i, b in enumerate(global_barrier_order))

@@ -1705,7 +1685,7 @@ def find_most_recent_global_barrier(kernel, insn_id):
                else -1)

    direct_barrier_dependencies = set(
-            dep for dep in insn.depends_on if is_barrier(dep))
+            dep for dep in insn.depends_on if _is_global_barrier(kernel, dep))

    if len(direct_barrier_dependencies) > 0:
        return max(direct_barrier_dependencies, key=get_barrier_ordinal)
@@ -1727,8 +1707,8 @@ def get_subkernels(kernel):
    See also :class:`loopy.schedule.CallKernel`.
    """
    from loopy.kernel import KernelState
-    if kernel.state != KernelState.SCHEDULED:
-        raise LoopyError("Kernel must be scheduled")
+    if kernel.state != KernelState.LINEARIZED:
+        raise LoopyError("Kernel must be linearized")

    from loopy.schedule import CallKernel

@@ -1744,7 +1724,7 @@ def get_subkernel_to_insn_id_map(kernel):
    kernel must be scheduled.
    """
    from loopy.kernel import KernelState
-    if kernel.state != KernelState.SCHEDULED:
+    if kernel.state != KernelState.LINEARIZED:
        raise LoopyError("Kernel must be scheduled")

    from loopy.schedule import (

--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -37,6 +37,7 @@ from loopy.version import DATA_MODEL_VERSION
 from loopy.kernel.data import make_assignment, filter_iname_tags_by_type
 # for the benefit of loopy.statistics, for now
 from loopy.type_inference import infer_unknown_types
+from loopy.transform.iname import remove_any_newly_unused_inames

 import logging
 logger = logging.getLogger(__name__)
@@ -289,7 +290,7 @@ def _classify_reduction_inames(kernel, inames):
    nonlocal_par = []

    from loopy.kernel.data import (
-            LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
+            LocalIndexTagBase, UnrolledIlpTag, UnrollTag,
            ConcurrentTag, filter_iname_tags_by_type)

    for iname in inames:
@@ -303,7 +304,7 @@ def _classify_reduction_inames(kernel, inames):
        elif filter_iname_tags_by_type(iname_tags, LocalIndexTagBase):
            local_par.append(iname)

-        elif filter_iname_tags_by_type(iname_tags, (ConcurrentTag, VectorizeTag)):
+        elif filter_iname_tags_by_type(iname_tags, ConcurrentTag):
            nonlocal_par.append(iname)

        else:
@@ -882,6 +883,7 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain):
 # }}}


+@remove_any_newly_unused_inames
 def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                      automagic_scans_ok=False, force_scan=False,
                      force_outer_iname_for_scan=None):
@@ -1370,7 +1372,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,

        track_iname = var_name_gen(
                "{sweep_iname}__seq_scan"
-                .format(scan_iname=scan_iname, sweep_iname=sweep_iname))
+                .format(sweep_iname=sweep_iname))

        get_or_add_sweep_tracking_iname_and_domain(
                scan_iname, sweep_iname, sweep_min_value, scan_min_value,
@@ -1480,7 +1482,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,

        track_iname = var_name_gen(
                "{sweep_iname}__pre_scan"
-                .format(scan_iname=scan_iname, sweep_iname=sweep_iname))
+                .format(sweep_iname=sweep_iname))

        get_or_add_sweep_tracking_iname_and_domain(
                scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride,
@@ -1924,8 +1926,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,

    kernel = lp.tag_inames(kernel, new_iname_tags)

-    # TODO: remove unused inames...
-
    kernel = (
            _hackily_ensure_multi_assignment_return_values_are_scoped_private(
                kernel))
@@ -1979,7 +1979,7 @@ def find_idempotence(kernel):

    # Find SCCs of dep_graph. These are used for checking if the instruction is
    # in a dependency cycle.
-    from loopy.tools import compute_sccs
+    from pytools.graph import compute_sccs

    sccs = dict((item, scc)
            for scc in compute_sccs(dep_graph)

--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -212,12 +212,12 @@ def find_loop_nest_with_map(kernel):
    """
    result = {}

-    from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
+    from loopy.kernel.data import ConcurrentTag, IlpBaseTag

    all_nonpar_inames = set(
            iname for iname in kernel.all_inames()
            if not kernel.iname_tags_of_type(iname,
-                    (ConcurrentTag, IlpBaseTag, VectorizeTag)))
+                    (ConcurrentTag, IlpBaseTag)))

    iname_to_insns = kernel.iname_to_insns()

@@ -276,7 +276,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):

    result = {}

-    from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
+    from loopy.kernel.data import ConcurrentTag, IlpBaseTag
    for insn in kernel.instructions:
        for iname in kernel.insn_inames(insn):
            if kernel.iname_tags_of_type(iname, ConcurrentTag):
@@ -310,7 +310,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
                        continue

                    if kernel.iname_tags_of_type(dep_insn_iname,
-                                (ConcurrentTag, IlpBaseTag, VectorizeTag)):
+                                (ConcurrentTag, IlpBaseTag)):
                        # Parallel tags don't really nest, so we'll disregard
                        # them here.
                        continue
@@ -1841,7 +1841,7 @@ def generate_loop_schedules(kernel, debug_args={}):

 def generate_loop_schedules_inner(kernel, debug_args={}):
    from loopy.kernel import KernelState
-    if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED):
+    if kernel.state not in (KernelState.PREPROCESSED, KernelState.LINEARIZED):
        raise LoopyError("cannot schedule a kernel that has not been "
                "preprocessed")

@@ -1852,7 +1852,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}):

    debug = ScheduleDebugger(**debug_args)

-    preschedule = kernel.schedule if kernel.state == KernelState.SCHEDULED else ()
+    preschedule = kernel.schedule if kernel.state == KernelState.LINEARIZED else ()

    prescheduled_inames = set(
            insn.iname
@@ -1904,7 +1904,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}):

            unscheduled_insn_ids=set(insn.id for insn in kernel.instructions),
            scheduled_insn_ids=frozenset(),
-            within_subkernel=kernel.state != KernelState.SCHEDULED,
+            within_subkernel=kernel.state != KernelState.LINEARIZED,
            may_schedule_global_barriers=True,

            preschedule=preschedule,
@@ -1973,11 +1973,11 @@ def generate_loop_schedules_inner(kernel, debug_args={}):

            new_kernel = kernel.copy(
                    schedule=gen_sched,
-                    state=KernelState.SCHEDULED)
+                    state=KernelState.LINEARIZED)

            from loopy.schedule.device_mapping import \
                    map_schedule_onto_host_or_device
-            if kernel.state != KernelState.SCHEDULED:
+            if kernel.state != KernelState.LINEARIZED:
                # Device mapper only gets run once.
                new_kernel = map_schedule_onto_host_or_device(new_kernel)

@@ -2029,6 +2029,15 @@ def _get_one_scheduled_kernel_inner(kernel):


 def get_one_scheduled_kernel(kernel):
+    warn_with_kernel(
+        kernel, "get_one_scheduled_kernel_deprecated",
+        "get_one_scheduled_kernel is deprecated. "
+        "Use get_one_linearized_kernel instead.",
+        DeprecationWarning)
+    return get_one_linearized_kernel(kernel)
+
+
+def get_one_linearized_kernel(kernel):
    from loopy import CACHING_ENABLED

    sched_cache_key = kernel

--- a/loopy/schedule/device_mapping.py
+++ b/loopy/schedule/device_mapping.py
@@ -31,7 +31,7 @@ from loopy.schedule.tools import get_block_boundaries
 def map_schedule_onto_host_or_device(kernel):
    # FIXME: Should be idempotent.
    from loopy.kernel import KernelState
-    assert kernel.state == KernelState.SCHEDULED
+    assert kernel.state == KernelState.LINEARIZED

    from functools import partial
    device_prog_name_gen = partial(

--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1863,75 +1863,4 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False):

 # }}}

-
-# {{{ compat goop
-
-def get_lmem_access_poly(knl):
-    """Count the number of local memory accesses in a loopy kernel.
-
-    get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the
-    result with the mtype=['local'] option.
-
-    """
-    warn_with_kernel(knl, "deprecated_get_lmem_access_poly",
-                     "get_lmem_access_poly is deprecated. Use "
-                     "get_mem_access_map and filter the result with the "
-                     "mtype=['local'] option.")
-    return get_mem_access_map(knl).filter_by(mtype=['local'])
-
-
-def get_DRAM_access_poly(knl):
-    """Count the number of global memory accesses in a loopy kernel.
-
-    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
-    result with the mtype=['global'] option.
-
-    """
-    warn_with_kernel(knl, "deprecated_get_DRAM_access_poly",
-                     "get_DRAM_access_poly is deprecated. Use "
-                     "get_mem_access_map and filter the result with the "
-                     "mtype=['global'] option.")
-    return get_mem_access_map(knl).filter_by(mtype=['global'])
-
-
-def get_gmem_access_poly(knl):
-    """Count the number of global memory accesses in a loopy kernel.
-
-    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
-    result with the mtype=['global'] option.
-
-    """
-    warn_with_kernel(knl, "deprecated_get_gmem_access_poly",
-                     "get_DRAM_access_poly is deprecated. Use "
-                     "get_mem_access_map and filter the result with the "
-                     "mtype=['global'] option.")
-    return get_mem_access_map(knl).filter_by(mtype=['global'])
-
-
-def get_synchronization_poly(knl):
-    """Count the number of synchronization events each work-item encounters in
-    a loopy kernel.
-
-    get_synchronization_poly is deprecated. Use get_synchronization_map
-    instead.
-
-    """
-    warn_with_kernel(knl, "deprecated_get_synchronization_poly",
-                     "get_synchronization_poly is deprecated. Use "
-                     "get_synchronization_map instead.")
-    return get_synchronization_map(knl)
-
-
-def get_op_poly(knl, numpy_types=True):
-    """Count the number of operations in a loopy kernel.
-
-    get_op_poly is deprecated. Use get_op_map instead.
-
-    """
-    warn_with_kernel(knl, "deprecated_get_op_poly",
-                     "get_op_poly is deprecated. Use get_op_map instead.")
-    return get_op_map(knl, numpy_types)
-
-# }}}
-
 # vim: foldmethod=marker
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -273,8 +273,7 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase):
        if not isinstance(other, type(expr)):
            return self.treat_mismatch(expr, other, unis)
        if (expr.inames != other.inames
-                or type(expr.operation) != type(other.operation)  # noqa
-                ):
+                or type(expr.operation) != type(other.operation)):  # noqa
            return []

        return self.rec(expr.expr, other.expr, unis)
@@ -971,7 +970,8 @@ class RuleAwareIdentityMapper(IdentityMapper):
                # may perform tasks entirely unrelated to subst rules, so
                # we must map assignees, too.
                self.map_instruction(kernel,
-                    insn.with_transformed_expressions(self, kernel, insn))
+                    insn.with_transformed_expressions(
+                        lambda expr: self(expr, kernel, insn)))
                for insn in kernel.instructions]

        return kernel.copy(instructions=new_insns)

--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -80,6 +80,11 @@ class DTypeRegistryWrapper(object):
 def c99_preamble_generator(preamble_info):
    if any(dtype.is_integral() for dtype in preamble_info.seen_dtypes):
        yield("10_stdint", "#include <stdint.h>")
+    if any(dtype.numpy_dtype == np.dtype("bool")
+           for dtype in preamble_info.seen_dtypes):
+        yield("10_stdbool", "#include <stdbool.h>")
+    if any(dtype.is_complex() for dtype in preamble_info.seen_dtypes):
+        yield("10_complex", "#include <complex.h>")


 def _preamble_generator(preamble_info):
@@ -436,7 +441,7 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True):
                arg_dtypes=arg_dtypes)

    # binary functions
-    if (name in ["fmax", "fmin"]
+    if (name in ["fmax", "fmin", "copysign"]
            and len(arg_dtypes) == 2):

        dtype = np.find_common_type(
@@ -1079,9 +1084,11 @@ class CTarget(CFamilyTarget):
    @memoize_method
    def get_dtype_registry(self):
        from loopy.target.c.compyte.dtypes import (
-                DTypeRegistry, fill_registry_with_c99_stdint_types)
+                DTypeRegistry, fill_registry_with_c99_stdint_types,
+                fill_registry_with_c99_complex_types)
        result = DTypeRegistry()
        fill_registry_with_c99_stdint_types(result)
+        fill_registry_with_c99_complex_types(result)
        return DTypeRegistryWrapper(result)



--- a/compyte @ 7e48e116
+++ b/compyte @ 7e48e116
-Subproject commit 25ee8b48fd0c7d9f0bd987c6862cdb1884fb1372
+Subproject commit 7e48e1166a13cfbb7b60f909b071f088034ffda1
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
+# coding: utf-8
 """OpenCL target integrated with PyOpenCL."""

 from __future__ import division, absolute_import
@@ -285,6 +286,9 @@ class PyOpenCLTarget(OpenCLTarget):
    warnings) and support for complex numbers.
    """

+    # FIXME make prefixes conform to naming rules
+    # (see Reference: Loopy’s Model of a Kernel)
+
    host_program_name_prefix = "_lpy_host_"
    host_program_name_suffix = ""

@@ -299,7 +303,26 @@ class PyOpenCLTarget(OpenCLTarget):
        self.device = device
        self.pyopencl_module_name = pyopencl_module_name

-    comparison_fields = ["device"]
+    # NB: Not including 'device', as that is handled specially here.
+    hash_fields = OpenCLTarget.hash_fields + (
+            "pyopencl_module_name",)
+    comparison_fields = OpenCLTarget.comparison_fields + (
+            "pyopencl_module_name",)
+
+    def __eq__(self, other):
+        if not super(PyOpenCLTarget, self).__eq__(other):
+            return False
+
+        if (self.device is None) != (other.device is None):
+            return False
+
+        if self.device is not None:
+            assert other.device is not None
+            return (self.device.persistent_unique_id
+                    == other.device.persistent_unique_id)
+        else:
+            assert other.device is None
+            return True

    def update_persistent_hash(self, key_hash, key_builder):
        super(PyOpenCLTarget, self).update_persistent_hash(key_hash, key_builder)
No results found