Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • tasmith4/loopy
  • ben_sepanski/loopy
  • arghdos/loopy
  • inducer/loopy
  • wence-/loopy
  • isuruf/loopy
  • fikl2/loopy
  • xywei/loopy
  • kaushikcfd/loopy
  • zweiner2/loopy
10 results
Show changes
Showing
with 203 additions and 221 deletions
......@@ -39,13 +39,13 @@ def defines_to_python_code(defines_str):
import re
define_re = re.compile(r"^\#define\s+([a-zA-Z0-9_]+)\s+(.*)$")
result = []
for l in defines_str.split("\n"):
if not l.strip():
for line in defines_str.split("\n"):
if not line.strip():
continue
match = define_re.match(l)
match = define_re.match(line)
if match is None:
raise RuntimeError("#define not understood: '%s'" % l)
raise RuntimeError("#define not understood: '%s'" % line)
result.append(
"%s = %s" % (match.group(1), to_python_literal(match.group(2))))
......
......@@ -388,7 +388,7 @@ def generate_code_v2(kernel):
from loopy.schedule import get_one_scheduled_kernel
kernel = get_one_scheduled_kernel(kernel)
if kernel.state != KernelState.SCHEDULED:
if kernel.state != KernelState.LINEARIZED:
raise LoopyError("cannot generate code for a kernel that has not been "
"scheduled")
......
......@@ -59,6 +59,7 @@ def get_usable_inames_for_conditional(kernel, sched_index):
from loopy.schedule import (
find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within)
from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase,
VectorizeTag,
IlpBaseTag)
result = find_active_inames_at(kernel, sched_index)
......@@ -67,7 +68,7 @@ def get_usable_inames_for_conditional(kernel, sched_index):
# Find our containing subkernel. Grab inames for all insns from there.
within_subkernel = False
for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index+1]):
for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index]):
from loopy.schedule import CallKernel, ReturnFromKernel
if isinstance(sched_item, CallKernel):
within_subkernel = True
......@@ -92,11 +93,12 @@ def get_usable_inames_for_conditional(kernel, sched_index):
#
# - local indices may not be used in conditionals that cross barriers.
#
# - ILP indices are not available in loop bounds, they only get defined
# at the innermost level of nesting.
# - ILP indices and vector lane indices are not available in loop
# bounds, they only get defined at the innermost level of nesting.
if (
kernel.iname_tags_of_type(iname, ConcurrentTag)
and not kernel.iname_tags_of_type(iname, VectorizeTag)
and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase)
and crosses_barrier)
and not kernel.iname_tags_of_type(iname, IlpBaseTag)
......
......@@ -24,7 +24,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
import six
from loopy.codegen.result import merge_codegen_results, wrap_in_if
import islpy as isl
from loopy.schedule import (
......@@ -33,30 +32,6 @@ from loopy.schedule import (
from loopy.diagnostic import LoopyError
def get_admissible_conditional_inames_for(codegen_state, sched_index):
    """Return the inames that may be used in a conditional at *sched_index*.

    This function disallows conditionals on local-idx tagged
    inames if there is a barrier nested somewhere within.

    The result starts out as the inames active at *sched_index*. While
    device code is being generated, every iname carrying a
    ``HardwareConcurrentTag`` is added as well, except that inames tagged
    with ``LocalIndexTag`` are left out when a barrier is nested within the
    schedule item.

    :arg codegen_state: the code generation state; its ``kernel`` and
        ``is_generating_device_code`` attributes are read.
    :arg sched_index: index into the kernel's schedule.
    :returns: a :class:`frozenset` of iname names.
    """

    kernel = codegen_state.kernel

    from loopy.kernel.data import (LocalIndexTag, HardwareConcurrentTag,
                                   filter_iname_tags_by_type)

    from loopy.schedule import find_active_inames_at, has_barrier_within
    result = find_active_inames_at(kernel, sched_index)

    has_barrier = has_barrier_within(kernel, sched_index)

    for iname, tags in six.iteritems(kernel.iname_to_tags):
        if (filter_iname_tags_by_type(tags, HardwareConcurrentTag)
                and codegen_state.is_generating_device_code):
            # Local indices must not appear in conditionals that enclose a
            # barrier; all other hardware-parallel inames are fine.
            if (not has_barrier
                    or not filter_iname_tags_by_type(tags, LocalIndexTag)):
                result.add(iname)

    return frozenset(result)
def synthesize_idis_for_extra_args(kernel, schedule_index):
"""
:returns: A list of :class:`loopy.codegen.ImplementedDataInfo`
......@@ -222,14 +197,14 @@ def get_required_predicates(kernel, sched_index):
return result
def group_by(l, key, merge):
if not l:
return l
def group_by(entry, key, merge):
if not entry:
return entry
result = []
previous = l[0]
previous = entry[0]
for item in l[1:]:
for item in entry[1:]:
if key(previous) == key(item):
previous = merge(previous, item)
......@@ -302,11 +277,13 @@ def build_loop_nest(codegen_state, schedule_index):
"""
from loopy.schedule import find_used_inames_within
from loopy.codegen.bounds import get_usable_inames_for_conditional
sched_index_info_entries = [
ScheduleIndexInfo(
schedule_indices=[i],
admissible_cond_inames=(
get_admissible_conditional_inames_for(codegen_state, i)),
get_usable_inames_for_conditional(kernel, i)),
required_predicates=get_required_predicates(kernel, i),
used_inames_within=find_used_inames_within(kernel, i)
)
......
......@@ -274,7 +274,7 @@ def generate_c_instruction_code(codegen_state, insn):
if body:
body.append(Line())
body.extend(Line(l) for l in insn.code.split("\n"))
body.extend(Line(line) for line in insn.code.split("\n"))
return Block(body)
......
......@@ -231,7 +231,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
kernel = codegen_state.kernel
from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
LocalIndexTag, GroupIndexTag)
LocalIndexTag, GroupIndexTag, VectorizeTag)
from loopy.schedule import get_insn_ids_for_block_at
insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index)
......@@ -242,7 +242,8 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
all_inames_by_insns |= kernel.insn_inames(insn_id)
hw_inames_left = [iname for iname in all_inames_by_insns
if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)]
if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)
and not kernel.iname_tags_of_type(iname, VectorizeTag)]
if not hw_inames_left:
return next_func(codegen_state)
......
......@@ -86,17 +86,17 @@ def _extract_loopy_lines(source):
loopy_lines = []
in_loopy_code = False
for l in lines:
comment_match = comment_re.match(l)
for line in lines:
comment_match = comment_re.match(line)
if comment_match is None:
if in_loopy_code:
raise LoopyError("non-comment source line in loopy block")
remaining_lines.append(l)
remaining_lines.append(line)
# Preserves line numbers in loopy code, for debuggability
loopy_lines.append("# "+l)
loopy_lines.append("# "+line)
continue
cmt = comment_match.group(1)
......@@ -108,7 +108,7 @@ def _extract_loopy_lines(source):
in_loopy_code = True
# Preserves line numbers in loopy code, for debuggability
loopy_lines.append("# "+l)
loopy_lines.append("# "+line)
elif cmt_stripped == "$loopy end":
if not in_loopy_code:
......@@ -116,16 +116,16 @@ def _extract_loopy_lines(source):
in_loopy_code = False
# Preserves line numbers in loopy code, for debuggability
loopy_lines.append("# "+l)
loopy_lines.append("# "+line)
elif in_loopy_code:
loopy_lines.append(cmt)
else:
remaining_lines.append(l)
remaining_lines.append(line)
# Preserves line numbers in loopy code, for debuggability
loopy_lines.append("# "+l)
loopy_lines.append("# "+line)
return "\n".join(remaining_lines), "\n".join(loopy_lines)
......
......@@ -339,11 +339,11 @@ class F2LoopyTranslator(FTreeWalkerBase):
return []
map_Logical = map_type_decl
map_Integer = map_type_decl
map_Real = map_type_decl
map_Complex = map_type_decl
map_DoublePrecision = map_type_decl
map_Logical = map_type_decl # noqa: N815
map_Integer = map_type_decl # noqa: N815
map_Real = map_type_decl # noqa: N815
map_Complex = map_type_decl # noqa: N815
map_DoublePrecision = map_type_decl # noqa: N815
def map_Dimension(self, node):
scope = self.scope_stack[-1]
......
......@@ -35,14 +35,13 @@ import islpy as isl
from islpy import dim_type
import re
from pytools import UniqueNameGenerator, generate_unique_names
from pytools import UniqueNameGenerator, generate_unique_names, natsorted
from loopy.library.function import (
default_function_mangler,
single_arg_function_mangler)
from loopy.diagnostic import CannotBranchDomainTree, LoopyError
from loopy.tools import natsorted
from loopy.diagnostic import StaticValueFindingError
from loopy.kernel.data import filter_iname_tags_by_type
from warnings import warn
......@@ -99,10 +98,25 @@ class _UniqueVarNameGenerator(UniqueNameGenerator):
# {{{ loop kernel object
class _deprecated_KernelState_SCHEDULED(object):  # noqa
    """Descriptor that emits a deprecation warning on every attribute read.

    Wraps a zero-argument callable *f*. Each time the attribute is looked up
    (on the class or on an instance), a :class:`DeprecationWarning` pointing
    users to ``KernelState.LINEARIZED`` is issued and the result of calling
    *f* is returned.
    """

    def __init__(self, f):
        # *f* is called without arguments on each access.
        self.f = f

    def __get__(self, obj, klass):
        # stacklevel=2 attributes the warning to the code performing the
        # attribute access rather than to this descriptor.
        warn(
                "'KernelState.SCHEDULED' is deprecated. "
                "Use 'KernelState.LINEARIZED'.",
                DeprecationWarning, stacklevel=2)
        return self.f()
class KernelState:  # noqa
    """Integer constants naming the processing stage of a kernel.

    ``SCHEDULED`` is kept as a deprecated alias of ``LINEARIZED``: reading it
    goes through ``_deprecated_KernelState_SCHEDULED``, which issues a
    :class:`DeprecationWarning` and returns ``LINEARIZED``.
    """

    INITIAL = 0
    PREPROCESSED = 1
    LINEARIZED = 2

    # No ``self``/``cls`` argument: the descriptor calls this with no
    # arguments when the attribute is read.
    @_deprecated_KernelState_SCHEDULED
    def SCHEDULED():  # pylint:disable=no-method-argument
        return KernelState.LINEARIZED
# {{{ kernel_state, KernelState compatibility
......@@ -228,7 +242,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
# {{{ constructor
def __init__(self, domains, instructions, args=None, schedule=None,
def __init__(self, domains, instructions, args=None,
schedule=None,
linearization=None,
name="loopy_kernel",
preambles=None,
preamble_generators=None,
......@@ -333,10 +349,27 @@ class LoopKernel(ImmutableRecordWithoutPickling):
if state not in [
KernelState.INITIAL,
KernelState.PREPROCESSED,
KernelState.SCHEDULED,
KernelState.LINEARIZED,
]:
raise ValueError("invalid value for 'state'")
# `linearization` is replacing `schedule`, but we're not changing
# this under the hood yet, so for now, store it inside `schedule`
# and raise deprecation warning anyway
if schedule is not None:
if linearization is not None:
# these should not both be present
raise ValueError(
"received both `schedule` and `linearization` args, "
"'LoopKernel.schedule' is deprecated. "
"Use 'LoopKernel.linearization'.")
warn(
"'LoopKernel.schedule' is deprecated. "
"Use 'LoopKernel.linearization'.",
DeprecationWarning, stacklevel=2)
elif linearization is not None:
schedule = linearization
from collections import defaultdict
assert not isinstance(iname_to_tags, defaultdict)
......@@ -1345,7 +1378,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
if "schedule" in what and kernel.schedule is not None:
lines.extend(sep)
if show_labels:
lines.append("SCHEDULE:")
lines.append("LINEARIZATION:")
from loopy.schedule import dump_schedule
lines.append(dump_schedule(kernel, kernel.schedule))
......@@ -1395,6 +1428,14 @@ class LoopKernel(ImmutableRecordWithoutPickling):
# }}}
# {{{ handle linearization variable that doesn't yet exist
@property
def linearization(self):
    """Read-only alias that returns :attr:`schedule`.

    The linearization is still stored under the ``schedule`` attribute;
    this property exposes it under the new name.
    """
    return self.schedule
# }}}
# {{{ direct execution
def __call__(self, *args, **kwargs):
......
......@@ -186,7 +186,7 @@ class LoopedIlpTag(IlpBaseTag):
# }}}
class VectorizeTag(UniqueTag):
class VectorizeTag(UniqueTag, HardwareConcurrentTag):
def __str__(self):
return "vec"
......
......@@ -66,7 +66,8 @@ class InstructionBase(ImmutableRecord):
.. attribute:: depends_on_is_final
A :class:`bool` determining whether :attr:`depends_on` constitutes
the *entire* list of iname dependencies.
the *entire* list of iname dependencies. If *not* marked final,
various semi-broken heuristics will try to add further dependencies.
Defaults to *False*.
......@@ -344,10 +345,13 @@ class InstructionBase(ImmutableRecord):
"""
raise NotImplementedError
def with_transformed_expressions(self, f, *args):
def with_transformed_expressions(self, f, assignee_f=None):
"""Return a new copy of *self* where *f* has been applied to every
expression occurring in *self*. *args* will be passed as extra
arguments (in addition to the expression) to *f*.
If *assignee_f* is passed, then left-hand sides of assignments are
passed to it. If it is not given, it defaults to the same as *f*.
"""
raise NotImplementedError
......@@ -959,12 +963,15 @@ class Assignment(MultiAssignmentBase):
def assignee_subscript_deps(self):
return (_get_assignee_subscript_deps(self.assignee),)
def with_transformed_expressions(self, f, *args):
def with_transformed_expressions(self, f, assignee_f=None):
if assignee_f is None:
assignee_f = f
return self.copy(
assignee=f(self.assignee, *args),
expression=f(self.expression, *args),
assignee=assignee_f(self.assignee),
expression=f(self.expression),
predicates=frozenset(
f(pred, *args) for pred in self.predicates))
f(pred) for pred in self.predicates))
# }}}
......@@ -1114,12 +1121,15 @@ class CallInstruction(MultiAssignmentBase):
_get_assignee_subscript_deps(a)
for a in self.assignees)
def with_transformed_expressions(self, f, *args):
def with_transformed_expressions(self, f, assignee_f=None):
if assignee_f is None:
assignee_f = f
return self.copy(
assignees=f(self.assignees, *args),
expression=f(self.expression, *args),
assignees=assignee_f(self.assignees),
expression=f(self.expression),
predicates=frozenset(
f(pred, *args) for pred in self.predicates))
f(pred) for pred in self.predicates))
# }}}
......@@ -1315,14 +1325,17 @@ class CInstruction(InstructionBase):
_get_assignee_subscript_deps(a)
for a in self.assignees)
def with_transformed_expressions(self, f, *args):
def with_transformed_expressions(self, f, assignee_f=None):
if assignee_f is None:
assignee_f = f
return self.copy(
iname_exprs=[
(name, f(expr, *args))
(name, f(expr))
for name, expr in self.iname_exprs],
assignees=[f(a, *args) for a in self.assignees],
assignees=[assignee_f(a) for a in self.assignees],
predicates=frozenset(
f(pred, *args) for pred in self.predicates))
f(pred) for pred in self.predicates))
# }}}
......@@ -1357,7 +1370,7 @@ class _DataObliviousInstruction(InstructionBase):
def assignee_subscript_deps(self):
return frozenset()
def with_transformed_expressions(self, f, *args):
def with_transformed_expressions(self, f, assignee_f=None):
return self.copy(
predicates=frozenset(
f(pred) for pred in self.predicates))
......
......@@ -34,8 +34,7 @@ import numpy as np
import islpy as isl
from islpy import dim_type
from loopy.diagnostic import LoopyError, warn_with_kernel
from pytools import memoize_on_first_arg
from loopy.tools import natsorted
from pytools import memoize_on_first_arg, natsorted
import logging
logger = logging.getLogger(__name__)
......@@ -1381,7 +1380,7 @@ def draw_dependencies_as_unicode_arrows(
.replace(style.RESET_ALL, ""))
return len(s)
def truncate_without_color_escapes(s, l):
def truncate_without_color_escapes(s, length):
# FIXME: This is a bit dumb--it removes color escapes when truncation
# is needed.
......@@ -1389,7 +1388,7 @@ def draw_dependencies_as_unicode_arrows(
.replace(fore.RED, "")
.replace(style.RESET_ALL, ""))
return s[:l] + u""
return s[:length] + u""
def conform_to_uniform_length(s):
len_s = len_without_color_escapes(s)
......@@ -1428,6 +1427,8 @@ def stringify_instruction_list(kernel):
def insert_insn_into_order(insn):
if insn.id in printed_insn_ids:
# Note: dependency cycles are deliberately ignored so that printing
# succeeds.
return
printed_insn_ids.add(insn.id)
......@@ -1511,7 +1512,7 @@ def stringify_instruction_list(kernel):
", ".join("%s=%s" % (name, expr)
for name, expr in insn.iname_exprs))
trailing = [l for l in insn.code.split("\n")]
trailing = insn.code.split("\n")
elif isinstance(insn, lp.BarrierInstruction):
lhs = ""
rhs = "... %sbarrier" % insn.synchronization_kind[0]
......@@ -1583,6 +1584,13 @@ def stringify_instruction_list(kernel):
# {{{ global barrier order finding
def _is_global_barrier(kernel, insn_id):
    """Return whether *insn_id* names a global barrier instruction.

    :arg kernel: the kernel whose ``id_to_insn`` mapping is consulted.
    :arg insn_id: id of the instruction to check.
    :returns: *True* if the instruction is a ``BarrierInstruction`` whose
        ``synchronization_kind`` is ``"global"``, *False* otherwise.
    """
    insn = kernel.id_to_insn[insn_id]
    from loopy.kernel.instruction import BarrierInstruction
    return isinstance(insn, BarrierInstruction) and \
        insn.synchronization_kind == "global"
@memoize_on_first_arg
def get_global_barrier_order(kernel):
"""Return a :class:`tuple` of the listing the ids of global barrier instructions
......@@ -1590,49 +1598,27 @@ def get_global_barrier_order(kernel):
See also :class:`loopy.instruction.BarrierInstruction`.
"""
barriers = []
visiting = set()
visited = set()
unvisited = set(insn.id for insn in kernel.instructions)
def is_barrier(my_insn_id):
insn = kernel.id_to_insn[my_insn_id]
from loopy.kernel.instruction import BarrierInstruction
return isinstance(insn, BarrierInstruction) and \
insn.synchronization_kind == "global"
while unvisited:
stack = [unvisited.pop()]
while stack:
top = stack[-1]
if top in visiting:
visiting.remove(top)
if is_barrier(top):
barriers.append(top)
dep_graph = {insn.id: set() for insn in kernel.instructions}
for insn in kernel.instructions:
for dep in insn.depends_on:
dep_graph[dep].add(insn.id)
if top in visited:
stack.pop()
continue
from pytools.graph import compute_topological_order
order = compute_topological_order(dep_graph)
visited.add(top)
visiting.add(top)
barriers = [
insn_id for insn_id in order
if _is_global_barrier(kernel, insn_id)]
for child in kernel.id_to_insn[top].depends_on:
# Check for no cycles.
assert child not in visiting
stack.append(child)
del order
# Ensure this is the only possible order.
#
# We do this by looking at the barriers in order.
# We check for each adjacent pair (a,b) in the order if a < b,
# i.e. if a is reachable by a chain of dependencies from b.
visiting.clear()
visited.clear()
visited = set()
visiting = set()
for prev_barrier, barrier in zip(barriers, barriers[1:]):
# Check if prev_barrier is reachable from barrier.
......@@ -1690,12 +1676,6 @@ def find_most_recent_global_barrier(kernel, insn_id):
if len(insn.depends_on) == 0:
return None
def is_barrier(my_insn_id):
insn = kernel.id_to_insn[my_insn_id]
from loopy.kernel.instruction import BarrierInstruction
return isinstance(insn, BarrierInstruction) and \
insn.synchronization_kind == "global"
global_barrier_to_ordinal = dict(
(b, i) for i, b in enumerate(global_barrier_order))
......@@ -1705,7 +1685,7 @@ def find_most_recent_global_barrier(kernel, insn_id):
else -1)
direct_barrier_dependencies = set(
dep for dep in insn.depends_on if is_barrier(dep))
dep for dep in insn.depends_on if _is_global_barrier(kernel, dep))
if len(direct_barrier_dependencies) > 0:
return max(direct_barrier_dependencies, key=get_barrier_ordinal)
......@@ -1727,8 +1707,8 @@ def get_subkernels(kernel):
See also :class:`loopy.schedule.CallKernel`.
"""
from loopy.kernel import KernelState
if kernel.state != KernelState.SCHEDULED:
raise LoopyError("Kernel must be scheduled")
if kernel.state != KernelState.LINEARIZED:
raise LoopyError("Kernel must be linearized")
from loopy.schedule import CallKernel
......@@ -1744,7 +1724,7 @@ def get_subkernel_to_insn_id_map(kernel):
kernel must be scheduled.
"""
from loopy.kernel import KernelState
if kernel.state != KernelState.SCHEDULED:
if kernel.state != KernelState.LINEARIZED:
raise LoopyError("Kernel must be scheduled")
from loopy.schedule import (
......
......@@ -37,6 +37,7 @@ from loopy.version import DATA_MODEL_VERSION
from loopy.kernel.data import make_assignment, filter_iname_tags_by_type
# for the benefit of loopy.statistics, for now
from loopy.type_inference import infer_unknown_types
from loopy.transform.iname import remove_any_newly_unused_inames
import logging
logger = logging.getLogger(__name__)
......@@ -289,7 +290,7 @@ def _classify_reduction_inames(kernel, inames):
nonlocal_par = []
from loopy.kernel.data import (
LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
LocalIndexTagBase, UnrolledIlpTag, UnrollTag,
ConcurrentTag, filter_iname_tags_by_type)
for iname in inames:
......@@ -303,7 +304,7 @@ def _classify_reduction_inames(kernel, inames):
elif filter_iname_tags_by_type(iname_tags, LocalIndexTagBase):
local_par.append(iname)
elif filter_iname_tags_by_type(iname_tags, (ConcurrentTag, VectorizeTag)):
elif filter_iname_tags_by_type(iname_tags, ConcurrentTag):
nonlocal_par.append(iname)
else:
......@@ -882,6 +883,7 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain):
# }}}
@remove_any_newly_unused_inames
def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
automagic_scans_ok=False, force_scan=False,
force_outer_iname_for_scan=None):
......@@ -1370,7 +1372,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
track_iname = var_name_gen(
"{sweep_iname}__seq_scan"
.format(scan_iname=scan_iname, sweep_iname=sweep_iname))
.format(sweep_iname=sweep_iname))
get_or_add_sweep_tracking_iname_and_domain(
scan_iname, sweep_iname, sweep_min_value, scan_min_value,
......@@ -1480,7 +1482,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
track_iname = var_name_gen(
"{sweep_iname}__pre_scan"
.format(scan_iname=scan_iname, sweep_iname=sweep_iname))
.format(sweep_iname=sweep_iname))
get_or_add_sweep_tracking_iname_and_domain(
scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride,
......@@ -1924,8 +1926,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
kernel = lp.tag_inames(kernel, new_iname_tags)
# TODO: remove unused inames...
kernel = (
_hackily_ensure_multi_assignment_return_values_are_scoped_private(
kernel))
......@@ -1979,7 +1979,7 @@ def find_idempotence(kernel):
# Find SCCs of dep_graph. These are used for checking if the instruction is
# in a dependency cycle.
from loopy.tools import compute_sccs
from pytools.graph import compute_sccs
sccs = dict((item, scc)
for scc in compute_sccs(dep_graph)
......
......@@ -212,12 +212,12 @@ def find_loop_nest_with_map(kernel):
"""
result = {}
from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
from loopy.kernel.data import ConcurrentTag, IlpBaseTag
all_nonpar_inames = set(
iname for iname in kernel.all_inames()
if not kernel.iname_tags_of_type(iname,
(ConcurrentTag, IlpBaseTag, VectorizeTag)))
(ConcurrentTag, IlpBaseTag)))
iname_to_insns = kernel.iname_to_insns()
......@@ -276,7 +276,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
result = {}
from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
from loopy.kernel.data import ConcurrentTag, IlpBaseTag
for insn in kernel.instructions:
for iname in kernel.insn_inames(insn):
if kernel.iname_tags_of_type(iname, ConcurrentTag):
......@@ -310,7 +310,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
continue
if kernel.iname_tags_of_type(dep_insn_iname,
(ConcurrentTag, IlpBaseTag, VectorizeTag)):
(ConcurrentTag, IlpBaseTag)):
# Parallel tags don't really nest, so we'll disregard
# them here.
continue
......@@ -1841,7 +1841,7 @@ def generate_loop_schedules(kernel, debug_args={}):
def generate_loop_schedules_inner(kernel, debug_args={}):
from loopy.kernel import KernelState
if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED):
if kernel.state not in (KernelState.PREPROCESSED, KernelState.LINEARIZED):
raise LoopyError("cannot schedule a kernel that has not been "
"preprocessed")
......@@ -1852,7 +1852,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
debug = ScheduleDebugger(**debug_args)
preschedule = kernel.schedule if kernel.state == KernelState.SCHEDULED else ()
preschedule = kernel.schedule if kernel.state == KernelState.LINEARIZED else ()
prescheduled_inames = set(
insn.iname
......@@ -1904,7 +1904,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
unscheduled_insn_ids=set(insn.id for insn in kernel.instructions),
scheduled_insn_ids=frozenset(),
within_subkernel=kernel.state != KernelState.SCHEDULED,
within_subkernel=kernel.state != KernelState.LINEARIZED,
may_schedule_global_barriers=True,
preschedule=preschedule,
......@@ -1973,11 +1973,11 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
new_kernel = kernel.copy(
schedule=gen_sched,
state=KernelState.SCHEDULED)
state=KernelState.LINEARIZED)
from loopy.schedule.device_mapping import \
map_schedule_onto_host_or_device
if kernel.state != KernelState.SCHEDULED:
if kernel.state != KernelState.LINEARIZED:
# Device mapper only gets run once.
new_kernel = map_schedule_onto_host_or_device(new_kernel)
......@@ -2029,6 +2029,15 @@ def _get_one_scheduled_kernel_inner(kernel):
def get_one_scheduled_kernel(kernel):
    """Deprecated alias for :func:`get_one_linearized_kernel`.

    Issues a :class:`DeprecationWarning` through ``warn_with_kernel`` and
    then returns ``get_one_linearized_kernel(kernel)``.
    """
    warn_with_kernel(
            kernel, "get_one_scheduled_kernel_deprecated",
            "get_one_scheduled_kernel is deprecated. "
            "Use get_one_linearized_kernel instead.",
            DeprecationWarning)
    return get_one_linearized_kernel(kernel)
def get_one_linearized_kernel(kernel):
from loopy import CACHING_ENABLED
sched_cache_key = kernel
......
......@@ -31,7 +31,7 @@ from loopy.schedule.tools import get_block_boundaries
def map_schedule_onto_host_or_device(kernel):
# FIXME: Should be idempotent.
from loopy.kernel import KernelState
assert kernel.state == KernelState.SCHEDULED
assert kernel.state == KernelState.LINEARIZED
from functools import partial
device_prog_name_gen = partial(
......
......@@ -1863,75 +1863,4 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
# }}}
# {{{ compat goop
def get_lmem_access_poly(knl):
    """Count the number of local memory accesses in a loopy kernel.

    get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the
    result with the mtype=['local'] option.

    :arg knl: the kernel to analyze.
    :returns: ``get_mem_access_map(knl)`` filtered by ``mtype=['local']``.
    """
    warn_with_kernel(knl, "deprecated_get_lmem_access_poly",
                     "get_lmem_access_poly is deprecated. Use "
                     "get_mem_access_map and filter the result with the "
                     "mtype=['local'] option.")
    return get_mem_access_map(knl).filter_by(mtype=['local'])
def get_DRAM_access_poly(knl):
    """Count the number of global memory accesses in a loopy kernel.

    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
    result with the mtype=['global'] option.

    :arg knl: the kernel to analyze.
    :returns: ``get_mem_access_map(knl)`` filtered by ``mtype=['global']``.
    """
    warn_with_kernel(knl, "deprecated_get_DRAM_access_poly",
                     "get_DRAM_access_poly is deprecated. Use "
                     "get_mem_access_map and filter the result with the "
                     "mtype=['global'] option.")
    return get_mem_access_map(knl).filter_by(mtype=['global'])
def get_gmem_access_poly(knl):
    """Count the number of global memory accesses in a loopy kernel.

    get_gmem_access_poly is deprecated. Use get_mem_access_map and filter the
    result with the mtype=['global'] option.

    :arg knl: the kernel to analyze.
    :returns: ``get_mem_access_map(knl)`` filtered by ``mtype=['global']``.
    """
    # The message names this function (not get_DRAM_access_poly) so that the
    # warning points the user at the call they actually made.
    warn_with_kernel(knl, "deprecated_get_gmem_access_poly",
                     "get_gmem_access_poly is deprecated. Use "
                     "get_mem_access_map and filter the result with the "
                     "mtype=['global'] option.")
    return get_mem_access_map(knl).filter_by(mtype=['global'])
def get_synchronization_poly(knl):
    """Count the number of synchronization events each work-item encounters in
    a loopy kernel.

    get_synchronization_poly is deprecated. Use get_synchronization_map
    instead.

    :arg knl: the kernel to analyze.
    :returns: the result of ``get_synchronization_map(knl)``.
    """
    warn_with_kernel(knl, "deprecated_get_synchronization_poly",
                     "get_synchronization_poly is deprecated. Use "
                     "get_synchronization_map instead.")
    return get_synchronization_map(knl)
def get_op_poly(knl, numpy_types=True):
    """Count the number of operations in a loopy kernel.

    get_op_poly is deprecated. Use get_op_map instead.

    :arg knl: the kernel to analyze.
    :arg numpy_types: forwarded unchanged as the second positional argument
        of ``get_op_map``.
    :returns: the result of ``get_op_map(knl, numpy_types)``.
    """
    warn_with_kernel(knl, "deprecated_get_op_poly",
                     "get_op_poly is deprecated. Use get_op_map instead.")
    return get_op_map(knl, numpy_types)
# }}}
# vim: foldmethod=marker
......@@ -273,8 +273,7 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase):
if not isinstance(other, type(expr)):
return self.treat_mismatch(expr, other, unis)
if (expr.inames != other.inames
or type(expr.operation) != type(other.operation) # noqa
):
or type(expr.operation) != type(other.operation)): # noqa
return []
return self.rec(expr.expr, other.expr, unis)
......@@ -971,7 +970,8 @@ class RuleAwareIdentityMapper(IdentityMapper):
# may perform tasks entirely unrelated to subst rules, so
# we must map assignees, too.
self.map_instruction(kernel,
insn.with_transformed_expressions(self, kernel, insn))
insn.with_transformed_expressions(
lambda expr: self(expr, kernel, insn)))
for insn in kernel.instructions]
return kernel.copy(instructions=new_insns)
......
......@@ -80,6 +80,11 @@ class DTypeRegistryWrapper(object):
def c99_preamble_generator(preamble_info):
    """Yield the C99 standard-header includes needed for the seen dtypes.

    :arg preamble_info: object whose ``seen_dtypes`` attribute is an iterable
        of dtypes offering ``is_integral()``, ``is_complex()`` and a
        ``numpy_dtype`` attribute.
    :returns: a generator of ``(sort_key, include_line)`` tuples, emitted in
        the order ``<stdint.h>``, ``<stdbool.h>``, ``<complex.h>``; each one
        only if at least one seen dtype requires it.
    """
    if any(dtype.is_integral() for dtype in preamble_info.seen_dtypes):
        yield ("10_stdint", "#include <stdint.h>")
    if any(dtype.numpy_dtype == np.dtype("bool")
           for dtype in preamble_info.seen_dtypes):
        yield ("10_stdbool", "#include <stdbool.h>")
    if any(dtype.is_complex() for dtype in preamble_info.seen_dtypes):
        yield ("10_complex", "#include <complex.h>")
def _preamble_generator(preamble_info):
......@@ -436,7 +441,7 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True):
arg_dtypes=arg_dtypes)
# binary functions
if (name in ["fmax", "fmin"]
if (name in ["fmax", "fmin", "copysign"]
and len(arg_dtypes) == 2):
dtype = np.find_common_type(
......@@ -1079,9 +1084,11 @@ class CTarget(CFamilyTarget):
@memoize_method
def get_dtype_registry(self):
from loopy.target.c.compyte.dtypes import (
DTypeRegistry, fill_registry_with_c99_stdint_types)
DTypeRegistry, fill_registry_with_c99_stdint_types,
fill_registry_with_c99_complex_types)
result = DTypeRegistry()
fill_registry_with_c99_stdint_types(result)
fill_registry_with_c99_complex_types(result)
return DTypeRegistryWrapper(result)
......
Subproject commit 25ee8b48fd0c7d9f0bd987c6862cdb1884fb1372
Subproject commit 7e48e1166a13cfbb7b60f909b071f088034ffda1
# coding: utf-8
"""OpenCL target integrated with PyOpenCL."""
from __future__ import division, absolute_import
......@@ -285,6 +286,9 @@ class PyOpenCLTarget(OpenCLTarget):
warnings) and support for complex numbers.
"""
# FIXME make prefixes conform to naming rules
# (see Reference: Loopy's Model of a Kernel)
host_program_name_prefix = "_lpy_host_"
host_program_name_suffix = ""
......@@ -299,7 +303,26 @@ class PyOpenCLTarget(OpenCLTarget):
self.device = device
self.pyopencl_module_name = pyopencl_module_name
comparison_fields = ["device"]
# NB: Not including 'device', as that is handled specially here.
hash_fields = OpenCLTarget.hash_fields + (
"pyopencl_module_name",)
comparison_fields = OpenCLTarget.comparison_fields + (
"pyopencl_module_name",)
def __eq__(self, other):
    """Compare two targets, treating ``device`` specially.

    The superclass comparison is performed first. After that, two targets
    are equal if both have no device, or if both have a device and the
    devices' ``persistent_unique_id`` values match.
    """
    # NOTE(review): on Python 3, a class body that defines __eq__ without
    # also defining __hash__ gets __hash__ set to None -- confirm that this
    # class provides a matching __hash__.
    if not super(PyOpenCLTarget, self).__eq__(other):
        return False

    # Exactly one side having a device means they cannot be equal.
    if (self.device is None) != (other.device is None):
        return False

    if self.device is not None:
        assert other.device is not None
        return (self.device.persistent_unique_id
                == other.device.persistent_unique_id)
    else:
        assert other.device is None
        return True
def update_persistent_hash(self, key_hash, key_builder):
super(PyOpenCLTarget, self).update_persistent_hash(key_hash, key_builder)
......