From ca93d84e42f1b271a7a596c860b16614a26481f2 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Sat, 29 Oct 2011 20:10:42 -0400 Subject: [PATCH] A few fixes. Some code shifting. Loosen up owed_barriers checking. --- loopy/__init__.py | 270 ++++++++++++++++++++++++++++++------------ loopy/check.py | 80 +++++++++++++ loopy/compiled.py | 1 - loopy/kernel.py | 277 ++++++++------------------------------------ loopy/schedule.py | 11 +- loopy/symbolic.py | 6 +- test/test_linalg.py | 43 ++++--- test/test_loopy.py | 47 ++++++++ 8 files changed, 403 insertions(+), 332 deletions(-) create mode 100644 test/test_loopy.py diff --git a/loopy/__init__.py b/loopy/__init__.py index 2b8dba210..aab4f1083 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -22,15 +22,209 @@ class LoopyAdvisory(UserWarning): from loopy.kernel import ScalarArg, ArrayArg, ImageArg -from loopy.kernel import make_kernel, AutoFitLocalIndexTag +from loopy.kernel import AutoFitLocalIndexTag from loopy.preprocess import preprocess_kernel from loopy.schedule import generate_loop_schedules from loopy.compiled import CompiledKernel, drive_timing_run +from loopy.check import check_kernels + +# }}} + +# {{{ kernel creation + +def make_kernel(*args, **kwargs): + """Second pass of kernel creation. Think about requests for iname duplication + and temporary variable declaration received as part of string instructions. + """ + + from loopy.kernel import LoopKernel + knl = LoopKernel(*args, **kwargs) + + knl = tag_dimensions( + knl.copy(iname_to_tag_requests=None), + knl.iname_to_tag_requests) + + new_insns = [] + new_domain = knl.domain + new_temp_vars = knl.temporary_variables.copy() + new_iname_to_tag = knl.iname_to_tag.copy() + + newly_created_vars = set() + + # {{{ reduction iname duplication helper function + + def duplicate_reduction_inames(reduction_expr, rec): + duplicate_inames = [iname + for iname, tag in insn.duplicate_inames_and_tags] + + child = rec(reduction_expr.expr) + new_red_inames = [] + did_something = False + + for iname in reduction_expr.inames: + if iname in duplicate_inames: + new_iname = knl.make_unique_var_name(iname, newly_created_vars) + + old_insn_inames.append(iname) + new_insn_inames.append(new_iname) + newly_created_vars.add(new_iname) + new_red_inames.append(new_iname) + did_something = True + else: + new_red_inames.append(iname) + + if did_something: + from loopy.symbolic import SubstitutionMapper + from pymbolic.mapper.substitutor import make_subst_func + from pymbolic import var + subst_dict = dict( + (old_iname, var(new_iname)) + for old_iname, new_iname in zip( + reduction_expr.inames, new_red_inames)) + subst_map = SubstitutionMapper(make_subst_func(subst_dict)) + + child = subst_map(child) + + for old_iname, new_iname in zip(reduction_expr.inames, new_red_inames): + new_iname_to_tag[new_iname] = insn_dup_iname_to_tag[old_iname] + + from loopy.symbolic import Reduction + return Reduction( + operation=reduction_expr.operation, + inames=tuple(new_red_inames), + expr=child) + + # }}} + + for insn in knl.instructions: + # {{{ iname duplication + + if insn.duplicate_inames_and_tags: + + insn_dup_iname_to_tag = dict(insn.duplicate_inames_and_tags) + + # {{{ duplicate non-reduction inames + + reduction_inames = insn.reduction_inames() + + duplicate_inames = [iname + for iname, tag in insn.duplicate_inames_and_tags + if iname not in reduction_inames] + + new_inames = [ + knl.make_unique_var_name( + iname, + extra_used_vars= + newly_created_vars) + for iname in duplicate_inames] + + for 
old_iname, new_iname in zip(duplicate_inames, new_inames): + new_tag = insn_dup_iname_to_tag[old_iname] + if new_tag is None: + new_tag = AutoFitLocalIndexTag() + new_iname_to_tag[new_iname] = new_tag + + newly_created_vars.update(new_inames) + + from loopy.isl_helpers import duplicate_axes + new_domain = duplicate_axes(new_domain, duplicate_inames, new_inames) + + from loopy.symbolic import SubstitutionMapper + from pymbolic.mapper.substitutor import make_subst_func + from pymbolic import var + old_to_new = dict( + (old_iname, var(new_iname)) + for old_iname, new_iname in zip(duplicate_inames, new_inames)) + subst_map = SubstitutionMapper(make_subst_func(old_to_new)) + new_expression = subst_map(insn.expression) + + # }}} + + # {{{ duplicate reduction inames + + if len(duplicate_inames) < len(insn.duplicate_inames_and_tags): + # there must've been requests to duplicate reduction inames + old_insn_inames = [] + new_insn_inames = [] + + from loopy.symbolic import ReductionCallbackMapper + new_expression = ( + ReductionCallbackMapper(duplicate_reduction_inames) + (new_expression)) + + from loopy.isl_helpers import duplicate_axes + for old, new in zip(old_insn_inames, new_insn_inames): + new_domain = duplicate_axes(new_domain, [old], [new]) + + # }}} + + insn = insn.copy( + assignee=subst_map(insn.assignee), + expression=new_expression, + forced_iname_deps=[ + old_to_new.get(iname, iname) for iname in insn.forced_iname_deps], + ) + + # }}} + + # {{{ temporary variable creation + + from loopy.kernel import ( + find_var_base_indices_and_shape_from_inames, + TemporaryVariable) + + if insn.temp_var_type is not None: + assignee_name = insn.get_assignee_var_name() + + assignee_indices = [] + from pymbolic.primitives import Variable + for index_expr in insn.get_assignee_indices(): + if (not isinstance(index_expr, Variable) + or not index_expr.name in insn.all_inames()): + raise RuntimeError( + "only plain inames are allowed in " + "the lvalue index when declaring the " + "variable '%s' in an instruction" + % assignee_name) + + assignee_indices.append(index_expr.name) + + from loopy.kernel import LocalIndexTagBase + from pytools import any + is_local = any( + isinstance(new_iname_to_tag.get(iname), LocalIndexTagBase) + for iname in assignee_indices) + + base_indices, shape = \ + find_var_base_indices_and_shape_from_inames( + new_domain, assignee_indices) + + new_temp_vars[assignee_name] = TemporaryVariable( + name=assignee_name, + dtype=np.dtype(insn.temp_var_type), + is_local=is_local, + base_indices=base_indices, + shape=shape) + + newly_created_vars.add(assignee_name) + + insn = insn.copy(temp_var_type=None) + + # }}} + + new_insns.append(insn) + + return knl.copy( + instructions=new_insns, + domain=new_domain, + temporary_variables=new_temp_vars, + iname_to_tag=new_iname_to_tag) # }}} # {{{ user-facing kernel manipulation functionality + def split_dimension(kernel, iname, inner_length, padded_length=None, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, @@ -407,80 +601,6 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non -def get_problems(kernel, parameters): - """ - :return: *(max_severity, list of (severity, msg))*, where *severity* ranges from 1-5. - '5' means 'will certainly not run'. 
- """ - msgs = [] - - def msg(severity, s): - msgs.append((severity, s)) - - glens, llens = kernel.get_grid_sizes_as_exprs() - - from pymbolic import evaluate - from pymbolic.mapper.evaluator import UnknownVariableError - try: - glens = evaluate(glens, parameters) - llens = evaluate(llens, parameters) - except UnknownVariableError, name: - raise RuntimeError("When checking your kernel for problems, " - "a value for parameter '%s' was not available. Pass " - "it in the 'parameters' kwarg to check_kernels()." - % name) - - if (max(len(glens), len(llens)) - > kernel.device.max_work_item_dimensions): - msg(5, "too many work item dimensions") - - for i in range(len(llens)): - if llens[i] > kernel.device.max_work_item_sizes[i]: - msg(5, "group axis %d too big" % i) - - from pytools import product - if product(llens) > kernel.device.max_work_group_size: - msg(5, "work group too big") - - import pyopencl as cl - from pyopencl.characterize import usable_local_mem_size - if kernel.local_mem_use() > usable_local_mem_size(kernel.device): - if kernel.device.local_mem_type == cl.device_local_mem_type.LOCAL: - msg(5, "using too much local memory") - else: - msg(4, "using more local memory than available--" - "possibly OK due to cache nature") - - const_arg_count = sum( - 1 for arg in kernel.args - if isinstance(arg, ArrayArg) and arg.constant_mem) - - if const_arg_count > kernel.device.max_constant_args: - msg(5, "too many constant arguments") - - max_severity = 0 - for sev, msg in msgs: - max_severity = max(sev, max_severity) - return max_severity, msgs - - - - -def check_kernels(kernel_gen, parameters, kill_level_min=3, - warn_level_min=1): - for kernel in kernel_gen: - max_severity, msgs = get_problems(kernel, parameters) - - for severity, msg in msgs: - if severity >= warn_level_min: - from warnings import warn - from loopy import LoopyAdvisory - warn(msg, LoopyAdvisory) - - if max_severity < kill_level_min: - yield kernel - -# }}} # {{{ convenience diff --git a/loopy/check.py b/loopy/check.py index 996ad9f48..f7932a098 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -212,3 +212,83 @@ def check_implemented_domains(kernel, implemented_domains): return True # }}} + +# {{{ user-invoked checks + +def get_problems(kernel, parameters): + """ + :return: *(max_severity, list of (severity, msg))*, where *severity* ranges from 1-5. + '5' means 'will certainly not run'. + """ + msgs = [] + + def msg(severity, s): + msgs.append((severity, s)) + + glens, llens = kernel.get_grid_sizes_as_exprs() + + from pymbolic import evaluate + from pymbolic.mapper.evaluator import UnknownVariableError + try: + glens = evaluate(glens, parameters) + llens = evaluate(llens, parameters) + except UnknownVariableError, name: + raise RuntimeError("When checking your kernel for problems, " + "a value for parameter '%s' was not available. Pass " + "it in the 'parameters' kwarg to check_kernels()." 
+ % name) + + if (max(len(glens), len(llens)) + > kernel.device.max_work_item_dimensions): + msg(5, "too many work item dimensions") + + for i in range(len(llens)): + if llens[i] > kernel.device.max_work_item_sizes[i]: + msg(5, "group axis %d too big" % i) + + from pytools import product + if product(llens) > kernel.device.max_work_group_size: + msg(5, "work group too big") + + import pyopencl as cl + from pyopencl.characterize import usable_local_mem_size + if kernel.local_mem_use() > usable_local_mem_size(kernel.device): + if kernel.device.local_mem_type == cl.device_local_mem_type.LOCAL: + msg(5, "using too much local memory") + else: + msg(4, "using more local memory than available--" + "possibly OK due to cache nature") + + from loopy.kernel import ArrayArg + const_arg_count = sum( + 1 for arg in kernel.args + if isinstance(arg, ArrayArg) and arg.constant_mem) + + if const_arg_count > kernel.device.max_constant_args: + msg(5, "too many constant arguments") + + max_severity = 0 + for sev, msg in msgs: + max_severity = max(sev, max_severity) + return max_severity, msgs + + + + +def check_kernels(kernel_gen, parameters={}, kill_level_min=5, + warn_level_min=1): + for kernel in kernel_gen: + max_severity, msgs = get_problems(kernel, parameters) + + for severity, msg in msgs: + if severity >= warn_level_min: + from warnings import warn + from loopy import LoopyAdvisory + warn(msg, LoopyAdvisory) + + if max_severity < kill_level_min: + yield kernel + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/compiled.py b/loopy/compiled.py index a263ea8f2..0c6ec7580 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -48,7 +48,6 @@ class CompiledKernel: else: self.size_args = size_args - from loopy.kernel import GroupIndexTag, LocalIndexTag gsize_expr, lsize_expr = kernel.get_grid_sizes_as_exprs() if not gsize_expr: gsize_expr = (1,) diff --git a/loopy/kernel.py b/loopy/kernel.py index 036b1257c..bc4588b87 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -305,14 +305,15 @@ class Instruction(Record): from pymbolic.primitives import Variable, Subscript if isinstance(self.assignee, Variable): - result = () + return () elif isinstance(self.assignee, Subscript): result = self.assignee.index + if not isinstance(result, tuple): + result = (result,) + return result else: raise RuntimeError("invalid lvalue '%s'" % self.assignee) - return result - @memoize_method def get_read_var_names(self): from loopy.symbolic import DependencyMapper @@ -437,6 +438,10 @@ class LoopKernel(Record): 'bulk' slabs with fewer conditionals. :ivar temporary_variables: :ivar iname_to_tag: + + The following two instance variables are only used until :func:`loopy.kernel.make_kernel` is + finished: + :ivar iname_to_tag_requests: """ def __init__(self, device, domain, instructions, args=None, schedule=None, @@ -445,14 +450,12 @@ class LoopKernel(Record): iname_slab_increments={}, temporary_variables={}, workgroup_size=None, - iname_to_dim=None, - iname_to_tag={}, - ): + iname_to_tag={}, iname_to_tag_requests=None): """ :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl. 
Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}" """ - assert iname_to_dim is None + assert not iname_to_tag_requests import re @@ -460,12 +463,14 @@ class LoopKernel(Record): ctx = isl.Context() domain = isl.Set.read_from_str(ctx, domain) - DUP_ENTRY_RE = re.compile( + iname_to_tag_requests = {} + + INAME_ENTRY_RE = re.compile( r"^\s*(?P<iname>\w+)\s*(?:\:\s*(?P<tag>[\w.]+))?\s*$") LABEL_DEP_RE = re.compile( r"^\s*(?:(?P<label>\w+):)?" "\s*(?:\[" - "(?P<iname_deps>[\s\w,]*)" + "(?P<iname_deps_and_tags>[\s\w,:.]*)" "(?:\|(?P<duplicate_inames_and_tags>[\s\w,:.]*))?" "\])?" "\s*(?:\<(?P<temp_var_type>.+)\>)?" @@ -473,10 +478,35 @@ class LoopKernel(Record): "\s*?(?:\:\s*(?P<insn_deps>[\s\w,]+))?$" ) + def parse_iname_and_tag_list(s): + dup_entries = [ + dep.strip() for dep in s.split(",")] + result = [] + for entry in dup_entries: + if not entry: + continue + + entry_match = INAME_ENTRY_RE.match(entry) + if entry_match is None: + raise RuntimeError( + "could not parse iname:tag entry '%s'" + % entry) + + groups = entry_match.groupdict() + iname = groups["iname"] + assert iname + + tag = None + if groups["tag"] is not None: + tag = parse_tag(groups["tag"]) + + result.append((iname, tag)) + + return result + def parse_if_necessary(insn): from pymbolic import parse - if isinstance(insn, Instruction): return insn if isinstance(insn, str): @@ -494,35 +524,17 @@ class LoopKernel(Record): else: insn_deps = [] - if groups["iname_deps"] is not None: - forced_iname_deps = [dep.strip() - for dep in groups["iname_deps"].split(",") - if dep.strip()] + if groups["iname_deps_and_tags"] is not None: + inames_and_tags = parse_iname_and_tag_list( + groups["iname_deps_and_tags"]) + forced_iname_deps = [iname for iname, tag in inames_and_tags] + iname_to_tag_requests.update(dict(inames_and_tags)) else: forced_iname_deps = [] if groups["duplicate_inames_and_tags"] is not None: - dup_entries = [ - dep.strip() for dep in groups["duplicate_inames_and_tags"].split(",")] - duplicate_inames_and_tags = [] - for dup_entry in dup_entries: - if not dup_entry: - continue - - dup_entry_match = DUP_ENTRY_RE.match(dup_entry) - if dup_entry_match is None: - raise RuntimeError( - "could not parse iname duplication entry '%s'" - % dup_entry) - - dup_groups = dup_entry_match.groupdict() - dup_iname = dup_groups["iname"] - assert dup_iname - dup_tag = None - if dup_groups["tag"] is not None: - dup_tag = parse_tag(dup_groups["tag"]) - - duplicate_inames_and_tags.append((dup_iname, dup_tag)) + duplicate_inames_and_tags = parse_iname_and_tag_list( + groups["duplicate_inames_and_tags"]) else: duplicate_inames_and_tags = [] @@ -574,7 +586,8 @@ class LoopKernel(Record): iname_slab_increments=iname_slab_increments, temporary_variables=temporary_variables, workgroup_size=workgroup_size, - iname_to_tag=iname_to_tag) + iname_to_tag=iname_to_tag, + iname_to_tag_requests=iname_to_tag_requests) def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()): if insns is None: @@ -830,196 +843,4 @@ def find_var_base_indices_and_shape_from_inames(domain, inames): -# {{{ count number of uses of each reduction iname - -# }}} - - - -# {{{ pass 2 of kernel creation - -def make_kernel(*args, **kwargs): - """Second pass of kernel creation. Think about requests for iname duplication - and temporary variable declaration received as part of string instructions. 
- """ - - knl = LoopKernel(*args, **kwargs) - - new_insns = [] - new_domain = knl.domain - new_temp_vars = knl.temporary_variables.copy() - new_iname_to_tag = knl.iname_to_tag.copy() - - newly_created_vars = set() - - # {{{ reduction iname duplication helper function - - def duplicate_reduction_inames(reduction_expr, rec): - duplicate_inames = [iname - for iname, tag in insn.duplicate_inames_and_tags] - - child = rec(reduction_expr.expr) - new_red_inames = [] - did_something = False - - for iname in reduction_expr.inames: - if iname in duplicate_inames: - new_iname = knl.make_unique_var_name(iname, newly_created_vars) - - old_insn_inames.append(iname) - new_insn_inames.append(new_iname) - newly_created_vars.add(new_iname) - new_red_inames.append(new_iname) - reduction_iname_uses[iname] -= 1 - did_something = True - else: - new_red_inames.append(iname) - - if did_something: - from loopy.symbolic import SubstitutionMapper - from pymbolic.mapper.substitutor import make_subst_func - from pymbolic import var - subst_dict = dict( - (old_iname, var(new_iname)) - for old_iname, new_iname in zip( - reduction_expr.inames, new_red_inames)) - subst_map = SubstitutionMapper(make_subst_func(subst_dict)) - - child = subst_map(child) - - for old_iname, new_iname in zip(reduction_expr.inames, new_red_inames): - new_iname_to_tag[new_iname] = insn_dup_iname_to_tag[old_iname] - - from loopy.symbolic import Reduction - return Reduction( - operation=reduction_expr.operation, - inames=tuple(new_red_inames), - expr=child) - - # }}} - - for insn in knl.instructions: - # {{{ iname duplication - - if insn.duplicate_inames_and_tags: - - insn_dup_iname_to_tag = dict(insn.duplicate_inames_and_tags) - - # {{{ duplicate non-reduction inames - - reduction_inames = insn.reduction_inames() - - duplicate_inames = [iname - for iname, tag in insn.duplicate_inames_and_tags - if iname not in reduction_inames] - - new_inames = [ - knl.make_unique_var_name( - iname, - extra_used_vars= - newly_created_vars) - for iname in duplicate_inames] - - for old_iname, new_iname in zip(duplicate_inames, new_inames): - new_tag = insn_dup_iname_to_tag[old_iname] - if new_tag is None: - new_tag = AutoFitLocalIndexTag() - new_iname_to_tag[new_iname] = new_tag - - newly_created_vars.update(new_inames) - - from loopy.isl_helpers import duplicate_axes - new_domain = duplicate_axes(new_domain, duplicate_inames, new_inames) - - from loopy.symbolic import SubstitutionMapper - from pymbolic.mapper.substitutor import make_subst_func - old_to_new = dict( - (old_iname, var(new_iname)) - for old_iname, new_iname in zip(duplicate_inames, new_inames)) - subst_map = SubstitutionMapper(make_subst_func(old_to_new)) - new_expression = subst_map(insn.expression) - - # }}} - - # {{{ duplicate reduction inames - - if len(duplicate_inames) < len(insn.duplicate_inames_and_tags): - # there must've been requests to duplicate reduction inames - old_insn_inames = [] - new_insn_inames = [] - - from loopy.symbolic import ReductionCallbackMapper - new_expression = ( - ReductionCallbackMapper(duplicate_reduction_inames) - (new_expression)) - - from loopy.isl_helpers import duplicate_axes - for old, new in zip(old_insn_inames, new_insn_inames): - new_domain = duplicate_axes(new_domain, [old], [new]) - - # }}} - - insn = insn.copy( - assignee=subst_map(insn.assignee), - expression=new_expression, - forced_iname_deps=[ - old_to_new.get(iname, iname) for iname in insn.forced_iname_deps], - ) - - # }}} - - # {{{ temporary variable creation - - if insn.temp_var_type is not None: - 
assignee_name = insn.get_assignee_var_name() - - assignee_indices = [] - from pymbolic.primitives import Variable - for index_expr in insn.get_assignee_indices(): - if (not isinstance(index_expr, Variable) - or not index_expr.name in insn.all_inames()): - raise RuntimeError( - "only plain inames are allowed in " - "the lvalue index when declaring the " - "variable '%s' in an instruction" - % assignee_name) - - assignee_indices.append(index_expr.name) - - from loopy.kernel import LocalIndexTagBase - from pytools import any - is_local = any( - isinstance(new_iname_to_tag.get(iname), LocalIndexTagBase) - for iname in assignee_indices) - - base_indices, shape = \ - find_var_base_indices_and_shape_from_inames( - new_domain, assignee_indices) - - new_temp_vars[assignee_name] = TemporaryVariable( - name=assignee_name, - dtype=np.dtype(insn.temp_var_type), - is_local=is_local, - base_indices=base_indices, - shape=shape) - - newly_created_vars.add(assignee_name) - - insn = insn.copy(temp_var_type=None) - - # }}} - - new_insns.append(insn) - - return knl.copy( - instructions=new_insns, - domain=new_domain, - temporary_variables=new_temp_vars, - iname_to_tag=new_iname_to_tag) - -# }}} - - - - # vim: foldmethod=marker diff --git a/loopy/schedule.py b/loopy/schedule.py index d61295676..9b597e844 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -410,9 +410,6 @@ def insert_barriers(kernel, schedule, level=0): if assignee_temp_var is not None and assignee_temp_var.is_local: dep = get_barrier_dependent_in_schedule(kernel, insn.id, schedule) - if level == 0: - assert dep - if dep: issue_barrier(is_pre_barrier=True, dep=dep) @@ -438,7 +435,13 @@ def generate_loop_schedules(kernel, loop_priority=[]): for gen_sched in generate_loop_schedules_internal(kernel, loop_priority): gen_sched, owed_barriers = insert_barriers(kernel, gen_sched) - assert not owed_barriers + if owed_barriers: + from warnings import warn + from loopy import LoopyAdvisory + warn("Barrier insertion finished without inserting barriers for " + "local memory writes in these instructions: '%s'. " + "This often means that local memory was " + "written, but never read." 
% ",".join(owed_barriers), LoopyAdvisory) yield kernel.copy(schedule=gen_sched) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 5e08fbd44..e2f48eed9 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -338,9 +338,13 @@ class LoopyCCodeMapper(CCodeMapper): elif expr.aggregate.name in self.kernel.temporary_variables: temp_var = self.kernel.temporary_variables[expr.aggregate.name] + if isinstance(expr.index, tuple): + index = expr.index + else: + index = (expr.index,) return (temp_var.name + "".join("[%s]" % self.rec(idx, PREC_NONE) - for idx in expr.index)) + for idx in index)) else: raise RuntimeError("nothing known about variable '%s'" % expr.aggregate.name) diff --git a/test/test_linalg.py b/test/test_linalg.py index 83b13f74f..265353abe 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -133,7 +133,7 @@ def test_axpy(ctx_factory): for variant in [variant_cpu, variant_gpu]: kernel_gen = lp.generate_loop_schedules(variant(knl), loop_priority=["i_inner_outer"]) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5) + kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) def launcher(kernel, gsize, lsize, check): evt = kernel(queue, gsize(n), lsize(n), 2, a.data, 3, b.data, c.data, n, @@ -178,7 +178,7 @@ def test_transpose(ctx_factory): knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ]) kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, {}, kill_level_min=5) + kernel_gen = lp.check_kernels(kernel_gen, {}) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = cl_array.empty_like(a) @@ -228,7 +228,7 @@ def test_plain_matrix_mul(ctx_factory): knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ]) kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, {}, kill_level_min=5) + kernel_gen = lp.check_kernels(kernel_gen, {}) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) @@ -282,7 +282,7 @@ def test_variable_size_matrix_mul(ctx_factory): knl = lp.realize_cse(knl, "rhsmat", dtype, ["j_inner", "k_inner"]) kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5) + kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) @@ -376,7 +376,7 @@ def test_rank_one(ctx_factory): for variant in [variant_1, variant_2, variant_4]: kernel_gen = lp.generate_loop_schedules(variant(knl)) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5) + kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) a = cl_random.rand(queue, n, dtype=dtype) b = cl_random.rand(queue, n, dtype=dtype) @@ -430,7 +430,7 @@ def test_troublesome_premagma_fermi_matrix_mul(ctx_factory): knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"]) kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5) + kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) @@ -492,7 +492,7 @@ def test_intel_matrix_mul(ctx_factory): kernel_gen = lp.generate_loop_schedules(knl) #hints=["k_outer", "k_inner_outer", "k_inner_inner"] - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), 
kill_level_min=5) + kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) @@ -526,7 +526,7 @@ def test_magma_fermi_matrix_mul(ctx_factory): knl = lp.make_kernel(ctx.devices[0], "{[i,j,k]: 0<=i,j,k<%d}" % n, [ - "c[i, j] = a[i, k]*b[k, j]" + "c[i, j] = sum_float32(k, a[i, k]*b[k, j])" ], [ lp.ImageArg("a", dtype, 2), @@ -539,20 +539,18 @@ def test_magma_fermi_matrix_mul(ctx_factory): j_reg = 4 i_chunks = 16 j_chunks = 16 - knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0", no_slabs=True) - knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp", no_slabs=True) - knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1", no_slabs=True) - knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp", no_slabs=True) - knl = lp.split_dimension(knl, "k", 16, no_slabs=True) + knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0") + knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp") + knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1") + knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp") + knl = lp.split_dimension(knl, "k", 16) #knl = lp.split_dimension(knl, "k_inner", 8, outer_tag="unr") knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")]) knl = lp.add_prefetch(knl, 'b', ["k_inner", ("j_inner_inner", "j_inner_outer"),]) - assert knl.get_problems({})[0] <= 2 - kernel_gen = (lp.insert_register_prefetches(knl) - for knl in lp.generate_loop_schedules(knl, - hints=["k_outer", "k_inner_outer", "k_inner_inner"] - )) + kernel_gen = lp.generate_loop_schedules(knl) + #hints=["k_outer", "k_inner_outer", "k_inner_inner"] + kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) @@ -603,10 +601,9 @@ def test_image_matrix_mul(ctx_factory): # conflict-free knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"]) - assert knl.get_problems({})[0] <= 2 - kernel_gen = (lp.insert_register_prefetches(knl) - for knl in lp.generate_loop_schedules(knl)) + kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) @@ -662,7 +659,7 @@ def test_image_matrix_mul_ilp(ctx_factory): #knl = lp.add_prefetch(knl, 'b', [("j_inner_outer", "j_inner_inner"), "k_inner"]) kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5) + kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) @@ -716,7 +713,7 @@ def test_fancy_matrix_mul(ctx_factory): knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"]) kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5) + kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order, ran_factor=0) diff --git a/test/test_loopy.py b/test/test_loopy.py new file mode 100644 
index 000000000..28561d56f --- /dev/null +++ b/test/test_loopy.py @@ -0,0 +1,47 @@ +from __future__ import division + +import numpy as np +import pyopencl as cl +import pyopencl.array as cl_array +import pyopencl.clrandom as cl_random +import loopy as lp + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + + + + +def test_owed_barriers(ctx_factory): + dtype = np.float32 + ctx = ctx_factory() + order = "C" + queue = cl.CommandQueue(ctx, + properties=cl.command_queue_properties.PROFILING_ENABLE) + + knl = lp.make_kernel(ctx.devices[0], + "{[i]: 0<=i<100}", + [ + "[i:l.0] <float32> z[i] = a[i]" + ], + [ + lp.ArrayArg("a", dtype, shape=(100,)), + ]) + + kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.check_kernels(kernel_gen) + + for gen_knl in kernel_gen: + compiled = lp.CompiledKernel(ctx, gen_knl) + print compiled.code + + + + +if __name__ == "__main__": + import sys + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) -- GitLab
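
Usage note (a minimal sketch, not part of the patch): after this change, iname tag requests and temporary-variable declarations are parsed out of the instruction strings by the new make_kernel pass in loopy/__init__.py, and check_kernels now lives in loopy.check (re-exported from loopy) with parameters defaulting to {} and kill_level_min to 5 -- which is why the explicit kill_level_min=5 arguments disappear from test_linalg.py. The sketch below mirrors the new test/test_loopy.py; the pyopencl context setup via cl.create_some_context() and the instruction string are illustrative assumptions only.

    import numpy as np
    import pyopencl as cl
    import loopy as lp

    ctx = cl.create_some_context()   # any OpenCL context/device will do

    knl = lp.make_kernel(ctx.devices[0],
            "{[i]: 0<=i<100}",
            [
                # "i:l.0" requests a local-axis tag for i, "<float32>" declares
                # the temporary z -- both handled by the new make_kernel pass
                "[i:l.0] <float32> z[i] = a[i]"
            ],
            [
                lp.ArrayArg("a", np.float32, shape=(100,)),
            ])

    kernel_gen = lp.generate_loop_schedules(knl)
    # new defaults: parameters={}, kill_level_min=5, warn_level_min=1
    kernel_gen = lp.check_kernels(kernel_gen)

    for gen_knl in kernel_gen:
        compiled = lp.CompiledKernel(ctx, gen_knl)
        print compiled.code   # Python 2 print, as in the test suite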