diff --git a/.gitignore b/.gitignore index c5865116226b2cdc15ed4978548416090265e115..7ae71dc52e28413d3da125b4e5e9a64c2c970a19 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,5 @@ core .coverage htmlcov .ipynb_checkpoints +lextab.py +yacctab.py diff --git a/bin/loopy b/bin/loopy index c8cbea1c615c396d3903df24472822cc918d3461..b47127740b4d016d823de3874fd7398c031d8160 100644 --- a/bin/loopy +++ b/bin/loopy @@ -186,8 +186,8 @@ def main(): code, impl_arg_info = generate_code(kernel) codes.append(code) - if args.outfile: - outfile, = args.outfile + if args.outfile is not None: + outfile = args.outfile else: outfile = "-" diff --git a/doc/reference.rst b/doc/reference.rst index e79f17554119c5efe9351f46b8c501c2c27d2387..52cd92bb8812565e71ba4cedf1826bf64633f247 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -430,6 +430,8 @@ Influencing data access .. autofunction:: tag_data_axes +.. autofunction:: remove_unused_arguments + Padding ^^^^^^^ @@ -537,6 +539,8 @@ Obtaining Kernel Statistics .. autofunction:: get_gmem_access_poly +.. autofunction:: sum_mem_access_to_bytes + .. autofunction:: get_barrier_poly .. vim: tw=75:spell diff --git a/doc/tutorial.rst b/doc/tutorial.rst index c1ce360c88b962ef438809500413d747a3cd4b43..0df707d0db82fcf42b60e8405d1f0d189fb31354 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1233,7 +1233,7 @@ the :class:`loopy.LoopKernel` *inames*). We'll print this map now: .. doctest:: - >>> print(op_map) + >>> print(lp.stringify_stats_mapping(op_map)) float32 : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 } float64 : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 } int32 : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 } @@ -1244,9 +1244,9 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} - >>> i32ops = op_map.dict[np.dtype(np.int32)].eval_with_dict(param_dict) - >>> f32ops = op_map.dict[np.dtype(np.float32)].eval_with_dict(param_dict) - >>> f64ops = op_map.dict[np.dtype(np.float64)].eval_with_dict(param_dict) + >>> i32ops = op_map[np.dtype(np.int32)].eval_with_dict(param_dict) + >>> f32ops = op_map[np.dtype(np.float32)].eval_with_dict(param_dict) + >>> f64ops = op_map[np.dtype(np.float64)].eval_with_dict(param_dict) >>> print("integer ops: %i\nfloat32 ops: %i\nfloat64 ops: %i" % ... (i32ops, f32ops, f64ops)) integer ops: 65536 @@ -1264,7 +1264,7 @@ continue using the kernel from the previous example: >>> from loopy.statistics import get_gmem_access_poly >>> load_store_map = get_gmem_access_poly(knl) - >>> print(load_store_map) + >>> print(lp.stringify_stats_mapping(load_store_map)) (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 } (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 } (dtype('float64'), 'uniform', 'load') : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 } @@ -1295,13 +1295,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld = load_store_map.dict[(np.dtype(np.float64), "uniform", "load") + >>> f64ld = load_store_map[(np.dtype(np.float64), "uniform", "load") ... ].eval_with_dict(param_dict) - >>> f64st = load_store_map.dict[(np.dtype(np.float64), "uniform", "store") + >>> f64st = load_store_map[(np.dtype(np.float64), "uniform", "store") ... 
].eval_with_dict(param_dict) - >>> f32ld = load_store_map.dict[(np.dtype(np.float32), "uniform", "load") + >>> f32ld = load_store_map[(np.dtype(np.float32), "uniform", "load") ... ].eval_with_dict(param_dict) - >>> f32st = load_store_map.dict[(np.dtype(np.float32), "uniform", "store") + >>> f32st = load_store_map[(np.dtype(np.float32), "uniform", "store") ... ].eval_with_dict(param_dict) >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % ... (f32ld, f32st, f64ld, f64st)) @@ -1322,8 +1322,8 @@ this time, so we'll print the mapping manually to make it more legible: >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0") >>> load_store_map = get_gmem_access_poly(knl_consec) - >>> for key in sorted(load_store_map.dict.keys(), key=lambda k: str(k)): - ... print("%s :\n%s\n" % (key, load_store_map.dict[key])) + >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)): + ... print("%s :\n%s\n" % (key, load_store_map[key])) (dtype('float32'), 'consecutive', 'load') : [n, m, l] -> { ... } <BLANKLINE> @@ -1345,13 +1345,13 @@ accesses has not changed: .. doctest:: - >>> f64ld = load_store_map.dict[(np.dtype(np.float64), "consecutive", "load") + >>> f64ld = load_store_map[(np.dtype(np.float64), "consecutive", "load") ... ].eval_with_dict(param_dict) - >>> f64st = load_store_map.dict[(np.dtype(np.float64), "consecutive", "store") + >>> f64st = load_store_map[(np.dtype(np.float64), "consecutive", "store") ... ].eval_with_dict(param_dict) - >>> f32ld = load_store_map.dict[(np.dtype(np.float32), "consecutive", "load") + >>> f32ld = load_store_map[(np.dtype(np.float32), "consecutive", "load") ... ].eval_with_dict(param_dict) - >>> f32st = load_store_map.dict[(np.dtype(np.float32), "consecutive", "store") + >>> f32st = load_store_map[(np.dtype(np.float32), "consecutive", "store") ... ].eval_with_dict(param_dict) >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % ... (f32ld, f32st, f64ld, f64st)) @@ -1369,8 +1369,8 @@ our parallelization of the kernel: >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1") >>> load_store_map = get_gmem_access_poly(knl_nonconsec) - >>> for key in sorted(load_store_map.dict.keys(), key=lambda k: str(k)): - ... print("%s :\n%s\n" % (key, load_store_map.dict[key])) + >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)): + ... print("%s :\n%s\n" % (key, load_store_map[key])) (dtype('float32'), 'nonconsecutive', 'load') : [n, m, l] -> { ... } <BLANKLINE> @@ -1390,16 +1390,16 @@ elements in memory. The total number of array accesses has not changed: .. doctest:: - >>> f64ld = load_store_map.dict[ + >>> f64ld = load_store_map[ ... (np.dtype(np.float64), "nonconsecutive", "load") ... ].eval_with_dict(param_dict) - >>> f64st = load_store_map.dict[ + >>> f64st = load_store_map[ ... (np.dtype(np.float64), "nonconsecutive", "store") ... ].eval_with_dict(param_dict) - >>> f32ld = load_store_map.dict[ + >>> f32ld = load_store_map[ ... (np.dtype(np.float32), "nonconsecutive", "load") ... ].eval_with_dict(param_dict) - >>> f32st = load_store_map.dict[ + >>> f32st = load_store_map[ ... (np.dtype(np.float32), "nonconsecutive", "store") ... 
].eval_with_dict(param_dict) >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % diff --git a/loopy/__init__.py b/loopy/__init__.py index 8956856d4735a9554ed8c34741790ef1286d9e54..10fac9cfce3eb5a9a1bb6686fabf453ec6b83a8d 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -64,7 +64,8 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction, infer_unknown_types) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel from loopy.statistics import (get_op_poly, get_gmem_access_poly, - get_DRAM_access_poly, get_barrier_poly) + get_DRAM_access_poly, get_barrier_poly, stringify_stats_mapping, + sum_mem_access_to_bytes) from loopy.codegen import generate_code, generate_body from loopy.compiled import CompiledKernel from loopy.options import Options @@ -105,7 +106,7 @@ __all__ = [ "generate_code", "generate_body", "get_op_poly", "get_gmem_access_poly", "get_DRAM_access_poly", - "get_barrier_poly", + "get_barrier_poly", "stringify_stats_mapping", "sum_mem_access_to_bytes", "CompiledKernel", @@ -2089,4 +2090,22 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch"): # }}} + +# {{{ remove_unused_arguments + +def remove_unused_arguments(knl): + new_args = [] + + refd_vars = set(knl.all_params()) + for insn in knl.instructions: + refd_vars.update(insn.dependency_names()) + + for arg in knl.args: + if arg.name in refd_vars: + new_args.append(arg) + + return knl.copy(args=new_args) + +# }}} + # vim: foldmethod=marker diff --git a/loopy/check.py b/loopy/check.py index 3401f7b8d86eb8e89d0bdb0629b5e85d57fff771..01a6e52c268a5c4e76f94653490d05c345e1a7d0 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -373,6 +373,8 @@ def pre_schedule_checks(kernel): check_write_destinations(kernel) logger.info("pre-schedule check %s: done" % kernel.name) + except KeyboardInterrupt: + raise except: print(75*"=") print("failing kernel during pre-schedule check:") diff --git a/loopy/diff.py b/loopy/diff.py new file mode 100644 index 0000000000000000000000000000000000000000..d72465e6fd379eee5b36043f20566e6575e902b4 --- /dev/null +++ b/loopy/diff.py @@ -0,0 +1,397 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2015 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+"""
+
+import islpy as isl
+
+from pymbolic.mapper.differentiator import DifferentiationMapper
+
+import pymbolic.primitives as p
+var = p.Variable
+
+import loopy as lp
+from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext
+from loopy.isl_helpers import make_slab
+from loopy.diagnostic import LoopyError
+
+
+# {{{ diff mapper
+
+def func_map(i, func, args):
+    if func.name == "exp":
+        return var("exp")(*args)
+    elif func.name == "log":
+        return 1/args[0]
+
+    elif func.name == "sin":
+        return var("cos")(*args)
+    elif func.name == "cos":
+        return -var("sin")(*args)
+    elif func.name == "tan":
+        return 1+var("tan")(*args)**2
+
+    elif func.name == "sinh":
+        return var("cosh")(*args)
+    elif func.name == "cosh":
+        return var("sinh")(*args)
+    elif func.name == "tanh":
+        return 1 - var("tanh")(*args)**2
+
+    else:
+        raise NotImplementedError("derivative of '%s'" % func.name)
+
+
+class LoopyDiffMapper(DifferentiationMapper, RuleAwareIdentityMapper):
+    def __init__(self, rule_mapping_context, diff_context, diff_inames):
+        RuleAwareIdentityMapper.__init__(self, rule_mapping_context)
+        self.diff_context = diff_context
+        self.diff_inames = diff_inames
+        self.diff_iname_exprs = tuple(var(diname) for diname in diff_inames)
+        self.function_map = func_map
+
+    def rec_undiff(self, expr, *args):
+        dc = self.diff_context
+
+        from loopy.symbolic import get_dependencies
+        deps = get_dependencies(expr)
+        for dep in deps:
+            assert isinstance(dep, str)
+            if (dep in dc.kernel.arg_dict
+                    or dep in dc.kernel.temporary_variables):
+                dc.import_output_var(dep)
+
+        return expr
+
+    def map_variable(self, expr, *args):
+        dc = self.diff_context
+
+        if expr.name == dc.by_name:
+            assert len(self.diff_inames) == 0
+            return 1
+
+        elif (expr.name in dc.kernel.all_inames()
+                or expr.name in dc.kernel.all_params()):
+            return expr
+
+        else:
+            dvar_dby = dc.get_diff_var(expr.name)
+            if dvar_dby is None:
+                return 0
+
+            if self.diff_inames:
+                return var(dvar_dby)[self.diff_iname_exprs]
+            else:
+                return var(dvar_dby)
+
+    map_tagged_variable = map_variable
+
+    def map_subscript(self, expr, *args):
+        dc = self.diff_context
+
+        if expr.aggregate.name == dc.by_name:
+            index = expr.index
+            if not isinstance(expr.index, tuple):
+                index = (expr.index,)
+
+            assert len(self.diff_inames) == len(index)
+
+            conds = [
+                    p.Comparison(var(ti), "==", ei)
+                    for ti, ei in zip(self.diff_inames, index)
+                    ]
+
+            if len(conds) == 1:
+                and_conds, = conds
+            elif len(conds) > 1:
+                and_conds = p.LogicalAnd(tuple(conds))
+            else:
+                assert False
+
+            return p.If(and_conds, 1, 0)
+
+        else:
+            dvar_dby = dc.get_diff_var(expr.aggregate.name)
+            if dvar_dby is None:
+                return 0
+
+            idx = expr.index
+            if not isinstance(idx, tuple):
+                idx = (idx,)
+
+            return type(expr)(
+                    var(dvar_dby),
+                    idx + self.diff_iname_exprs)
+
+    def map_call(self, expr, *args):
+        dc = self.diff_context
+
+        if expr.function.name in dc.kernel.substitutions:
+            # FIXME: Deal with substitution rules
+            # Need to use chain rule here, too.
+ raise NotImplementedError("substitution rules in differentiation") + else: + return DifferentiationMapper.map_call(self, expr, *args) + +# }}} + + +# {{{ diff context + +class DifferentiationContext(object): + def __init__(self, kernel, var_name_gen, by_name, diff_iname_prefix, + additional_shape): + self.kernel = kernel + self.by_name = by_name + self.diff_iname_prefix = diff_iname_prefix + self.additional_shape = additional_shape + + self.imported_outputs = set() + self.output_to_diff_output = {} + + self.generate_instruction_id = self.kernel.get_instruction_id_generator() + + self.new_args = [] + self.new_temporary_variables = {} + self.new_instructions = [] + self.imported_instructions = set() + self.new_domains = [] + + self.rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, var_name_gen) + + def get_new_kernel(self): + knl = self.kernel + + new_args = knl.args + self.new_args + new_temp_vars = knl.temporary_variables.copy() + new_temp_vars.update(self.new_temporary_variables) + + knl = knl.copy( + args=new_args, + temporary_variables=new_temp_vars, + instructions=self.new_instructions, + domains=knl.domains + self.new_domains) + + del new_args + del new_temp_vars + + knl = self.rule_mapping_context.finish_kernel(knl) + + return knl + + # {{{ kernel gen entrypoints + + def add_diff_inames(self): + diff_inames = tuple( + self.rule_mapping_context.make_unique_var_name( + self.diff_iname_prefix+str(i)) + for i in range(len(self.additional_shape))) + + diff_parameters = set() + from loopy.symbolic import get_dependencies + for s in self.additional_shape: + diff_parameters.update(get_dependencies(s)) + + diff_domain = isl.BasicSet( + "[%s] -> {[%s]}" + % (", ".join(diff_parameters), ", ".join(diff_inames))) + + for i, diff_iname in enumerate(diff_inames): + diff_domain = diff_domain & make_slab( + diff_domain.space, diff_iname, 0, self.additional_shape[i]) + + self.new_domains.append(diff_domain) + + return diff_inames + + # }}} + + def import_instruction_and_deps(self, insn_id): + if insn_id in self.imported_instructions: + return + + insn = self.kernel.id_to_insn[insn_id] + self.new_instructions.append(insn) + self.imported_instructions.add(insn_id) + + id_map = RuleAwareIdentityMapper(self.rule_mapping_context) + + if isinstance(insn, lp.ExpressionInstruction): + id_map(insn.expression, self.kernel, insn) + else: + raise RuntimeError("do not know how to deal with " + "instruction of type %s" % type(insn)) + + for dep in insn.insn_deps: + self.import_instruction_and_deps(dep) + + def import_output_var(self, var_name): + writers = self.kernel.writer_map().get(var_name, []) + + if len(writers) > 1: + raise LoopyError("%s is written in more than one place" + % var_name) + + if not writers: + return + + insn_id, = writers + self.import_instruction_and_deps(insn_id) + + def get_diff_var(self, var_name): + """ + :return: a string containing the name of a new variable + holding the derivative of *var_name* by the desired + *diff_context.by_name*, or *None* if no dependency exists. + """ + new_var_name = self.rule_mapping_context.make_unique_var_name( + var_name + "_d" + self.by_name) + + writers = self.kernel.writer_map().get(var_name, []) + + if not writers: + # FIXME: There should be hooks to supply earlier dvar_dby + # This would be the spot to think about them. 
+ return None + + if len(writers) > 1: + raise LoopyError("%s is written in more than one place" + % var_name) + + orig_writer_id, = writers + orig_writer_insn = self.kernel.id_to_insn[orig_writer_id] + + diff_inames = self.add_diff_inames() + diff_iname_exprs = tuple(var(diname) for diname in diff_inames) + + # {{{ write code + + diff_mapper = LoopyDiffMapper(self.rule_mapping_context, self, + diff_inames) + + diff_expr = diff_mapper(orig_writer_insn.expression, + self.kernel, orig_writer_insn) + + if not diff_expr: + return None + + (_, lhs_ind), = orig_writer_insn.assignees_and_indices() + new_insn_id = self.generate_instruction_id() + insn = lp.ExpressionInstruction( + id=new_insn_id, + assignee=var(new_var_name)[ + lhs_ind + diff_iname_exprs], + expression=diff_expr) + + self.new_instructions.append(insn) + + # }}} + + # {{{ manage variable declaration + + if var_name in self.kernel.arg_dict: + arg = self.kernel.arg_dict[var_name] + orig_shape = arg.shape + + elif var_name in self.kernel.temporary_variables: + tv = self.kernel.temporary_variables[var_name] + orig_shape = tv.shape + + else: + raise ValueError("%s: variable not found" % var_name) + + shape = orig_shape + self.additional_shape + dim_tags = ("c",) * len(shape) + + if var_name in self.kernel.arg_dict: + self.new_args.append( + lp.GlobalArg( + new_var_name, + arg.dtype, + shape=shape, + dim_tags=dim_tags, + )) + + elif var_name in self.kernel.temporary_variables: + self.new_temporary_variables[new_var_name] = lp.TemporaryVariable( + new_var_name, + tv.dtype, + shape=shape, + dim_tags=dim_tags) + + # }}} + + return new_var_name + + # }}} + +# }}} + + +# {{{ entrypoint + +def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i", + batch_axes_in_by=frozenset(), copy_outputs=set()): + """ + + :arg batch_axes_in_by: a :class:`set` of axis indices in the variable named *by* + that are not part of the differentiation. + :return: a string containing the name of a new variable + holding the derivative of *var_name* by the desired + *diff_context.by_name*, or *None* if no dependency exists. 
+ """ + + from loopy.preprocess import add_default_dependencies + knl = add_default_dependencies(knl) + + if isinstance(diff_outputs, str): + diff_outputs = [ + dout.strip() for dout in diff_outputs.split(",") + if dout.strip()] + + by_arg = knl.arg_dict[by] + additional_shape = by_arg.shape + + var_name_gen = knl.get_var_name_generator() + + # {{{ differentiate instructions + + diff_context = DifferentiationContext( + knl, var_name_gen, by, diff_iname_prefix=diff_iname_prefix, + additional_shape=additional_shape) + + result = {} + for dout in diff_outputs: + result = diff_context.get_diff_var(dout) + + for cout in copy_outputs: + diff_context.import_output_var(cout) + + # }}} + + return diff_context.get_new_kernel(), result + +# }}} + + +# vim: foldmethod=marker diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 4e31db993517fc4426766a6e881fb7f56853915c..c5c0baec6d07fc3daf3adfa5a8e130d712575963 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -345,6 +345,11 @@ class LoopKernel(RecordWithoutPickling): def get_var_name_generator(self): return _UniqueVarNameGenerator(self.all_variable_names()) + def get_instruction_id_generator(self, based_on="insn"): + used_ids = set(insn.id for insn in self.instructions) + + return UniqueNameGenerator(used_ids) + def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()): if insns is None: diff --git a/loopy/statistics.py b/loopy/statistics.py index a6e7c33233728f6df01303ea683916bc1a335828..429a6a2116ab111f6cda85a95afe3255d73014bd 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -76,16 +76,17 @@ class ToCountMap: except KeyError: return isl.PwQPolynomial('{ 0 }') - def __str__(self): - result = "" - for key in sorted(self.dict.keys(), key=lambda k: str(k)): - result += ("%s : %s\n" % (key, self.dict[key])) - return result - def __repr__(self): return repr(self.dict) +def stringify_stats_mapping(m): + result = "" + for key in sorted(m.keys(), key=lambda k: str(k)): + result += ("%s : %s\n" % (key, m[key])) + return result + + class ExpressionOpCounter(CombineMapper): def __init__(self, knl): @@ -447,8 +448,7 @@ def get_op_poly(knl): domain = (inames_domain.project_out_except(insn_inames, [dim_type.set])) ops = op_counter(insn.assignee) + op_counter(insn.expression) op_poly = op_poly + ops*count(knl, domain) - - return op_poly + return op_poly.dict def get_gmem_access_poly(knl): # for now just counting subscripts @@ -463,10 +463,12 @@ def get_gmem_access_poly(knl): # for now just counting subscripts - The :class:`numpy.dtype` specifies the type of the data being accessed. - - The first string in the map key specifies the DRAM access type as + - The first string in the map key specifies the global memory + access type as *consecutive*, *nonconsecutive*, or *uniform*. - - The second string in the map key specifies the DRAM access type as a + - The second string in the map key specifies the global memory + access type as a *load*, or a *store*. 
- The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses @@ -515,8 +517,7 @@ def get_gmem_access_poly(knl): # for now just counting subscripts for key, val in six.iteritems(subs_assignee.dict))) subs_poly = subs_poly + (subs_expr + subs_assignee)*count(knl, domain) - - return subs_poly + return subs_poly.dict def get_DRAM_access_poly(knl): @@ -526,6 +527,27 @@ def get_DRAM_access_poly(knl): return get_gmem_access_poly(knl) +def sum_mem_access_to_bytes(m): + """Sum the mapping returned by :func:`get_gmem_access_poly` to a mapping + + **{(** :class:`string` **,** :class:`string` **)** + **:** :class:`islpy.PwQPolynomial` **}** + + i.e., aggregate the transfer numbers for all types into a single byte count. + """ + + result = {} + for (dtype, kind, direction), v in m.items(): + new_key = (kind, direction) + bytes_transferred = int(dtype.itemsize) * v + if new_key in result: + result[new_key] += bytes_transferred + else: + result[new_key] = bytes_transferred + + return result + + def get_barrier_poly(knl): """Count the number of barriers each thread encounters in a loopy kernel. diff --git a/setup.py b/setup.py index 1f1ea68769f71663dc719d274dc38b01b1f602ed..c8c660666ffeed18f92e533b6042cff63988a096 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ setup(name="loo.py", "pymbolic>=2015.2.1", "cgen>=2013.1.2", "islpy>=2014.2", - "six", + "six>=1.8.0", ], scripts=["bin/loopy"], diff --git a/test/test_diff.py b/test/test_diff.py new file mode 100644 index 0000000000000000000000000000000000000000..a13e6d16129ceb997b55bc2990914795ed146bee --- /dev/null +++ b/test/test_diff.py @@ -0,0 +1,107 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2015 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import sys +import numpy as np # noqa +import numpy.linalg as la +import loopy as lp +import pyopencl as cl +import pyopencl.clrandom # noqa + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +__all__ = [ + "pytest_generate_tests", + "cl" # 'cl.create_some_context' + ] + + +def test_diff(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + """{ [i,j]: 0<=i,j<n }""", + """ + <> a = 1/(1+sinh(x[i] + y[j])**2) + z[i] = sum(j, exp(a * x[j])) + """) + + knl = lp.fix_parameters(knl, n=50) + + from loopy.diff import diff_kernel + dknl, diff_map = diff_kernel(knl, "z", "x") + dknl = lp.remove_unused_arguments(dknl) + + print(dknl) + + n = 50 + x = np.random.randn(n) + y = np.random.randn(n) + + dx = np.random.randn(n) + + fac = 1e-1 + h1 = 1e-4 + h2 = h1 * fac + + evt, (z0,) = knl(queue, x=x, y=y) + evt, (z1,) = knl(queue, x=(x + h1*dx), y=y) + evt, (z2,) = knl(queue, x=(x + h2*dx), y=y) + + dknl = lp.set_options(dknl, write_cl=True) + evt, (df,) = dknl(queue, x=x, y=y) + + diff1 = (z1-z0) + diff2 = (z2-z0) + + diff1_predicted = df.dot(h1*dx) + diff2_predicted = df.dot(h2*dx) + + err1 = la.norm(diff1 - diff1_predicted) / la.norm(diff1) + err2 = la.norm(diff2 - diff2_predicted) / la.norm(diff2) + print(err1, err2) + + assert (err2 < err1 * fac * 1.1).all() + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_statistics.py b/test/test_statistics.py index a58ce6d582a8d03d622028156adff35c61009bc0..a504761193fe4acb7dff9a4a9535efb7a74fe2a9 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -49,9 +49,9 @@ def test_op_counter_basic(): n = 512 m = 256 l = 128 - f32 = poly.dict[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l}) - f64 = poly.dict[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l}) - i32 = poly.dict[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l}) + f32 = poly[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l}) + f64 = poly[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l}) + i32 = poly[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f32 == 3*n*m*l assert f64 == n*m assert i32 == n*m*2 @@ -71,7 +71,7 @@ def test_op_counter_reduction(): n = 512 m = 256 l = 128 - f32 = poly.dict[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l}) + f32 = poly[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f32 == 2*n*m*l @@ -91,9 +91,9 @@ def test_op_counter_logic(): n = 512 m = 256 l = 128 - f32 = poly.dict[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l}) - f64 = poly.dict[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l}) - i32 = poly.dict[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l}) + f32 = poly[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l}) + f64 = poly[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l}) + i32 = poly[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f32 == n*m assert f64 == 3*n*m assert i32 == n*m @@ -117,9 +117,9 @@ def test_op_counter_specialops(): n = 512 m = 256 l = 128 - f32 = poly.dict[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l}) - f64 = 
poly.dict[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l}) - i32 = poly.dict[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l}) + f32 = poly[np.dtype(np.float32)].eval_with_dict({'n': n, 'm': m, 'l': l}) + f64 = poly[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l}) + i32 = poly[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f32 == 4*n*m*l assert f64 == 3*n*m assert i32 == n*m @@ -146,12 +146,11 @@ def test_op_counter_bitwise(): n = 512 m = 256 l = 128 - i32 = poly.dict[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l}) - i64 = poly.dict[np.dtype(np.int64)].eval_with_dict({'n': n, 'm': m, 'l': l}) # noqa - f64 = poly[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l}) + i32 = poly[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l}) + i64 = poly[np.dtype(np.int64)].eval_with_dict({'n': n, 'm': m, 'l': l}) # noqa + assert np.dtype(np.float64) not in poly assert i32 == n*m+3*n*m*l assert i64 == 6*n*m - assert f64 == 0 def test_op_counter_triangular_domain(): @@ -203,19 +202,19 @@ def test_gmem_access_counter_basic(): n = 512 m = 256 l = 128 - f32 = poly.dict[ + f32 = poly[ (np.dtype(np.float32), 'uniform', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) - f64 = poly.dict[ + f64 = poly[ (np.dtype(np.float64), 'uniform', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f32 == 3*n*m*l assert f64 == 2*n*m - f32 = poly.dict[ + f32 = poly[ (np.dtype(np.float32), 'uniform', 'store') ].eval_with_dict({'n': n, 'm': m, 'l': l}) - f64 = poly.dict[ + f64 = poly[ (np.dtype(np.float64), 'uniform', 'store') ].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f32 == n*m*l @@ -236,12 +235,12 @@ def test_gmem_access_counter_reduction(): n = 512 m = 256 l = 128 - f32 = poly.dict[ + f32 = poly[ (np.dtype(np.float32), 'uniform', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f32 == 2*n*m*l - f32 = poly.dict[ + f32 = poly[ (np.dtype(np.float32), 'uniform', 'store') ].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f32 == n*l @@ -263,16 +262,16 @@ def test_gmem_access_counter_logic(): n = 512 m = 256 l = 128 - f32 = poly.dict[ + f32 = poly[ (np.dtype(np.float32), 'uniform', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) - f64 = poly.dict[ + f64 = poly[ (np.dtype(np.float64), 'uniform', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f32 == 2*n*m assert f64 == n*m - f64 = poly.dict[ + f64 = poly[ (np.dtype(np.float64), 'uniform', 'store') ].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f64 == n*m @@ -296,19 +295,19 @@ def test_gmem_access_counter_specialops(): n = 512 m = 256 l = 128 - f32 = poly.dict[ + f32 = poly[ (np.dtype(np.float32), 'uniform', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) - f64 = poly.dict[ + f64 = poly[ (np.dtype(np.float64), 'uniform', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f32 == 2*n*m*l assert f64 == 2*n*m - f32 = poly.dict[ + f32 = poly[ (np.dtype(np.float32), 'uniform', 'store') ].eval_with_dict({'n': n, 'm': m, 'l': l}) - f64 = poly.dict[ + f64 = poly[ (np.dtype(np.float64), 'uniform', 'store') ].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f32 == n*m*l @@ -336,12 +335,12 @@ def test_gmem_access_counter_bitwise(): n = 512 m = 256 l = 128 - i32 = poly.dict[ + i32 = poly[ (np.dtype(np.int32), 'uniform', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) assert i32 == 4*n*m+2*n*m*l - i32 = poly.dict[ + i32 = poly[ (np.dtype(np.int32), 'uniform', 'store') ].eval_with_dict({'n': n, 'm': m, 'l': l}) assert i32 == n*m+n*m*l @@ -367,19 +366,19 
@@ def test_gmem_access_counter_mixed(): n = 512 m = 256 l = 128 - f64uniform = poly.dict[ + f64uniform = poly[ (np.dtype(np.float64), 'uniform', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) - f32nonconsec = poly.dict[ + f32nonconsec = poly[ (np.dtype(np.float32), 'nonconsecutive', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f64uniform == 2*n*m assert f32nonconsec == 3*n*m*l - f64uniform = poly.dict[ + f64uniform = poly[ (np.dtype(np.float64), 'uniform', 'store') ].eval_with_dict({'n': n, 'm': m, 'l': l}) - f32nonconsec = poly.dict[ + f32nonconsec = poly[ (np.dtype(np.float32), 'nonconsecutive', 'store') ].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f64uniform == n*m @@ -406,19 +405,19 @@ def test_gmem_access_counter_nonconsec(): n = 512 m = 256 l = 128 - f64nonconsec = poly.dict[ + f64nonconsec = poly[ (np.dtype(np.float64), 'nonconsecutive', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) - f32nonconsec = poly.dict[ + f32nonconsec = poly[ (np.dtype(np.float32), 'nonconsecutive', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*l - f64nonconsec = poly.dict[ + f64nonconsec = poly[ (np.dtype(np.float64), 'nonconsecutive', 'store') ].eval_with_dict({'n': n, 'm': m, 'l': l}) - f32nonconsec = poly.dict[ + f32nonconsec = poly[ (np.dtype(np.float32), 'nonconsecutive', 'store') ].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f64nonconsec == n*m @@ -445,19 +444,19 @@ def test_gmem_access_counter_consec(): m = 256 l = 128 - f64consec = poly.dict[ + f64consec = poly[ (np.dtype(np.float64), 'consecutive', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) - f32consec = poly.dict[ + f32consec = poly[ (np.dtype(np.float32), 'consecutive', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f64consec == 2*n*m assert f32consec == 3*n*m*l - f64consec = poly.dict[ + f64consec = poly[ (np.dtype(np.float64), 'consecutive', 'store') ].eval_with_dict({'n': n, 'm': m, 'l': l}) - f32consec = poly.dict[ + f32consec = poly[ (np.dtype(np.float32), 'consecutive', 'store') ].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f64consec == n*m @@ -531,10 +530,10 @@ def test_all_counters_parallel_matmul(): assert barrier_count == 0 op_map = get_op_poly(knl) - f32ops = op_map.dict[ + f32ops = op_map[ np.dtype(np.float32) ].eval_with_dict({'n': n, 'm': m, 'l': l}) - i32ops = op_map.dict[ + i32ops = op_map[ np.dtype(np.int32) ].eval_with_dict({'n': n, 'm': m, 'l': l}) @@ -542,17 +541,17 @@ def test_all_counters_parallel_matmul(): assert i32ops == n*m*l*4 + l*n*4 subscript_map = get_gmem_access_poly(knl) - f32uncoal = subscript_map.dict[ + f32uncoal = subscript_map[ (np.dtype(np.float32), 'nonconsecutive', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) - f32coal = subscript_map.dict[ + f32coal = subscript_map[ (np.dtype(np.float32), 'consecutive', 'load') ].eval_with_dict({'n': n, 'm': m, 'l': l}) assert f32uncoal == n*m*l assert f32coal == n*m*l - f32coal = subscript_map.dict[ + f32coal = subscript_map[ (np.dtype(np.float32), 'consecutive', 'store') ].eval_with_dict({'n': n, 'm': m, 'l': l})