diff --git a/doc/conf.py b/doc/conf.py index fee2a071c6b2e7c54dd415d135414e5c3e75d06a..7a03b0eeff927065439c60743b897befedb42eb6 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -149,7 +149,7 @@ html_static_path = ['_static'] #html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +html_show_sourcelink = False # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. #html_show_sphinx = True @@ -219,4 +219,8 @@ man_pages = [ # Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'http://docs.python.org/': None} +intersphinx_mapping = { + 'http://docs.python.org/': None, + 'http://documen.tician.de/islpy': None, + 'http://documen.tician.de/pyopencl': None + } diff --git a/doc/guide.rst b/doc/guide.rst new file mode 100644 index 0000000000000000000000000000000000000000..026f96c9777cd5807dd950188be189e094e7ad5a --- /dev/null +++ b/doc/guide.rst @@ -0,0 +1,10 @@ +.. _guide: + +What can loopy do? +================== + +This will become an example-based guide to what loopy can do. + +Loopy's Representation of a Kernel +---------------------------------- + diff --git a/doc/index.rst b/doc/index.rst index 0809b1555d942020ef21d98acd83081748483dac..032b11e5e0ba6d45b568108a0cc47fe4311f72bd 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -1,15 +1,13 @@ -.. loopy documentation master file, created by - sphinx-quickstart on Tue Aug 9 13:40:49 2011. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - Welcome to loopy's documentation! ================================= Contents: .. 
toctree:: - :maxdepth: 2 + :maxdepth: 2 + + guide + reference Indices and tables ================== diff --git a/doc/reference.rst b/doc/reference.rst new file mode 100644 index 0000000000000000000000000000000000000000..e657e47ed8586cb48dd21a173496d14b4d3b3fe7 --- /dev/null +++ b/doc/reference.rst @@ -0,0 +1,150 @@ +Reference Guide +=============== + +.. module:: loopy +.. moduleauthor:: Andreas Kloeckner + +This guide defines all functionality exposed by loopy. If you would like +a more gentle introduction, you may consider reading the example-based +guide :ref:`guide` instead. + +.. _tags: + +Tags +---- + +===================== ==================================================== +Tag Meaning +===================== ==================================================== +`None` | `"for"` Sequential loop +`"l.N"` Local (intra-group) axis N +`"l.auto"` Automatically chosen local (intra-group) axis +`"unr"` Plain unrolling +`"ilp"` Unroll using instruction-level parallelism +===================== ==================================================== + +(Throughout this table, `N` must be replaced by an actual number.) + +.. _automatic-axes: + +Automatic Axis Assignment +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Automatic local axes are chosen as follows: + +#. For each instruction containing `"l.auto"` inames: + #. Find the lowest-numbered unused axis. If none exists, + use sequential unrolling instead. + #. Find the iname that has the smallest stride in any global + array access occurring in the instruction. + #. Assign the low-stride iname to the available axis, splitting + the iname if it is too long for the available axis size. + +If you need different behavior, use :func:`tag_dimensions` and +:func:`split_dimension` to change the assignment of `"l.auto"` axes +manually. + +.. _creating-kernels: + +Creating Kernels +---------------- + +.. _arguments: + +Arguments +^^^^^^^^^ + +.. autoclass:: ScalarArg + :members: + :undoc-members: + +.. 
autoclass:: ArrayArg + :members: + :undoc-members: + +.. autoclass:: ConstantArrayArg + :members: + :undoc-members: + +.. autoclass:: ImageArg + :members: + :undoc-members: + +.. _syntax: + +String Syntax +^^^^^^^^^^^^^ + +* Substitution rules + +* Instructions + +Kernels +^^^^^^^ + +.. autoclass:: LoopKernel + +Do not create :class:`LoopKernel` objects directly. Instead, use the following +function, which takes the same arguments, but does some extra post-processing. + +.. autofunction:: make_kernel + +Wrangling dimensions +-------------------- + +.. autofunction:: split_dimension + +.. autofunction:: join_dimensions + +.. autofunction:: tag_dimensions + +Dealing with Substitution Rules +------------------------------- + +.. autofunction:: extract_subst + +.. autofunction:: apply_subst + +Precomputation and Prefetching +------------------------------ + +.. autofunction:: precompute + +.. autofunction:: add_prefetch + + Uses :func:`extract_subst` and :func:`precompute`. + +Finishing up +------------ + +.. autofunction:: generate_loop_schedules + +.. autofunction:: check_kernels + +.. autofunction:: generate_code + +Automatic Testing +----------------- + +.. autofunction:: auto_test_vs_seq + +Troubleshooting +--------------- + +Printing :class:`LoopKernel` objects +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you're confused about things loopy is referring to in an error message or +about the current state of the :class:`LoopKernel` you are transforming, the +following always works:: + + print kernel + +(And it yields a human-readable--albeit terse--representation of *kernel*.) + +.. autofunction:: preprocess_kernel + +.. autofunction:: get_dot_dependency_graph + +Investigating Scheduler Problems +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/upload-docs.sh b/doc/upload-docs.sh index 22e22844cbc26c38fc6b783a0ddf83e3596f3b63..bcc0a79bed23995780efbf7fced1dfbe8cf306bd 100755 --- a/doc/upload-docs.sh +++ b/doc/upload-docs.sh @@ -1,3 +1,3 @@ #! 
/bin/sh -rsync --progress --verbose --archive --delete build/html/* buster:doc/loopy +rsync --progress --verbose --archive --delete _build/html/* buster:doc/loopy diff --git a/loopy/__init__.py b/loopy/__init__.py index 67b1b2d2157b5fdc8b15bd56907d5cbb6fe2740f..5d25efdedd0ae710b00ab26d6e771e40cc237473 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -22,7 +22,7 @@ class LoopyAdvisory(UserWarning): from loopy.kernel import ScalarArg, ArrayArg, ConstantArrayArg, ImageArg -from loopy.kernel import AutoFitLocalIndexTag, get_dot_dependency_graph +from loopy.kernel import AutoFitLocalIndexTag, get_dot_dependency_graph, LoopKernel from loopy.subst import extract_subst, apply_subst from loopy.cse import precompute from loopy.preprocess import preprocess_kernel @@ -31,7 +31,7 @@ from loopy.codegen import generate_code from loopy.compiled import CompiledKernel, drive_timing_run, auto_test_vs_seq from loopy.check import check_kernels -__all__ = ["ScalarArg", "ArrayArg", "ConstantArrayArg", "ImageArg", +__all__ = ["ScalarArg", "ArrayArg", "ConstantArrayArg", "ImageArg", "LoopKernel", "get_dot_dependency_graph", "preprocess_kernel", "generate_loop_schedules", "generate_code", diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 62be45ce016d45529020909781f5793a62f248f7..c397e951f8f35c06f3d4c1f1598c0d4cbe0583de 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -176,7 +176,7 @@ def make_initial_assignments(kernel): # {{{ main code generation entrypoint -def generate_code(kernel): +def generate_code(kernel, with_annotation=False): from cgen import (FunctionBody, FunctionDeclaration, POD, Value, ArrayOf, Module, Block, Line, Const, LiteralLines, Initializer) @@ -187,7 +187,8 @@ def generate_code(kernel): from loopy.symbolic import LoopyCCodeMapper, pw_aff_to_expr ccm = LoopyCCodeMapper(kernel).copy_and_assign_many( - make_initial_assignments(kernel)) + make_initial_assignments(kernel), + with_annotation=with_annotation) mod = 
Module() diff --git a/loopy/compiled.py b/loopy/compiled.py index 335395db0ecf5345ff0a411c44b4da2829b56146..0b6c9604b7ae7b4d693155745831f7008aea01fb 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -11,10 +11,10 @@ import numpy as np class CompiledKernel: def __init__(self, context, kernel, size_args=None, options=[], - edit_code=False): + edit_code=False, with_annotation=False): self.kernel = kernel from loopy.codegen import generate_code - self.code = generate_code(kernel) + self.code = generate_code(kernel, with_annotation=with_annotation) if edit_code: from pytools import invoke_editor @@ -227,7 +227,7 @@ def make_args(queue, kernel, seq_input_arrays, parameters): def auto_test_vs_seq(seq_knl, ctx, kernel_gen, op_count, op_label, parameters, print_seq_code=False, print_code=True, warmup_rounds=2, timing_rounds=100, - edit_code=False, dump_binary=False): + edit_code=False, dump_binary=False, with_annotation=False): from time import time # {{{ set up CL context for sequential run @@ -263,7 +263,8 @@ def auto_test_vs_seq(seq_knl, ctx, kernel_gen, op_count, op_label, parameters, seq_sched_kernel = knl break - seq_compiled = CompiledKernel(seq_ctx, seq_sched_kernel) + seq_compiled = CompiledKernel(seq_ctx, seq_sched_kernel, + with_annotation=with_annotation) if print_seq_code: print "----------------------------------------------------------" print "Sequential Code:" @@ -302,7 +303,9 @@ def auto_test_vs_seq(seq_knl, ctx, kernel_gen, op_count, op_label, parameters, if args is None: args, output_arrays = make_args(queue, kernel, seq_input_arrays, parameters) - compiled = CompiledKernel(ctx, kernel, edit_code=edit_code) + compiled = CompiledKernel(ctx, kernel, edit_code=edit_code, + with_annotation=with_annotation) + print "----------------------------------------------------------" print "Kernel #%d:" % i print "----------------------------------------------------------" diff --git a/loopy/cse.py b/loopy/cse.py index 
124f462b9c891c5dfcdf68e50d9ee2c3812b6a07..c6150a621b32477984bb590221cbcf4993c4eeb7 100644 --- a/loopy/cse.py +++ b/loopy/cse.py @@ -37,24 +37,45 @@ def to_parameters_or_project_out(param_inames, set_inames, set): def get_footprint(kernel, subst_name, old_arg_names, arg_names, - sweep_inames, invocation_descriptors): + sweep_axes, invocation_descriptors): global_footprint_map = None - # {{{ find sweep inames referenced by arguments + # {{{ deal with argument names as sweep inames - processed_sweep_inames = set() + # An argument name as a sweep iname means that *all* + # inames contained in *all* uses of the rule will be + # made sweep inames. + + sweep_inames = set() for invdesc in invocation_descriptors: - for iname in sweep_inames: + for iname in sweep_axes: if iname in old_arg_names: arg_idx = old_arg_names.index(iname) - processed_sweep_inames.update( + sweep_inames.update( get_dependencies(invdesc.args[arg_idx])) else: - processed_sweep_inames.add(iname) + sweep_inames.add(iname) + + sweep_inames = list(sweep_inames) + + # }}} + + # {{{ see if we need extra storage dimensions + + # Realize that by default our storage dimensions are our arguments. If + # we're given a sweep iname that no (usage-site) argument depends on, then + # this sweep isn't covered in our storage. This necessitates adding an + # extra storage dimension. - sweep_inames = list(processed_sweep_inames) - del processed_sweep_inames + # find inames used in argument dependencies + + usage_arg_deps = set() + for invdesc in invocation_descriptors: + for arg in invdesc.args: + usage_arg_deps.update(get_dependencies(arg)) + + extra_storage_dims = list(set(sweep_inames) - usage_arg_deps) # }}} @@ -72,36 +93,37 @@ def get_footprint(kernel, subst_name, old_arg_names, arg_names, # }}} - # {{{ construct arg mapping + # {{{ construct storage map - # map goes from substitution arguments to domain_dup_sweep + # The storage map goes from storage dimension to domain_dup_sweep. 
+ # The first len(arg_names) storage dimensions are the rule's arguments. for invdesc in invocation_descriptors: map_space = domain_dup_sweep.get_space() - ln = len(arg_names) + stor_dim = len(arg_names) + len(extra_storage_dims) rn = map_space.dim(dim_type.out) - map_space = map_space.add_dims(dim_type.in_, ln) + map_space = map_space.add_dims(dim_type.in_, stor_dim) for i, iname in enumerate(arg_names): # arg names are initially primed, to be replaced with unprimed # base-0 versions below map_space = map_space.set_dim_name(dim_type.in_, i, iname+"'") - # map_space: [arg_names] -> [domain](dup_sweep_index)[dup_sweep] + # map_space: [stor_dims] -> [domain](dup_sweep_index)[dup_sweep] set_space = map_space.move_dims( dim_type.out, rn, - dim_type.in_, 0, ln).range() + dim_type.in_, 0, stor_dim).range() - # set_space: (dup_sweep_index) + # set_space: (dup_sweep_index) footprint_map = None from loopy.symbolic import aff_from_expr for uarg_name, arg_val in zip(arg_names, invdesc.args): cns = isl.Constraint.equality_from_aff( - aff_from_expr(set_space, + aff_from_expr(set_space, var(uarg_name+"'") - prime_sweep_inames(arg_val))) cns_map = isl.BasicMap.from_constraint(cns) @@ -112,7 +134,7 @@ def get_footprint(kernel, subst_name, old_arg_names, arg_names, footprint_map = footprint_map.move_dims( dim_type.in_, 0, - dim_type.out, rn, ln) + dim_type.out, rn, stor_dim) # footprint_map is back in map_space @@ -242,8 +264,37 @@ def simplify_via_aff(expr): -def precompute(kernel, subst_name, dtype, sweep_inames=[], - new_arg_names=None, arg_name_to_tag={}, default_tag="l.auto"): +def precompute(kernel, subst_name, dtype, sweep_axes=[], + storage_axes=None, new_arg_names=None, arg_name_to_tag={}, + default_tag="l.auto"): + """Precompute the expression described in the substitution rule *subst_name* + and store it in a temporary array. 
A precomputation needs two things to operate, + a list of *sweep_axes* (order irrelevant) and an ordered list of *storage_axes* + (whose order will describe the axis ordering of the temporary array). + + This function will then examine all usage sites of the substitution rule and + determine what the storage footprint of that sweep is. + + The following cases can arise for each sweep axis: + + * The axis is an iname that occurs within arguments specified at + usage sites of the substitution rule. This case is assumed covered + by the storage axes provided for the argument. + + * The axis is an iname that occurs within the *value* of the rule, but not + within its arguments. A new, dedicated storage axis is allocated for + such an axis. + + * The axis is a formal argument name of the substitution rule. + This is equivalent to specifying *all* inames occurring within + the so-named formal argument at *all* usage sites. + + :arg sweep_axes: A :class:`list` of inames and/or rule argument names to be swept. + :arg storage_axes: A :class:`list` of inames and/or rule argument names/indices to be used as storage axes. + + Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are + eliminated.
+ """ subst = kernel.substitutions[subst_name] arg_names = subst.arguments @@ -252,7 +303,6 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[], # {{{ gather up invocations invocation_descriptors = [] - invocation_arg_deps = set() def gather_substs(expr, name, args, rec): if len(args) != len(subst.arguments): @@ -264,8 +314,6 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[], raise RuntimeError("CSE arguments in '%s' do not consist " "exclusively of inames" % expr) - invocation_arg_deps.update(arg_deps) - invocation_descriptors.append( InvocationDescriptor(expr=expr, args=args)) return expr @@ -285,12 +333,6 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[], scm(subst_expander(insn.expression)) - allowable_sweep_inames = invocation_arg_deps | set(arg_names) - if not set(sweep_inames) <= allowable_sweep_inames: - raise RuntimeError("independent iname(s) '%s' do not occur as arg names " - "of subsitution rule or in arguments of invocation" % (",".join( - set(sweep_inames)-allowable_sweep_inames))) - # }}} # {{{ process ind_iname_to_tag argument @@ -358,7 +400,7 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[], (non1_arg_names, new_domain, arg_base_indices, non1_arg_base_indices, non1_shape) = \ get_footprint(kernel, subst_name, old_arg_names, arg_names, - sweep_inames, invocation_descriptors) + sweep_axes, invocation_descriptors) new_domain = new_domain.coalesce() @@ -367,6 +409,15 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[], if hull_new_domain <= new_domain: new_domain = hull_new_domain + if len(new_domain.get_basic_sets()) > 1: + print("Substitution '%s' yielded a footprint that was not " + "obviously convex. Now computing convex hull. " + "This might take a *long* time." 
% subst_name) + + hull_new_domain = new_domain.convex_hull() + if hull_new_domain <= new_domain: + new_domain = hull_new_domain + if isinstance(new_domain, isl.Set): dom_bsets = new_domain.get_basic_sets() if len(dom_bsets) > 1: diff --git a/loopy/kernel.py b/loopy/kernel.py index 539130fd088e0d8548400b4b6f9a5f2ab86a5d29..0729d6dbd846fb7a02a73f370022224187be1b8b 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -260,7 +260,7 @@ class Instruction(Record): :ivar boostable_into: a set of inames into which the instruction may need to be boosted, as a heuristic help for the scheduler. - The following two instance variables are only used until :func:`loopy.kernel.make_kernel` is + The following two instance variables are only used until :func:`loopy.make_kernel` is finished: :ivar temp_var_type: if not None, a type that will be assigned to the new temporary variable @@ -480,7 +480,7 @@ class LoopKernel(Record): :ivar cache_manager: - The following instance variables are only used until :func:`loopy.kernel.make_kernel` is + The following instance variables are only used until :func:`loopy.make_kernel` is finished: :ivar iname_to_tag_requests: @@ -814,8 +814,7 @@ class LoopKernel(Record): def find_readers(self): """ - :return: a dict that maps variable names to ids of insns that - read that variable. + :return: a dict that maps variable names to ids of insns that read that variable. """ result = {} @@ -829,8 +828,7 @@ class LoopKernel(Record): def find_writers(self): """ - :return: a dict that maps variable names to ids of insns that - write to that variable. + :return: a dict that maps variable names to ids of insns that write to that variable. 
""" result = {} diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 59ac1fc05d366a776ee8dd62504a14e62039403d..0a374edf74e99ea1328717155516751bcb813c4a 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -519,7 +519,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): # }}} if axis is None: - new_tag = None + new_tag = UnrollTag() else: new_tag = LocalIndexTag(axis) if desired_length > local_size[axis]: diff --git a/loopy/subst.py b/loopy/subst.py index c89bc586991792b0c6208e06dbc9743b88f5ff5d..636252304263587e59dd4718dea63d199acb2274 100644 --- a/loopy/subst.py +++ b/loopy/subst.py @@ -19,16 +19,12 @@ class ExprDescriptor(Record): def extract_subst(kernel, subst_name, template, parameters): """ - :arg template: An expression against which all targeted subexpressions - must unify + :arg subst_name: The name of the substitution rule to be created. + :arg template: Unification template expression. - If None, a unification template will be chosen from among the targeted - CSEs. That CSE is chosen to depend on all the variables in - *parameters*. It is an error if no such expression can be - found. - - May contain '*' wildcards that will have to match exactly across all - unifications. + All targeted subexpressions must match ('unify with') *template* + The template may contain '*' wildcards that will have to match exactly across all + unifications. 
""" newly_created_var_names = set() diff --git a/loopy/symbolic.py b/loopy/symbolic.py index e1be279653f2ac95251c475e0fdc9c059832d377..c65f6e7ac202ecebece95b386e89dcc8561f973b 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -297,7 +297,7 @@ class ArrayAccessFinder(CombineMapper): class LoopyCCodeMapper(CCodeMapper): def __init__(self, kernel, cse_name_list=[], var_subst_map={}, - with_annotation=True): + with_annotation=False): def constant_mapper(c): if isinstance(c, float): # FIXME: type-variable @@ -318,7 +318,8 @@ class LoopyCCodeMapper(CCodeMapper): if cse_name_list is None: cse_name_list = self.cse_name_list return LoopyCCodeMapper(self.kernel, - cse_name_list=cse_name_list, var_subst_map=var_subst_map) + cse_name_list=cse_name_list, var_subst_map=var_subst_map, + with_annotation=self.with_annotation) def copy_and_assign(self, name, value): var_subst_map = self.var_subst_map.copy() diff --git a/test/test_fem_assembly.py b/test/test_fem_assembly.py index 136fccfbc8165235d40669c31c4de1844cff3fc9..0efc205147009d3095d96f30998b113238ef30c8 100644 --- a/test/test_fem_assembly.py +++ b/test/test_fem_assembly.py @@ -70,13 +70,14 @@ def test_laplacian_stiffness(ctx_factory): # no ILP across elements, precompute dPsiTransf knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1)) knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) - knl = lp.precompute(knl, "dPsi", np.float32) + knl = lp.precompute(knl, "dPsi", np.float32, + sweep_inames=["K_inner"]) knl = lp.add_prefetch(knl, "jacInv", ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"]) return knl - for variant in [variant_1, variant_2]: - #for variant in [variant_3]: + #for variant in [variant_1, variant_2, variant_3]: + for variant in [variant_3]: kernel_gen = lp.generate_loop_schedules(variant(knl), loop_priority=["jacInv_dim_0", "jacInv_dim_1"]) kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc))