From f05a6a827a8ba5cfff03248e9f1cc803b85429a0 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Fri, 21 Jun 2013 23:22:51 -0400 Subject: [PATCH] Introduce CInstruction, fix up rest of loopy to deal with it --- doc/reference.rst | 24 ++- loopy/__init__.py | 12 +- loopy/check.py | 169 +++++++++++---------- loopy/codegen/__init__.py | 3 + loopy/codegen/instruction.py | 39 ++++- loopy/diagnostic.py | 4 + loopy/kernel/__init__.py | 23 ++- loopy/kernel/creation.py | 137 ++++++++++------- loopy/kernel/data.py | 282 ++++++++++++++++++++++++++++++----- loopy/kernel/tools.py | 7 + loopy/precompute.py | 10 +- loopy/preprocess.py | 109 +++++++++----- loopy/schedule.py | 8 +- loopy/subst.py | 12 +- loopy/symbolic.py | 14 +- test/test_dg.py | 4 + test/test_loopy.py | 28 +++- 17 files changed, 635 insertions(+), 250 deletions(-) diff --git a/doc/reference.rst b/doc/reference.rst index bf62ec89e..b415e40af 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -135,14 +135,28 @@ Arguments :members: :undoc-members: -.. _syntax: +Temporary Variables +^^^^^^^^^^^^^^^^^^^ -String Syntax -^^^^^^^^^^^^^ +.. autoclass:: TemporaryVariable + :members: + :undoc-members: + +Substitution rules +^^^^^^^^^^^^^^^^^^ + +.. autoclass:: SubstitutionRule + +String syntax: FIXME + +Instructions +^^^^^^^^^^^^ + +.. autoclass:: ExpressionInstruction -* Substitution rules +.. 
autoclass:: CInstruction -* Instructions +String syntax: FIXME Kernels ^^^^^^^ diff --git a/loopy/__init__.py b/loopy/__init__.py index dc4e7bf32..b15f447d0 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -50,8 +50,10 @@ from loopy.library.preamble import default_preamble_generator from loopy.library.symbol import opencl_symbol_mangler from loopy.kernel.data import ( + auto, ValueArg, GlobalArg, ConstantArg, ImageArg, - ExpressionInstruction, CInstruction) + ExpressionInstruction, CInstruction, + TemporaryVariable) from loopy.kernel import LoopKernel from loopy.kernel.tools import ( @@ -76,6 +78,8 @@ __all__ = [ "LoopKernel", "ValueArg", "ScalarArg", "GlobalArg", "ArrayArg", "ConstantArg", "ImageArg", + "TemporaryVariable", + "ExpressionInstruction", "CInstruction", "default_function_mangler", "single_arg_function_mangler", @@ -104,12 +108,6 @@ __all__ = [ ] -class auto: - """A generic placeholder object for something that should be automatically - detected. See, for example, the *shape* or *strides* argument of - :class:`GlobalArg`. - """ - # }}} diff --git a/loopy/check.py b/loopy/check.py index b48b39a8b..a871508b9 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -26,6 +26,7 @@ THE SOFTWARE. 
from islpy import dim_type import islpy as isl from loopy.symbolic import WalkMapper +from loopy.diagnostic import LoopyError, LoopyWarning import logging logger = logging.getLogger(__name__) @@ -35,54 +36,6 @@ from loopy.diagnostic import WriteRaceConditionError # {{{ sanity checks run during scheduling -def check_sizes(kernel): - import loopy as lp - - from loopy.diagnostic import LoopyAdvisory - - parameters = {} - for arg in kernel.args: - if isinstance(arg, lp.ValueArg) and arg.approximately is not None: - parameters[arg.name] = arg.approximately - - glens, llens = kernel.get_grid_sizes_as_exprs() - - if (max(len(glens), len(llens)) - > kernel.device.max_work_item_dimensions): - raise RuntimeError("too many work item dimensions") - - from pymbolic import evaluate - from pymbolic.mapper.evaluator import UnknownVariableError - try: - glens = evaluate(glens, parameters) - llens = evaluate(llens, parameters) - except UnknownVariableError, name: - from warnings import warn - warn("could not check axis bounds because no value " - "for variable '%s' was passed to check_kernels()" - % name, LoopyAdvisory) - else: - for i in range(len(llens)): - if llens[i] > kernel.device.max_work_item_sizes[i]: - raise RuntimeError("group axis %d too big" % i) - - from pytools import product - if product(llens) > kernel.device.max_work_group_size: - raise RuntimeError("work group too big") - - from pyopencl.characterize import usable_local_mem_size - if kernel.local_mem_use() > usable_local_mem_size(kernel.device): - raise RuntimeError(5, "using too much local memory") - - from loopy.kernel.data import ConstantArg - const_arg_count = sum( - 1 for arg in kernel.args - if isinstance(arg, ConstantArg)) - - if const_arg_count > kernel.device.max_constant_args: - raise RuntimeError("too many constant arguments") - - def check_for_unused_hw_axes_in_insns(kernel): group_size, local_size = kernel.get_grid_sizes_as_exprs() @@ -107,16 +60,16 @@ def check_for_unused_hw_axes_in_insns(kernel): 
elif isinstance(tag, GroupIndexTag): group_axes_used.add(tag.axis) elif isinstance(tag, AutoLocalIndexTagBase): - raise RuntimeError("auto local tag encountered") + raise LoopyError("auto local tag encountered") if group_axes != group_axes_used: - raise RuntimeError("instruction '%s' does not use all group hw axes " + raise LoopyError("instruction '%s' does not use all group hw axes " "(available: %s used:%s)" % (insn.id, ",".join(str(i) for i in group_axes), ",".join(str(i) for i in group_axes_used))) if local_axes != local_axes_used: - raise RuntimeError("instruction '%s' does not use all local hw axes" + raise LoopyError("instruction '%s' does not use all local hw axes" "(available: %s used:%s)" % (insn.id, ",".join(str(i) for i in local_axes), @@ -133,22 +86,18 @@ def check_for_double_use_of_hw_axes(kernel): if isinstance(tag, UniqueTag): key = tag.key if key in insn_tag_keys: - raise RuntimeError("instruction '%s' has multiple " + raise LoopyError("instruction '%s' has multiple " "inames tagged '%s'" % (insn.id, tag)) insn_tag_keys.add(key) def check_for_inactive_iname_access(kernel): - from loopy.symbolic import DependencyMapper - depmap = DependencyMapper() - for insn in kernel.instructions: - expression_indices = depmap(insn.expression) - expression_inames = expression_indices & kernel.all_inames() + expression_inames = insn.read_dependency_names() & kernel.all_inames() if not expression_inames <= kernel.insn_inames(insn): - raise RuntimeError( + raise LoopyError( "instructiosn '%s' references " "inames that the instruction does not depend on" % insn.id) @@ -173,7 +122,7 @@ def check_for_write_races(kernel): assignee_inames = assignee_indices & kernel.all_inames() if not assignee_inames <= kernel.insn_inames(insn): - raise RuntimeError( + raise LoopyError( "assignee of instructiosn '%s' references " "iname that the instruction does not depend on" % insn.id) @@ -207,11 +156,11 @@ def check_for_write_races(kernel): LocalIndexTagBase)) else: - raise 
RuntimeError("temp var '%s' hasn't decided on " + raise LoopyError("temp var '%s' hasn't decided on " "whether it is local" % temp_var.name) else: - raise RuntimeError("invalid assignee name in instruction '%s'" + raise LoopyError("invalid assignee name in instruction '%s'" % insn.id) race_inames = \ @@ -235,7 +184,7 @@ def check_for_orphaned_user_hardware_axes(kernel): break if not found: - raise RuntimeError("user-requested local hardware axis %d " + raise LoopyError("user-requested local hardware axis %d " "has no iname mapped to it" % axis) @@ -254,7 +203,7 @@ def check_for_data_dependent_parallel_bounds(kernel): parameters = set(dom.get_var_names(dim_type.param)) for par in parameters: if par in kernel.temporary_variables: - raise RuntimeError("Domain number %d has a data-dependent " + raise LoopyError("Domain number %d has a data-dependent " "parameter '%s' and contains parallel " "inames '%s'. This is not allowed (for now)." % (i, par, ", ".join(par_inames))) @@ -295,7 +244,7 @@ class _AccessCheckMapper(WalkMapper): return if len(subscript) != len(shape): - raise RuntimeError("subscript to '%s' in '%s' has the wrong " + raise LoopyError("subscript to '%s' in '%s' has the wrong " "number of indices (got: %d, expected: %d)" % ( expr.aggregate.name, expr, len(subscript), len(shape))) @@ -316,7 +265,7 @@ class _AccessCheckMapper(WalkMapper): shape_domain = shape_domain.intersect(slab) if not access_range.is_subset(shape_domain): - raise RuntimeError("'%s' in instruction '%s' " + raise LoopyError("'%s' in instruction '%s' " "accesses out-of-bounds array element" % (expr, self.insn_id)) @@ -331,40 +280,38 @@ def check_bounds(kernel): continue acm = _AccessCheckMapper(kernel, domain, insn.id) - acm(insn.expression) - acm(insn.assignee) + insn.with_transformed_expressions(acm) def check_write_destinations(kernel): for insn in kernel.instructions: for wvar, _ in insn.assignees_and_indices(): if wvar in kernel.all_inames(): - raise RuntimeError("iname '%s' may not be 
written" % wvar) + raise LoopyError("iname '%s' may not be written" % wvar) insn_domain = kernel.get_inames_domain(kernel.insn_inames(insn)) insn_params = set(insn_domain.get_var_names(dim_type.param)) if wvar in kernel.all_params(): if wvar not in kernel.temporary_variables: - raise RuntimeError("domain parameter '%s' may not be written" + raise LoopyError("domain parameter '%s' may not be written" "--it is not a temporary variable" % wvar) if wvar in insn_params: - raise RuntimeError("domain parameter '%s' may not be written " + raise LoopyError("domain parameter '%s' may not be written " "inside a domain dependent on it" % wvar) if not (wvar in kernel.temporary_variables or wvar in kernel.arg_dict) and wvar not in kernel.all_params(): - raise RuntimeError + raise LoopyError # }}} -def run_automatic_checks(kernel): +def pre_schedule_checks(kernel): try: - logger.info("sanity-check %s: start" % kernel.name) + logger.info("pre-schedule check %s: start" % kernel.name) - check_sizes(kernel) check_for_orphaned_user_hardware_axes(kernel) check_for_double_use_of_hw_axes(kernel) check_for_unused_hw_axes_in_insns(kernel) @@ -374,16 +321,84 @@ def run_automatic_checks(kernel): check_bounds(kernel) check_write_destinations(kernel) - logger.info("sanity-check %s: done" % kernel.name) + logger.info("pre-schedule check %s: done" % kernel.name) except: print 75*"=" - print "failing kernel after processing:" + print "failing kernel during pre-schedule check:" print 75*"=" print kernel print 75*"=" raise +# {{{ pre-code-generation checks + +def check_sizes(kernel): + import loopy as lp + + from loopy.diagnostic import LoopyAdvisory + + parameters = {} + for arg in kernel.args: + if isinstance(arg, lp.ValueArg) and arg.approximately is not None: + parameters[arg.name] = arg.approximately + + glens, llens = kernel.get_grid_sizes_as_exprs() + + if (max(len(glens), len(llens)) + > kernel.device.max_work_item_dimensions): + raise LoopyError("too many work item dimensions") + + from 
pymbolic import evaluate + from pymbolic.mapper.evaluator import UnknownVariableError + try: + glens = evaluate(glens, parameters) + llens = evaluate(llens, parameters) + except UnknownVariableError, name: + from warnings import warn + warn("could not check axis bounds because no value " + "for variable '%s' was passed to check_kernels()" + % name, LoopyAdvisory) + else: + for i in range(len(llens)): + if llens[i] > kernel.device.max_work_item_sizes[i]: + raise LoopyError("group axis %d too big" % i) + + from pytools import product + if product(llens) > kernel.device.max_work_group_size: + raise LoopyError("work group too big") + + from pyopencl.characterize import usable_local_mem_size + if kernel.local_mem_use() > usable_local_mem_size(kernel.device): + raise LoopyError(5, "using too much local memory") + + from loopy.kernel.data import ConstantArg + const_arg_count = sum( + 1 for arg in kernel.args + if isinstance(arg, ConstantArg)) + + if const_arg_count > kernel.device.max_constant_args: + raise LoopyError("too many constant arguments") + + +def pre_codegen_checks(kernel): + try: + logger.info("pre-codegen check %s: start" % kernel.name) + + check_sizes(kernel) + + logger.info("pre-codegen check %s: done" % kernel.name) + except: + print 75*"=" + print "failing kernel during pre-schedule check:" + print 75*"=" + print kernel + print 75*"=" + raise + +# }}} + + # {{{ sanity-check for implemented domains of each instruction def check_implemented_domains(kernel, implemented_domains, code=None): @@ -453,7 +468,7 @@ def check_implemented_domains(kernel, implemented_domains, code=None): print get_highlighted_cl_code(code) print 79*"-" - raise RuntimeError("sanity check failed--implemented and desired " + raise LoopyError("sanity check failed--implemented and desired " "domain for instruction '%s' do not match\n\n" "implemented: %s\n\n" "desired:%s\n\n%s" diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index b61174c60..8b4e0d22f 100644 --- 
a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -258,6 +258,9 @@ def generate_code(kernel, with_annotation=False, from loopy.preprocess import infer_unknown_types kernel = infer_unknown_types(kernel, expect_completion=True) + from loopy.check import pre_codegen_checks + pre_codegen_checks(kernel) + from cgen import (FunctionBody, FunctionDeclaration, Value, Module, Block, Line, Const, LiteralLines, Initializer) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 5015d6234..f333baadb 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -25,6 +25,7 @@ THE SOFTWARE. import islpy as isl +from loopy.codegen import GeneratedInstruction def wrap_in_bounds_checks(ccm, domain, check_inames, implemented_domain, stmt): @@ -60,7 +61,6 @@ def generate_instruction_code(kernel, insn, codegen_state): def generate_expr_instruction_code(kernel, insn, codegen_state): - from loopy.codegen import GeneratedInstruction ccm = codegen_state.c_code_mapper @@ -108,6 +108,41 @@ def generate_expr_instruction_code(kernel, insn, codegen_state): def generate_c_instruction_code(kernel, insn, codegen_state): - raise NotImplementedError + ccm = codegen_state.c_code_mapper + + body = [] + + from loopy.codegen import POD + from cgen import Initializer, Block, Line + + from pymbolic.primitives import Variable + for name, iname_expr in insn.iname_exprs: + if (isinstance(iname_expr, Variable) + and name not in ccm.var_subst_map): + # No need, the bare symbol will work + continue + + body.append( + Initializer( + POD(kernel.index_dtype, name), + codegen_state.c_code_mapper( + iname_expr, prec=None, type_context="i"))) + + if body: + body.append(Line()) + + body.extend(Line(l) for l in insn.code.split("\n")) + + insn_inames = kernel.insn_inames(insn) + insn_code, impl_domain = wrap_in_bounds_checks( + ccm, kernel.get_inames_domain(insn_inames), insn_inames, + codegen_state.implemented_domain, + Block(body)) + + return 
GeneratedInstruction( + insn_id=insn.id, + implemented_domain=impl_domain, + ast=insn_code) + # vim: foldmethod=marker diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index d3ed8770f..dd2860a0e 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -29,6 +29,10 @@ class LoopyWarningBase(UserWarning): pass +class LoopyWarning(LoopyWarningBase): + pass + + class LoopyAdvisory(LoopyWarningBase): pass diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 68dac0dae..8a7c53f84 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -860,7 +860,22 @@ class LoopKernel(Record): lines.append(sep) lines.append("INSTRUCTIONS:") loop_list_width = 35 + + import loopy as lp for insn in self.instructions: + if isinstance(insn, lp.ExpressionInstruction): + lhs = str(insn.assignee) + rhs = str(insn.expression) + trailing = [] + elif isinstance(insn, lp.CInstruction): + lhs = ", ".join(str(a) for a in insn.assignees) + rhs = "CODE(%s|%s)" % ( + ", ".join(str(x) for x in insn.read_variables), + ", ".join("%s=%s" % (name, expr) + for name, expr in insn.iname_exprs)) + + trailing = [" "+l for l in insn.code.split("\n")] + loop_list = ",".join(sorted(self.insn_inames(insn))) options = [insn.id] @@ -870,12 +885,14 @@ class LoopKernel(Record): if len(loop_list) > loop_list_width: lines.append("[%s]" % loop_list) lines.append("%s%s <- %s # %s" % ( - (loop_list_width+2)*" ", insn.assignee, - insn.expression, ", ".join(options))) + (loop_list_width+2)*" ", lhs, + rhs, ", ".join(options))) else: lines.append("[%s]%s%s <- %s # %s" % ( loop_list, " "*(loop_list_width-len(loop_list)), - insn.assignee, insn.expression, ", ".join(options))) + lhs, rhs, ", ".join(options))) + + lines.extend(trailing) lines.append(sep) lines.append("DEPENDENCIES:") diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index f4cd45c27..157b7cbf3 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -34,6 +34,9 @@ from islpy import dim_type 
import re +import logging +logger = logging.getLogger(__name__) + # {{{ identifier wrangling @@ -477,13 +480,8 @@ def guess_kernel_args_if_requested(domains, instructions, temporary_variables, def tag_reduction_inames_as_sequential(knl): result = set() - def map_reduction(red_expr, rec): - rec(red_expr.expr) - result.update(red_expr.inames) - - from loopy.symbolic import ReductionCallbackMapper for insn in knl.instructions: - ReductionCallbackMapper(map_reduction)(insn.expression) + result.update(insn.reduction_inames()) from loopy.kernel.data import ParallelTag, ForceSequentialTag @@ -615,7 +613,7 @@ def expand_cses(knl): new_temp_vars[new_var_name] = TemporaryVariable( name=new_var_name, dtype=dtype, - is_local=None, + is_local=lp.auto, shape=()) from pymbolic.primitives import Variable @@ -656,29 +654,13 @@ def create_temporaries(knl): new_insns = [] new_temp_vars = knl.temporary_variables.copy() - from loopy.symbolic import AccessRangeMapper + import loopy as lp for insn in knl.instructions: - if not isinstance(insn, ExpressionInstruction): - continue - - from loopy.kernel.data import TemporaryVariable - - if insn.temp_var_type is not None: + if isinstance(insn, ExpressionInstruction) \ + and insn.temp_var_type is not None: (assignee_name, _), = insn.assignees_and_indices() - armap = AccessRangeMapper(knl, assignee_name) - armap(insn.assignee, knl.insn_inames(insn)) - - if armap.access_range is not None: - base_indices, shape = zip(*[ - knl.cache_manager.base_index_and_length( - armap.access_range, i) - for i in xrange(armap.access_range.dim(dim_type.set))]) - else: - base_indices = () - shape = () - if assignee_name in new_temp_vars: raise RuntimeError("cannot create temporary variable '%s'--" "already exists" % assignee_name) @@ -686,12 +668,15 @@ def create_temporaries(knl): raise RuntimeError("cannot create temporary variable '%s'--" "already exists as argument" % assignee_name) - new_temp_vars[assignee_name] = TemporaryVariable( + logger.debug("%s: 
creating temporary %s" + % (knl.name, assignee_name)) + + new_temp_vars[assignee_name] = lp.TemporaryVariable( name=assignee_name, dtype=insn.temp_var_type, - is_local=None, - base_indices=base_indices, - shape=shape) + is_local=lp.auto, + base_indices=lp.auto, + shape=lp.auto) insn = insn.copy(temp_var_type=None) @@ -704,29 +689,43 @@ def create_temporaries(knl): # }}} -# {{{ check for reduction iname duplication +# {{{ determine shapes of temporaries + +def determine_shapes_of_temporaries(knl): + new_temp_vars = knl.temporary_variables.copy() -def check_for_reduction_inames_duplication_requests(kernel): + from loopy.symbolic import AccessRangeMapper + from pymbolic import var + import loopy as lp - # {{{ helper function + new_temp_vars = {} + for tv in knl.temporary_variables.itervalues(): + if tv.shape is lp.auto or tv.base_indices is lp.auto: + armap = AccessRangeMapper(knl, tv.name) + for insn in knl.instructions: + for assignee_name, assignee_index in insn.assignees_and_indices(): + if assignee_index: + armap(var(assignee_name)[assignee_index], + knl.insn_inames(insn)) - def check_reduction_inames(reduction_expr, rec): - for iname in reduction_expr.inames: - if iname.startswith("@"): - raise RuntimeError( - "Reduction iname duplication with '@' is no " - "longer supported. 
Use loopy.duplicate_inames " - "instead.") + if armap.access_range is not None: + base_indices, shape = zip(*[ + knl.cache_manager.base_index_and_length( + armap.access_range, i) + for i in xrange(armap.access_range.dim(dim_type.set))]) + else: + base_indices = () + shape = () - # }}} + if tv.base_indices is lp.auto: + tv = tv.copy(base_indices=base_indices) + if tv.shape is lp.auto: + tv = tv.copy(shape=shape) - from loopy.symbolic import ReductionCallbackMapper - rcm = ReductionCallbackMapper(check_reduction_inames) - for insn in kernel.instructions: - rcm(insn.expression) + new_temp_vars[tv.name] = tv - for sub_name, sub_rule in kernel.substitutions.iteritems(): - rcm(sub_rule.expression) + return knl.copy( + temporary_variables=new_temp_vars) # }}} @@ -767,10 +766,11 @@ def guess_arg_shape_if_requested(kernel, default_order): armap = AccessRangeMapper(kernel, arg.name) for insn in kernel.instructions: - armap(submap(insn.assignee, insn.id), - kernel.insn_inames(insn)) - armap(submap(insn.expression, insn.id), - kernel.insn_inames(insn)) + if isinstance(insn, lp.ExpressionInstruction): + armap(submap(insn.assignee, insn.id), + kernel.insn_inames(insn)) + armap(submap(insn.expression, insn.id), + kernel.insn_inames(insn)) if armap.access_range is None: # no subscripts found, let's call it a scalar @@ -832,13 +832,20 @@ def apply_default_order_to_args(kernel, default_order): # {{{ kernel creation top-level -def make_kernel(device, domains, instructions, kernel_args=["..."], **kwargs): +def make_kernel(device, domains, instructions, kernel_data=["..."], **kwargs): """User-facing kernel creation entrypoint. :arg device: :class:`pyopencl.Device` :arg domains: :class:`islpy.BasicSet` :arg instructions: - :arg kernel_args: + :arg kernel_data: + + A list of :class:`ValueArg`, :class:`GlobalArg`, ... (etc.) instances. + The order of these arguments determines the order of the arguments + to the generated kernel. 
+ + May also contain :class:`TemporaryVariable` instances(which do not + give rise to kernel-level arguments). The following keyword arguments are recognized: @@ -873,13 +880,28 @@ def make_kernel(device, domains, instructions, kernel_args=["..."], **kwargs): :arg local_sizes: A dictionary from integers to integers, mapping workgroup axes to their sizes, e.g. *{0: 16}* forces axis 0 to be length 16. - :arg temporary_variables: """ defines = kwargs.pop("defines", {}) default_order = kwargs.pop("default_order", "C") default_offset = kwargs.pop("default_offset", 0) + # {{{ separate temporary variables and arguments + + from loopy.kernel.data import TemporaryVariable + + kernel_args = [] + temporary_variables = {} + for dat in kernel_data: + if isinstance(dat, TemporaryVariable): + temporary_variables[dat.name] = dat + else: + kernel_args.append(dat) + + del kernel_data + + # }}} + # {{{ instruction/subst parsing parsed_instructions = [] @@ -916,18 +938,19 @@ def make_kernel(device, domains, instructions, kernel_args=["..."], **kwargs): domains = parse_domains(isl_context, domains, defines) kernel_args = guess_kernel_args_if_requested(domains, instructions, - kwargs.get("temporary_variables", {}), substitutions, + temporary_variables, substitutions, duplicate_args_with_commas(kernel_args), default_offset) from loopy.kernel import LoopKernel - knl = LoopKernel(device, domains, instructions, kernel_args, **kwargs) + knl = LoopKernel(device, domains, instructions, kernel_args, + temporary_variables=temporary_variables, **kwargs) check_for_nonexistent_iname_deps(knl) - check_for_reduction_inames_duplication_requests(knl) knl = tag_reduction_inames_as_sequential(knl) knl = create_temporaries(knl) + knl = determine_shapes_of_temporaries(knl) knl = expand_cses(knl) knl = expand_defines_in_shapes(knl, defines) knl = guess_arg_shape_if_requested(knl, default_order) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index c0dcd036f..c844995d6 100644 --- 
a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -30,6 +30,13 @@ from pytools import Record, memoize_method from loopy.kernel.array import ArrayBase +class auto: + """A generic placeholder object for something that should be automatically + detected. See, for example, the *shape* or *strides* argument of + :class:`GlobalArg`. + """ + + # {{{ iname tags class IndexTag(Record): @@ -231,6 +238,10 @@ class TemporaryVariable(ArrayBase): .. attribute:: storage_shape .. attribute:: base_indices .. attribute:: is_local + + Whether this temporary lives in ``local`` memory. + May be *True*, *False*, or :class:`loopy.auto` if this is + to be automatically determined. """ min_target_axes = 0 @@ -242,9 +253,19 @@ class TemporaryVariable(ArrayBase): "is_local" ] - def __init__(self, name, dtype, shape, is_local, + def __init__(self, name, dtype, shape=(), is_local=auto, dim_tags=None, offset=0, strides=None, order=None, base_indices=None, storage_shape=None): + """ + :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype` + :arg shape: :class:`loopy.auto` or a shape tuple + :arg base_indices: :class:`loopy.auto` or a tuple of base indices + """ + + if is_local is None: + raise ValueError("is_local is None is no longer supported. " + "Use loopy.auto.") + if base_indices is None: base_indices = (0,) * len(shape) @@ -372,6 +393,13 @@ class InstructionBase(Record): """ raise NotImplementedError + def with_transformed_expressions(self, f, *args): + """Return a new copy of *self* where *f* has been applied to every + expression occurring in *self*. *args* will be passed as extra + arguments (in addition to the expression) to *f*. 
+ """ + raise NotImplementedError + # }}} @memoize_method @@ -387,7 +415,7 @@ class InstructionBase(Record): from loopy.symbolic import get_dependencies result.update(get_dependencies(indices)) - return result + return frozenset(result) def dependency_names(self): return self.read_dependency_names() | self.write_dependency_names() @@ -395,6 +423,45 @@ class InstructionBase(Record): def assignee_var_names(self): return (var_name for var_name, _ in self.assignees_and_indices()) + def get_str_options(self): + result = [] + + if self.boostable is True: + if self.boostable_into: + result.append("boostable into '%s'" % ",".join(self.boostable_into)) + else: + result.append("boostable") + elif self.boostable is False: + result.append("not boostable") + elif self.boostable is None: + pass + else: + raise RuntimeError("unexpected value for Instruction.boostable") + + if self.insn_deps: + result.append("deps="+":".join(self.insn_deps)) + if self.priority: + result.append("priority=%d" % self.priority) + + return result + + +def _get_assignee_and_index(expr): + from pymbolic.primitives import Variable, Subscript + if isinstance(expr, Variable): + return (expr.name, ()) + elif isinstance(expr, Subscript): + agg = expr.aggregate + assert isinstance(agg, Variable) + + idx = expr.index + if not isinstance(idx, tuple): + idx = (idx,) + + return (agg.name, idx) + else: + raise RuntimeError("invalid lvalue '%s'" % expr) + class ExpressionInstruction(InstructionBase): """ @@ -402,7 +469,7 @@ class ExpressionInstruction(InstructionBase): .. attribute:: expression - The following instance variables are only used until + The following attributes are only used until :func:`loopy.make_kernel` is finished: .. 
attribute:: temp_var_type @@ -415,8 +482,8 @@ class ExpressionInstruction(InstructionBase): set("assignee expression temp_var_type".split()) def __init__(self, - id, assignee, expression, - forced_iname_deps=frozenset(), insn_deps=set(), boostable=None, + assignee, expression, + id=None, forced_iname_deps=frozenset(), insn_deps=set(), boostable=None, boostable_into=None, temp_var_type=None, priority=0): @@ -461,24 +528,12 @@ class ExpressionInstruction(InstructionBase): @memoize_method def assignees_and_indices(self): - from pymbolic.primitives import Variable, Subscript - - if isinstance(self.assignee, Variable): - return [(self.assignee.name, ())] - elif isinstance(self.assignee, Subscript): - agg = self.assignee.aggregate - assert isinstance(agg, Variable) - var_name = agg.name - - idx = self.assignee.index - if not isinstance(idx, tuple): - idx = (idx,) + return [_get_assignee_and_index(self.assignee)] - return [(agg.name, idx)] - else: - raise RuntimeError("invalid lvalue '%s'" % self.assignee) - - return var_name + def with_transformed_expressions(self, f, *args): + return self.copy( + assignee=f(self.assignee, *args), + expression=f(self.expression, *args)) # }}} @@ -486,30 +541,177 @@ class ExpressionInstruction(InstructionBase): result = "%s: %s <- %s" % (self.id, self.assignee, self.expression) - if self.boostable is True: - if self.boostable_into: - result += " (boostable into '%s')" % ",".join(self.boostable_into) - else: - result += " (boostable)" - elif self.boostable is False: - result += " (not boostable)" - elif self.boostable is None: - pass - else: - raise RuntimeError("unexpected value for Instruction.boostable") + options = self.get_str_options() + if options: + result += " (%s)" % (": ".join(options)) - options = [] + return result - if self.insn_deps: - options.append("deps="+":".join(self.insn_deps)) - if self.priority: - options.append("priority=%d" % self.priority) - return result +def _remove_common_indentation(code): + if not "\n" in 
code: + return code + + # accommodate pyopencl-ish syntax highlighting + code = code.lstrip("//CL//") + + if not code.startswith("\n"): + raise ValueError("expected newline as first character " + "in literal lines") + + lines = code.split("\n") + while lines[0].strip() == "": + lines.pop(0) + while lines[-1].strip() == "": + lines.pop(-1) + + if lines: + base_indent = 0 + while lines[0][base_indent] in " \t": + base_indent += 1 + + for line in lines[1:]: + if line[:base_indent].strip(): + raise ValueError("inconsistent indentation") + + return "\n".join(line[base_indent:] for line in lines) class CInstruction(InstructionBase): - pass + """ + .. attribute:: iname_exprs + + A list of tuples *(name, expr)* of inames or expressions based on them + that the instruction needs access to. + + .. attribute:: code + + The C code to be executed. + + The code should obey the following rules: + + * It should only write to temporary variables, specifically the + temporary variables + + .. note:: + + Of course, nothing in :mod:`loopy` will prevent you from doing + 'forbidden' things in your C code. If you ignore the rules and + something breaks, you get to keep both pieces. + + .. attribute:: read_variables + + A :class:`frozenset` of variable names that :attr:`code` reads. This is + optional and only used for figuring out dependencies. + + .. attribute:: assignees + + A sequence of variable references (with or without subscript) as + :class:`pymbolic.primitives.Expression` instances that :attr:`code` + writes to. This is optional and only used for figuring out dependencies. 
+ """ + + fields = InstructionBase.fields | \ + set("iname_exprs code read_variables assignees".split()) + + def __init__(self, + iname_exprs, code, + read_variables=frozenset(), assignees=frozenset(), + id=None, insn_deps=set(), forced_iname_deps=frozenset(), priority=0, + boostable=None, boostable_into=None): + """ + :arg iname_exprs: Like :attr:`iname_exprs`, but instead of tuples, + simple strings representing inames are also allowed. A single + string is also allowed, which should consist of comma-separated + inames. + :arg assignees: Like :attr:`assignees`, but may also be a + semicolon-separated string of such expressions or a + sequence of strings parseable into the desired format. + """ + + InstructionBase.__init__(self, + id=id, + forced_iname_deps=forced_iname_deps, + insn_deps=insn_deps, boostable=boostable, + boostable_into=boostable_into, + priority=priority) + + # {{{ normalize iname_exprs + + if isinstance(iname_exprs, str): + iname_exprs = [i.strip() for i in iname_exprs.split(",")] + iname_exprs = [i for i in iname_exprs if i] + + from pymbolic import var + new_iname_exprs = [] + for i in iname_exprs: + if isinstance(i, str): + new_iname_exprs.append((i, var(i))) + else: + new_iname_exprs.append(i) + + # }}} + + # {{{ normalize assignees + + if isinstance(assignees, str): + assignees = [i.strip() for i in assignees.split(";")] + assignees = [i for i in assignees if i] + + new_assignees = [] + from loopy.symbolic import parse + for i in assignees: + if isinstance(i, str): + new_assignees.append(parse(i)) + else: + new_assignees.append(i) + # }}} + + self.iname_exprs = new_iname_exprs + self.code = _remove_common_indentation(code) + self.read_variables = read_variables + self.assignees = new_assignees + + # {{{ abstract interface + + def read_dependency_names(self): + result = set(self.read_variables) + + from loopy.symbolic import get_dependencies + for name, iname_expr in self.iname_exprs: + result.update(get_dependencies(iname_expr)) + + return 
frozenset(result) + + def reduction_inames(self): + return set() + + def assignees_and_indices(self): + return [_get_assignee_and_index(expr) + for expr in self.assignees] + + def with_transformed_expressions(self, f, *args): + return self.copy( + iname_exprs=[ + (name, f(expr, *args)) + for name, expr in self.iname_exprs], + assignees=[f(a, *args) for a in self.assignees]) + + # }}} + + def __str__(self): + first_line = "%s: %s <- CODE(%s|%s)" % (self.id, + ", ".join(str(a) for a in self.assignees), + ", ".join(str(x) for x in self.read_variables), + ", ".join("%s=%s" % (name, expr) + for name, expr in self.iname_exprs)) + + options = self.get_str_options() + if options: + first_line += " (%s)" % (": ".join(options)) + + return first_line + "\n " + "\n ".join( + self.code.split("\n")) # }}} diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 51cc266db..72baba59d 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -99,6 +99,10 @@ def find_all_insn_inames(kernel): deps & kernel.all_inames() | insn.forced_iname_deps) + assert isinstance(read_deps, frozenset), type(insn) + assert isinstance(write_deps, frozenset), type(insn) + assert isinstance(iname_deps, frozenset), type(insn) + insn_id_to_inames[insn.id] = iname_deps insn_assignee_inames[insn.id] = write_deps & kernel.all_inames() @@ -174,6 +178,9 @@ def find_all_insn_inames(kernel): if not did_something: break + for v in insn_id_to_inames.itervalues(): + assert isinstance(v, frozenset) + return insn_id_to_inames # }}} diff --git a/loopy/precompute.py b/loopy/precompute.py index 1c6634b13..3b33e895a 100644 --- a/loopy/precompute.py +++ b/loopy/precompute.py @@ -672,8 +672,10 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, invg = InvocationGatherer(kernel, subst_name, subst_tag, within) + import loopy as lp for insn in kernel.instructions: - invg(insn.expression, insn.id) + if isinstance(insn, lp.ExpressionInstruction): + invg(insn.expression, insn.id) for invdesc in 
invg.invocation_descriptors: invocation_descriptors.append( @@ -857,15 +859,15 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, else: dtype = np.dtype(dtype) - from loopy.kernel.data import TemporaryVariable + import loopy as lp new_temporary_variables = kernel.temporary_variables.copy() - temp_var = TemporaryVariable( + temp_var = lp.TemporaryVariable( name=target_var_name, dtype=dtype, base_indices=(0,)*len(non1_storage_shape), shape=tuple(non1_storage_shape), - is_local=None) + is_local=lp.auto) new_temporary_variables[target_var_name] = temp_var diff --git a/loopy/preprocess.py b/loopy/preprocess.py index f71d98bba..d41a36405 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -25,6 +25,7 @@ THE SOFTWARE. import pyopencl as cl import pyopencl.characterize as cl_char +from loopy.diagnostic import LoopyError, LoopyWarning import logging logger = logging.getLogger(__name__) @@ -36,31 +37,38 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if var_name in kernel.all_params(): return kernel.index_dtype + def debug(s): + logger.debug("%s: %s" % (kernel.name, s)) + dtypes = [] + import loopy as lp + from loopy.codegen.expression import DependencyTypeInferenceFailure for writer_insn_id in kernel.writer_map().get(var_name, []): - expr = subst_expander( - kernel.id_to_insn[writer_insn_id].expression, - insn_id=writer_insn_id) + writer_insn = kernel.id_to_insn[writer_insn_id] + if not isinstance(writer_insn, lp.ExpressionInstruction): + continue + + expr = subst_expander(writer_insn.expression, insn_id=writer_insn_id) try: - logger.debug(" via expr %s" % expr) + debug(" via expr %s" % expr) result = type_inf_mapper(expr) - logger.debug(" result: %s" % result) + debug(" result: %s" % result) dtypes.append(result) except DependencyTypeInferenceFailure, e: - logger.debug(" failed: %s" % e) + debug(" failed: %s" % e) if not dtypes: return None from pytools import is_single_valued if not is_single_valued(dtypes): - raise 
RuntimeError("ambiguous type inference for '%s'" + raise LoopyError("ambiguous type inference for '%s'" % var_name) return dtypes[0] @@ -89,6 +97,20 @@ class _DictUnionView: def infer_unknown_types(kernel, expect_completion=False): """Infer types on temporaries and argumetns.""" + logger.debug("%s: infer types" % kernel.name) + + def debug(s): + logger.debug("%s: %s" % (kernel.name, s)) + + if kernel.substitutions: + from warnings import warn + warn("type inference called when substitution " + "rules are still unexpanded, expanding", + LoopyWarning, stacklevel=2) + + from loopy.subst import expand_subst + kernel = expand_subst(kernel) + new_temp_vars = kernel.temporary_variables.copy() new_arg_dict = kernel.arg_dict.copy() @@ -127,27 +149,27 @@ def infer_unknown_types(kernel, expect_completion=False): while queue: item = queue.pop(0) - logger.debug("inferring type for %s %s" % (type(item).__name__, item.name)) + debug("inferring type for %s %s" % (type(item).__name__, item.name)) result = _infer_var_type(kernel, item.name, type_inf_mapper, subst_expander) failed = result is None if not failed: - logger.debug(" success: %s" % result) + debug(" success: %s" % result) if isinstance(item, TemporaryVariable): new_temp_vars[item.name] = item.copy(dtype=result) elif isinstance(item, KernelArgument): new_arg_dict[item.name] = item.copy(dtype=result) else: - raise RuntimeError("unexpected item type in type inference") + raise LoopyError("unexpected item type in type inference") else: - logger.debug(" failure") + debug(" failure") if failed: if item.name in failed_names: # this item has failed before, give up. if expect_completion: - raise RuntimeError( + raise LoopyError( "could not determine type of '%s'" % item.name) else: # We're done here. 
@@ -183,8 +205,11 @@ def infer_unknown_types(kernel, expect_completion=False): # {{{ decide which temporaries are local def mark_local_temporaries(kernel): + logger.debug("%s: mark local temporaries" % kernel.name) + new_temp_vars = {} from loopy.kernel.data import LocalIndexTagBase + import loopy as lp writers = kernel.writer_map() @@ -194,7 +219,7 @@ def mark_local_temporaries(kernel): # Only fill out for variables that do not yet know if they're # local. (I.e. those generated by implicit temporary generation.) - if temp_var.is_local is not None: + if temp_var.is_local is not lp.auto: new_temp_vars[temp_var.name] = temp_var continue @@ -251,7 +276,7 @@ def mark_local_temporaries(kernel): is_local = wants_to_be_local_per_insn[0] from pytools import all if not all(wtbl == is_local for wtbl in wants_to_be_local_per_insn): - raise RuntimeError("not all instructions agree on whether " + raise LoopyError("not all instructions agree on whether " "temporary '%s' should be in local memory" % temp_var.name) new_temp_vars[temp_var.name] = temp_var.copy(is_local=is_local) @@ -276,6 +301,8 @@ def realize_reduction(kernel, insn_id_filter=None): be realized. 
""" + logger.debug("%s: realize reduction" % kernel.name) + new_insns = [] var_name_gen = kernel.get_var_name_generator() @@ -306,7 +333,7 @@ def realize_reduction(kernel, insn_id_filter=None): outer_insn_inames = temp_kernel.insn_inames(insn) bad_inames = set(expr.inames) & outer_insn_inames if bad_inames: - raise RuntimeError("reduction used within loop(s) that it was " + raise LoopyError("reduction used within loop(s) that it was " "supposed to reduce over: " + ", ".join(bad_inames)) new_id = temp_kernel.make_unique_instruction_id( @@ -346,13 +373,15 @@ def realize_reduction(kernel, insn_id_filter=None): temp_kernel = kernel + import loopy as lp while insn_queue: new_insn_insn_deps = set() generated_insns = [] insn = insn_queue.pop(0) - if insn_id_filter is not None and insn.id != insn_id_filter: + if insn_id_filter is not None and insn.id != insn_id_filter \ + or not isinstance(insn, lp.ExpressionInstruction): new_insns.append(insn) continue @@ -423,10 +452,11 @@ class ExtraInameIndexInserter(IdentityMapper): def duplicate_private_temporaries_for_ilp(kernel): + logger.debug("%s: duplicate temporaries for ilp" % kernel.name) + wmap = kernel.writer_map() from loopy.kernel.data import IlpBaseTag - from loopy.symbolic import get_dependencies var_to_new_ilp_inames = {} @@ -440,13 +470,13 @@ def duplicate_private_temporaries_for_ilp(kernel): if isinstance(kernel.iname_to_tag.get(iname), IlpBaseTag)) referenced_ilp_inames = (ilp_inames - & get_dependencies(writer_insn.assignee)) + & writer_insn.write_dependency_names()) new_ilp_inames = ilp_inames - referenced_ilp_inames if tv.name in var_to_new_ilp_inames: if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]): - raise RuntimeError("instruction '%s' requires adding " + raise LoopyError("instruction '%s' requires adding " "indices for ILP inames '%s' on var '%s', but previous " "instructions required inames '%s'" % (writer_insn_id, ", ".join(new_ilp_inames), @@ -502,9 +532,7 @@ def 
duplicate_private_temporaries_for_ilp(kernel): for var_name, inames in var_to_new_ilp_inames.iteritems())) new_insns = [ - insn.copy( - assignee=eiii(insn.assignee), - expression=eiii(insn.expression)) + insn.with_transformed_expressions(eiii) for insn in kernel.instructions] return kernel.copy( @@ -517,6 +545,8 @@ def duplicate_private_temporaries_for_ilp(kernel): # {{{ automatic dependencies, find boostability of instructions def add_boostability_and_automatic_dependencies(kernel): + logger.debug("%s: automatic deps, boostability" % kernel.name) + writer_map = kernel.writer_map() arg_names = set(arg.name for arg in kernel.args) @@ -604,6 +634,8 @@ def limit_boostability(kernel): and then limits boostability to just those inames. """ + logger.debug("%s: limit boostability" % kernel.name) + iname_occurs_with = {} for insn in kernel.instructions: insn_inames = kernel.insn_inames(insn) @@ -621,7 +653,7 @@ def limit_boostability(kernel): new_insns = [] for insn in kernel.instructions: if insn.boostable is None: - raise RuntimeError("insn '%s' has undetermined boostability" % insn.id) + raise LoopyError("insn '%s' has undetermined boostability" % insn.id) elif insn.boostable: boostable_into = set() for iname in kernel.insn_inames(insn): @@ -654,7 +686,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): if arg.approximately is not None: approximate_arg_values[arg.name] = arg.approximately else: - raise RuntimeError("No approximate arg value specified for '%s'" + raise LoopyError("No approximate arg value specified for '%s'" % arg.name) # {{{ find all array accesses in insn @@ -762,6 +794,8 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): # {{{ assign automatic axes def assign_automatic_axes(kernel, axis=0, local_size=None): + logger.debug("%s: assign automatic axes" % kernel.name) + from loopy.kernel.data import (AutoLocalIndexTagBase, LocalIndexTag) # Realize that at this point in time, axis lengths are already @@ -831,7 +865,7 @@ def 
assign_automatic_axes(kernel, axis=0, local_size=None): axis=recursion_axis, local_size=local_size) if not isinstance(kernel.iname_to_tag.get(iname), AutoLocalIndexTagBase): - raise RuntimeError("trying to reassign '%s'" % iname) + raise LoopyError("trying to reassign '%s'" % iname) new_iname_to_tag = kernel.iname_to_tag.copy() new_iname_to_tag[iname] = new_tag @@ -845,7 +879,12 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): # assignment proceeds in one phase per axis, each time assigning the # smallest-stride available iname to the current axis + import loopy as lp + for insn in kernel.instructions: + if not isinstance(insn, lp.ExpressionInstruction): + continue + auto_axis_inames = [ iname for iname in kernel.insn_inames(insn) @@ -900,6 +939,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): # {{{ temp storage adjust for bank conflict def adjust_local_temp_var_storage(kernel): + logger.debug("%s: adjust temp var storage" % kernel.name) + new_temp_vars = {} lmem_size = cl_char.usable_local_mem_size(kernel.device) @@ -975,49 +1016,35 @@ def adjust_local_temp_var_storage(kernel): def preprocess_kernel(kernel): - logger.info("preprocess %s: start" % kernel.name) + logger.info("%s: preprocess start" % kernel.name) from loopy.subst import expand_subst - logger.debug("preprocess %s: expand subst" % kernel.name) kernel = expand_subst(kernel) # Ordering restriction: # Type inference doesn't handle substitutions. Get them out of the # way. - logger.debug("preprocess %s: infer types" % kernel.name) kernel = infer_unknown_types(kernel, expect_completion=False) # Ordering restriction: # realize_reduction must happen after type inference because it needs # to be able to determine the types of the reduced expressions. - logger.debug("preprocess %s: realize reduction" % kernel.name) kernel = realize_reduction(kernel) # Ordering restriction: # duplicate_private_temporaries_for_ilp because reduction accumulators # need to be duplicated by this. 
- logger.debug("preprocess %s: duplicate temporaries for ilp" % kernel.name) kernel = duplicate_private_temporaries_for_ilp(kernel) - - logger.debug("preprocess %s: mark local temporaries" % kernel.name) kernel = mark_local_temporaries(kernel) - - logger.debug("preprocess %s: assign automatic axes" % kernel.name) kernel = assign_automatic_axes(kernel) - - logger.debug("preprocess %s: automatic deps, boostability" % kernel.name) kernel = add_boostability_and_automatic_dependencies(kernel) - - logger.debug("preprocess %s: limit boostability" % kernel.name) kernel = limit_boostability(kernel) - - logger.debug("preprocess %s: adjust temp var storage" % kernel.name) kernel = adjust_local_temp_var_storage(kernel) - logger.info("preprocess %s: done" % kernel.name) + logger.info("%s: preprocess done" % kernel.name) return kernel diff --git a/loopy/schedule.py b/loopy/schedule.py index c60505204..93565cde1 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -794,10 +794,10 @@ def generate_loop_schedules(kernel, debug_args={}): from loopy.preprocess import preprocess_kernel kernel = preprocess_kernel(kernel) - from loopy.check import run_automatic_checks - run_automatic_checks(kernel) + from loopy.check import pre_schedule_checks + pre_schedule_checks(kernel) - logger.info("schedule %s: start" % kernel.name) + logger.info("%s: schedule start" % kernel.name) schedule_count = 0 @@ -874,7 +874,7 @@ def generate_loop_schedules(kernel, debug_args={}): raise RuntimeError("no valid schedules found") - logger.info("schedule %s: done" % kernel.name) + logger.info("%s: schedule done" % kernel.name) # }}} diff --git a/loopy/subst.py b/loopy/subst.py index 2c3669c83..171a2daa6 100644 --- a/loopy/subst.py +++ b/loopy/subst.py @@ -30,6 +30,10 @@ from pytools import Record from pymbolic import var +import logging +logger = logging.getLogger(__name__) + + class ExprDescriptor(Record): __slots__ = ["insn", "expr", "unif_var_dict"] @@ -184,12 +188,18 @@ def extract_subst(kernel, 
subst_name, template, parameters): def expand_subst(kernel, ctx_match=None): + logger.debug("%s: expand subst" % kernel.name) + from loopy.symbolic import SubstitutionRuleExpander from loopy.context_matching import parse_stack_match submap = SubstitutionRuleExpander(kernel.substitutions, kernel.get_var_name_generator(), parse_stack_match(ctx_match)) - return submap.map_kernel(kernel) + kernel = submap.map_kernel(kernel) + if ctx_match is None: + return kernel.copy(substitutions={}) + else: + return kernel # vim: foldmethod=marker diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 2352269d4..81837f98d 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -330,8 +330,9 @@ class SubstitutionRuleRenamer(IdentityMapper): def rename_subst_rules_in_instructions(insns, renames): subst_renamer = SubstitutionRuleRenamer(renames) + return [ - insn.copy(expression=subst_renamer(insn.expression)) + insn.with_transformed_expressions(subst_renamer) for insn in insns] @@ -486,14 +487,11 @@ class ExpandingIdentityMapper(IdentityMapper): def map_kernel(self, kernel): new_insns = [ - insn.copy( - # While subst rules are not allowed in assignees, the mapper - # may perform tasks entirely unrelated to subst rules, so - # we must map assignees, too. - assignee=self(insn.assignee, insn.id), - - expression=self(insn.expression, insn.id)) + # While subst rules are not allowed in assignees, the mapper + # may perform tasks entirely unrelated to subst rules, so + # we must map assignees, too. 
+ insn.with_transformed_expressions(self, insn.id) for insn in kernel.instructions] new_substs, renames = self._get_new_substitutions_and_renames() diff --git a/test/test_dg.py b/test/test_dg.py index 8de1c8eec..956bee2d7 100644 --- a/test/test_dg.py +++ b/test/test_dg.py @@ -27,11 +27,15 @@ import numpy as np import pyopencl as cl import loopy as lp +import logging # noqa + from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) def test_dg_volume(ctx_factory): + #logging.basicConfig(level=logging.DEBUG) + dtype = np.float32 dtype4 = cl.array.vec.float4 ctx = ctx_factory() diff --git a/test/test_loopy.py b/test/test_loopy.py index 5407fea58..ea222bc82 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -481,7 +481,7 @@ def test_fuzz_code_generator(ctx_factory): return np.float64 knl = lp.make_kernel(ctx.devices[0], "{ : }", - [lp.ExpressionInstruction(None, "value", expr)], + [lp.ExpressionInstruction("value", expr)], [lp.GlobalArg("value", np.complex128, shape=())] + [ lp.ValueArg(name, get_dtype(val)) @@ -1165,6 +1165,32 @@ def test_convolution_like(ctx_factory): parameters={"im_w": 1024, "im_h": 1024, "f_w": 7}) +def test_c_instruction(ctx_factory): + logging.basicConfig(level=logging.DEBUG) + ctx = ctx_factory() + + knl = lp.make_kernel(ctx.devices[0], [ + "{[i,j]: 0<=i,j<n }", + ], + [ + lp.CInstruction("i", """ + x = sin((float) i); + """, assignees="x"), + "a[i*i] = x", + ], + [ + lp.GlobalArg("a", shape="n"), + lp.ValueArg("n"), + lp.TemporaryVariable("x", np.float32), + ], + assumptions="n>=1") + + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + + print knl + print lp.CompiledKernel(ctx, knl).get_highlighted_code() + + if __name__ == "__main__": import sys if len(sys.argv) > 1: -- GitLab