diff --git a/MEMO b/MEMO index a9257030773de874450e13129960b1b72f9e22b4..b7a492cfce9bfe94c583984d37e5a5e4fc9d49b1 100644 --- a/MEMO +++ b/MEMO @@ -50,13 +50,17 @@ To-do - when are link_inames, duplicate_inames safe? +- rename IndexTag -> InameTag + - Data implementation tags TODO initial bringup: - implemented_arg_info - Arg declaration - Temp var declaration - Adapt padding + - Adapt automatic padding of temp variables - loopy.compiled + - turn base_indices into offset TODO further: - vectorization @@ -64,6 +68,10 @@ To-do - write_image() - change_arg_to_image (test!) + - automatic copies from an array with one set of tags + to the same array with another set. + + - Make tests run on GPUs Fixes: diff --git a/loopy/__init__.py b/loopy/__init__.py index a609ee599fe61e23ba9dedc6db1c7605bcdd9a0e..60954f9cb989d035efad4567c8c3415e8537d7a0 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -45,7 +45,7 @@ class LoopyAdvisory(UserWarning): # {{{ imported user interface from loopy.kernel.data import ( - ValueArg, ScalarArg, GlobalArg, ArrayArg, ConstantArg, ImageArg, + ValueArg, GlobalArg, ConstantArg, ImageArg, default_function_mangler, single_arg_function_mangler, opencl_function_mangler, @@ -802,7 +802,7 @@ def _process_footprint_subscripts(kernel, rule_name, sweep_inames, if not isinstance(fsub, tuple): fsub = (fsub,) - if len(fsub) != arg.dimensions: + if len(fsub) != arg.num_user_axes(): raise ValueError("sweep index '%s' has the wrong number of dimensions") for subst_map in kernel.applied_iname_rewrites: @@ -911,7 +911,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, # {{{ make parameter names and unification template parameters = [] - for i in range(arg.dimensions): + for i in range(arg.num_user_axes()): based_on = "%s_dim_%d" % (c_name, i) if dim_arg_names is not None and i < len(dim_arg_names): based_on = dim_arg_names[i] diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index a369ad4a7e78a0e2e969d20aecff3d3eda1aaade..750330db17c6d90ffb46191a4c3dc955f1591208 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -215,16 +215,25 @@ class POD(PODBase): # }}} +class CLArgumentInfo(Record): + """ + .. attribute:: name + .. attribute:: base_name + .. attribute:: dtype + .. attribute:: shape + .. 
attribute:: offset_for_name + """ + + # {{{ main code generation entrypoint def generate_code(kernel, with_annotation=False, allow_complex=None): from cgen import (FunctionBody, FunctionDeclaration, - Value, ArrayOf, Module, Block, + Value, Module, Block, Line, Const, LiteralLines, Initializer) - from cgen.opencl import (CLKernel, CLGlobal, CLRequiredWorkGroupSize, - CLLocal, CLImage, CLConstant) + from cgen.opencl import (CLKernel, CLRequiredWorkGroupSize) allow_complex = False for var in kernel.args + list(kernel.temporary_variables.itervalues()): @@ -246,46 +255,32 @@ def generate_code(kernel, with_annotation=False, # {{{ examine arg list - def restrict_ptr_if_not_nvidia(arg): - from cgen import Pointer, RestrictPointer + from loopy.kernel.data import ImageArg, ValueArg + from loopy.kernel.array import ArrayBase - if "nvidia" in kernel.device.platform.name.lower(): - return Pointer(arg) - else: - return RestrictPointer(arg) - - has_image = False + arg_decls = [] + cl_arg_info = [] - from loopy.kernel.data import GlobalArg, ConstantArg, ImageArg, ValueArg - - args = [] for arg in kernel.args: - if isinstance(arg, (ConstantArg, GlobalArg)): - arg_decl = restrict_ptr_if_not_nvidia( - POD(arg.dtype, arg.name)) - if arg_decl.name not in kernel.get_written_variables(): - arg_decl = Const(arg_decl) - if isinstance(arg, ConstantArg): - arg_decl = CLConstant(arg_decl) - else: - arg_decl = CLGlobal(arg_decl) - elif isinstance(arg, ImageArg): - if arg.name in kernel.get_written_variables(): - mode = "w" - else: - mode = "r" - - arg_decl = CLImage(arg.dimensions, mode, arg.name) + if isinstance(arg, ArrayBase): + for cdecl, clai in arg.decl_info( + is_written=arg.name in kernel.get_written_variables(), + index_dtype=kernel.index_dtype): + arg_decls.append(cdecl) + cl_arg_info.append(clai) - has_image = True elif isinstance(arg, ValueArg): - arg_decl = Const(POD(arg.dtype, arg.name)) + arg_decls.append(Const(POD(arg.dtype, arg.name))) + cl_arg_info.append(CLArgumentInfo( + name=arg.name, + base_name=arg.name, + dtype=arg.dtype, + shape=None)) + else: raise ValueError("argument type not understood: '%s'" % type(arg)) - args.append(arg_decl) - - if has_image: + if any(isinstance(arg, ImageArg) for arg in kernel.args): body.append(Initializer(Const(Value("sampler_t", "loopy_sampler")), "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP " "| CLK_FILTER_NEAREST")) @@ -303,20 +298,9 @@ def generate_code(kernel, with_annotation=False, # {{{ build lmem array declarators for temporary variables for tv in kernel.temporary_variables.itervalues(): - temp_var_decl = POD(tv.dtype, tv.name) - - try: - storage_shape = tv.storage_shape - except AttributeError: - storage_shape = tv.shape - - for l in storage_shape: - temp_var_decl = ArrayOf(temp_var_decl, l) - - if tv.is_local: - temp_var_decl = CLLocal(temp_var_decl) - - body.append(temp_var_decl) + for cdecl, clai in tv.decl_info( + is_written=True, index_dtype=kernel.index_dtype): + body.append(cdecl) # }}} @@ -339,7 +323,7 @@ def generate_code(kernel, with_annotation=False, CLRequiredWorkGroupSize( kernel.get_grid_sizes_as_exprs()[1], CLKernel(FunctionDeclaration( - Value("void", kernel.name), args))), + Value("void", kernel.name), arg_decls))), body)) # {{{ handle preambles @@ -374,7 +358,7 @@ def generate_code(kernel, with_annotation=False, assert check_implemented_domains(kernel, gen_code.implemented_domains, result) - return result + return result, cl_arg_info # }}} diff --git a/loopy/codegen/expression.py b/loopy/codegen/expression.py index 
c0f333d5b58ce2eee82778f8d2d238d7ae1d8020..6baae343efbfedf41641e40ded5c34e88c68ddfc 100644 --- a/loopy/codegen/expression.py +++ b/loopy/codegen/expression.py @@ -242,6 +242,19 @@ def dtype_to_type_context(dtype): return None +VEC_AXES = "xyzw" + + +def get_opencl_vec_member(idx): + if idx is None: + return idx + + if idx < len(VEC_AXES): + return VEC_AXES[idx] + else: + return "s%d" % idx + + class LoopyCCodeMapper(RecursiveMapper): def __init__(self, kernel, seen_dtypes, seen_functions, var_subst_map={}, with_annotation=False, allow_complex=False): @@ -340,9 +353,17 @@ class LoopyCCodeMapper(RecursiveMapper): enclosing_prec, type_context)) elif expr.name in self.kernel.arg_dict: arg = self.kernel.arg_dict[expr.name] - from loopy.kernel.data import ShapedArg - if isinstance(arg, ShapedArg) and arg.shape == (): - return "*"+expr.name + from loopy.kernel.array import ArrayBase + if isinstance(arg, ArrayBase): + if arg.shape == (): + if arg.offset: + # FIXME + raise NotImplementedError("in-memory scalar with offset") + else: + return "*"+expr.name + else: + raise RuntimeError("unsubscripted reference to array '%s'" + % expr.name) for mangler in self.kernel.symbol_manglers: result = mangler(expr.name) @@ -374,87 +395,92 @@ class LoopyCCodeMapper(RecursiveMapper): return base_impl(expr, enclosing_prec, type_context) if expr.aggregate.name in self.kernel.arg_dict: - arg = self.kernel.arg_dict[expr.aggregate.name] + ary = self.kernel.arg_dict[expr.aggregate.name] + elif expr.aggregate.name in self.kernel.temporary_variables: + ary = self.kernel.temporary_variables[expr.aggregate.name] + else: + raise RuntimeError("nothing known about subscripted variable '%s'" + % expr.aggregate.name) - from loopy.kernel.data import ImageArg - if isinstance(arg, ImageArg): - assert isinstance(expr.index, tuple) - - base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))" - % (arg.name, arg.dimensions, - ", ".join(self.rec(idx, PREC_NONE, 'i') - for idx in expr.index[::-1]))) - - if arg.dtype == np.float32: - return base_access+".x" - if arg.dtype in cl.array.vec.type_to_scalar_and_count: - return base_access - elif arg.dtype == np.float64: - return "as_double(%s.xy)" % base_access - else: - raise NotImplementedError( - "non-floating-point images not supported for now") + from loopy.kernel.array import ArrayBase + if not isinstance(ary, ArrayBase): + raise RuntimeError("subscripted variable '%s' is not an array" + % expr.aggregate.name) - else: - # GlobalArg - index_expr = expr.index - if not isinstance(expr.index, tuple): - index_expr = (index_expr,) + from loopy.kernel.array import get_access_info + from pymbolic import evaluate - if arg.strides is None: - raise RuntimeError("index access to '%s' requires known " - "strides" % arg.name) + access_info = get_access_info(ary, expr.index, + lambda expr: evaluate(expr, self.var_subst_map)) - if len(arg.strides) != len(index_expr): - raise RuntimeError("subscript to '%s' in '%s' has the wrong " - "number of indices (got: %d, expected: %d)" % ( - expr.aggregate.name, expr, - len(index_expr), len(arg.strides))) + vec_member = get_opencl_vec_member(access_info.vector_index) - from pymbolic.primitives import Subscript, Variable - if arg.offset: - offset = Variable(arg.offset) - else: - offset = 0 + from loopy.kernel.data import ImageArg, GlobalArg, TemporaryVariable - return base_impl( - Subscript(expr.aggregate, offset+sum( - stride*expr_i for stride, expr_i in zip( - arg.strides, index_expr))), - enclosing_prec, type_context) + if isinstance(ary, ImageArg): + 
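# Illustrative aside (not part of the patch itself): by this point get_access_info
# has already folded the user-facing indices into one subscript per target axis,
# and vec_member holds the OpenCL vector component ("x".."w", or "s10"-style for
# higher components) whenever a vec-tagged axis was indexed. The branches that
# follow emit either an image read or a pointer/array access.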
base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))" + % (ary.name, ary.dimensions, + ", ".join(self.rec(idx, PREC_NONE, 'i') + for idx in expr.index[::-1]))) + + if ary.dtype == np.float32: + return base_access+".x" + if ary.dtype in cl.array.vec.type_to_scalar_and_count: + return base_access + elif ary.dtype == np.float64: + return "as_double(%s.xy)" % base_access + else: + raise NotImplementedError( + "non-floating-point images not supported for now") + + elif isinstance(ary, (GlobalArg, TemporaryVariable)): + if len(access_info.subscripts) == 0: + if isinstance(ary, GlobalArg): + # unsubscripted global args are pointers + if vec_member is not None: + return "%s%s->%s" % ( + expr.aggregate.name, access_info.array_suffix, + vec_member) + else: + return "*" + expr.aggregate.name+access_info.array_suffix + + else: + # unsubscripted temp vars are scalars + if vec_member is not None: + return "%s%s.%s" % ( + expr.aggregate.name, access_info.array_suffix, + vec_member) + else: + return expr.aggregate.name+access_info.array_suffix - elif expr.aggregate.name in self.kernel.temporary_variables: - temp_var = self.kernel.temporary_variables[expr.aggregate.name] - if isinstance(expr.index, tuple): - index = expr.index else: - index = (expr.index,) + subscript, = access_info.subscripts + result = self.parenthesize_if_needed( + "%s[%s]" % ( + expr.aggregate.name+access_info.array_suffix, + self.rec(subscript, PREC_NONE, 'i')), + enclosing_prec, PREC_CALL) - return (temp_var.name + "".join("[%s]" % self.rec(idx, PREC_NONE, 'i') - for idx in index)) + if vec_member: + result += "."+vec_member + + return result else: - raise RuntimeError( - "nothing known about variable '%s'" % expr.aggregate.name) + assert False def map_linear_subscript(self, expr, enclosing_prec, type_context): - def base_impl(expr, enclosing_prec, type_context): - return self.parenthesize_if_needed( - "%s[%s]" % ( - self.rec(expr.aggregate, PREC_CALL, type_context), - self.rec(expr.index, PREC_NONE, 'i')), - enclosing_prec, PREC_CALL) - from pymbolic.primitives import Variable if not isinstance(expr.aggregate, Variable): - return base_impl(expr, enclosing_prec, type_context) + raise RuntimeError("linear indexing on non-variable: %s" + % expr) if expr.aggregate.name in self.kernel.arg_dict: arg = self.kernel.arg_dict[expr.aggregate.name] from loopy.kernel.data import ImageArg if isinstance(arg, ImageArg): - raise RuntimeError("linear indexing doesn't work on images: %s" + raise RuntimeError("linear indexing is not supported on images: %s" % expr) else: @@ -464,13 +490,14 @@ class LoopyCCodeMapper(RecursiveMapper): else: offset = 0 - from pymbolic.primitives import Subscript - return base_impl( - Subscript(expr.aggregate, offset+expr.index), - enclosing_prec, type_context) + return self.parenthesize_if_needed( + "%s[%s]" % ( + expr.aggregate.name, + self.rec(offset + expr.index, PREC_NONE, 'i')), + enclosing_prec, PREC_CALL) elif expr.aggregate.name in self.kernel.temporary_variables: - raise RuntimeError("linear indexing doesn't work on temporaries: %s" + raise RuntimeError("linear indexing is not supported on temporaries: %s" % expr) else: diff --git a/loopy/compiled.py b/loopy/compiled.py index beb6a645393e4485bda713d6f9e6612a0229d51d..0089c77b0faa18e0e1a12ee7ca64307d498c3ce9 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -122,7 +122,7 @@ class CompiledKernel: self.options = options @memoize_method - def get_kernel_info(self, arg_to_dtype_set, arg_to_has_offset_set): + def get_kernel_info(self, 
arg_to_dtype_set): kernel = self.kernel import loopy as lp @@ -134,28 +134,6 @@ class CompiledKernel: from loopy.preprocess import infer_unknown_types kernel = infer_unknown_types(kernel, expect_completion=True) - if arg_to_has_offset_set: - arg_to_has_offset = dict(arg_to_has_offset_set) - - vng = kernel.get_var_name_generator() - - new_args = [] - for arg in kernel.args: - if getattr(arg, "offset", None) is lp.auto: - if arg_to_has_offset[arg.name]: - offset_arg_name = vng(arg.name+"_offset") - new_args.append(arg.copy(offset=offset_arg_name)) - new_args.append( - lp.ValueArg( - offset_arg_name, kernel.index_dtype)) - else: - new_args.append(arg.copy(offset=0)) - else: - new_args.append(arg) - - kernel = kernel.copy(args=new_args) - - import loopy as lp if kernel.schedule is None: kernel = _get_kernel_from_iterable( lp.generate_loop_schedules(kernel)) @@ -179,14 +157,13 @@ class CompiledKernel: ) @memoize_method - def get_cl_kernel(self, - arg_to_dtype_set, arg_to_has_offset_set, code_op=False): - kernel_info = self.get_kernel_info( - arg_to_dtype_set, arg_to_has_offset_set) + def get_cl_kernel_info(self, + arg_to_dtype_set, code_op=False): + kernel_info = self.get_kernel_info(arg_to_dtype_set) kernel = kernel_info.kernel from loopy.codegen import generate_code - code = generate_code(kernel, **self.codegen_kwargs) + code, cl_arg_info = generate_code(kernel, **self.codegen_kwargs) if code_op == "print": print code @@ -213,35 +190,34 @@ class CompiledKernel: print "[Loopy] "+70*"-" raise - from loopy.kernel.data import ValueArg - arg_types = [] - for arg in kernel.args: - if isinstance(arg, ValueArg): - arg_types.append(arg.dtype) + for arg_info in cl_arg_info: + if arg_info.shape is None: + arg_types.append(arg_info.dtype) else: arg_types.append(None) cl_kernel.set_scalar_arg_dtypes(arg_types) - return kernel_info, cl_kernel + return kernel_info.copy( + cl_kernel=cl_kernel, + cl_arg_info=cl_arg_info) # {{{ debugging aids - def get_code(self, arg_to_dtype=None, arg_to_has_offset=None): + def get_code(self, arg_to_dtype=None): if arg_to_dtype is not None: arg_to_dtype = frozenset(arg_to_dtype.iteritems()) - if arg_to_has_offset is not None: - arg_to_has_offset = frozenset(arg_to_has_offset.iteritems()) - kernel_info = self.get_kernel_info(arg_to_dtype, arg_to_has_offset) + kernel_info = self.get_kernel_info(arg_to_dtype) from loopy.codegen import generate_code - return generate_code(kernel_info.kernel, **self.codegen_kwargs) + code, arg_info = generate_code(kernel_info.kernel, **self.codegen_kwargs) + return code - def get_highlighted_code(self, arg_to_dtype=None, arg_to_has_offset=None): + def get_highlighted_code(self, arg_to_dtype=None): return get_highlighted_code( - self.get_code(arg_to_dtype, arg_to_has_offset)) + self.get_code(arg_to_dtype)) @property def code(self): @@ -274,7 +250,6 @@ class CompiledKernel: import loopy as lp arg_to_dtype = {} - arg_to_has_offset = {} for arg in self.kernel.args: val = kwargs.get(arg.name) @@ -286,24 +261,15 @@ class CompiledKernel: else: arg_to_dtype[arg.name] = dtype - if getattr(arg, "offset", None) is lp.auto: - if val is not None and isinstance(val, cl_array.Array): - has_offset = val.offset != 0 - else: - has_offset = False - arg_to_has_offset[arg.name] = has_offset - - kernel_info, cl_kernel = self.get_cl_kernel( + kernel_info = self.get_cl_kernel_info( frozenset(arg_to_dtype.iteritems()), - frozenset(arg_to_has_offset.iteritems()), code_op) kernel = kernel_info.kernel + cl_kernel = kernel_info.cl_kernel del arg_to_dtype # }}} - import loopy 
as lp - kwargs.update( kernel.domain_parameter_finder()(kwargs)) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 750b6a58b41a2242a9217dca1cae2fdd6e89c074..744805e9c8c715ca5dd8ce50717bbbe6b3095e86 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -29,6 +29,7 @@ import numpy as np from pytools import Record, memoize_method import islpy as isl from islpy import dim_type +import re from pytools import UniqueNameGenerator, generate_unique_possibilities @@ -47,6 +48,39 @@ class CannotBranchDomainTree(RuntimeError): pass +# {{{ unique var names + +def _is_var_name_conflicting_with_longer(name_a, name_b): + # Array dimensions implemented as separate arrays generate + # names by appending '_s<NUMBER>'. Make sure that no + # conflicts can arise from these names. + + # Only deal with the case of b longer than a. + if not name_b.startswith(name_a): + return False + + return re.match("^%s_s[0-9]+" % re.escape(name_a), name_b) is not None + + +def _is_var_name_conflicting(name_a, name_b): + if name_a == name_b: + return True + + return ( + _is_var_name_conflicting_with_longer(name_a, name_b) + or _is_var_name_conflicting_with_longer(name_b, name_a)) + + +class _UniqueVarNameGenerator(UniqueNameGenerator): + def is_name_conflicting(self, name): + from pytools import any + return any( + _is_var_name_conflicting(name, other_name) + for other_name in self.existing_names) + +# }}} + + # {{{ loop kernel object class LoopKernel(Record): @@ -246,7 +280,7 @@ class LoopKernel(Record): | set(self.all_inames())) def get_var_name_generator(self): - return UniqueNameGenerator(self.all_variable_names()) + return _UniqueVarNameGenerator(self.all_variable_names()) def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()): diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py new file mode 100644 index 0000000000000000000000000000000000000000..08da01b98288b377735858282b6ec6a5013edb7b --- /dev/null +++ b/loopy/kernel/array.py @@ -0,0 +1,711 @@ +"""Implementation tagging of array axes.""" + +from __future__ import division + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE.
+""" + +import re +from pytools import Record, memoize_method + +import pyopencl as cl # noqa +import pyopencl.array # noqa + +import numpy as np + + +# {{{ array dimension tags + +class ArrayDimImplementationTag(Record): + pass + + +class _StrideArrayDimTagBase(ArrayDimImplementationTag): + pass + + +class FixedStrideArrayDimTag(_StrideArrayDimTagBase): + """An arg dimension implementation tag for a fixed (potentially + symbolic) stride. + + The stride is given in units of :attr:`ArrayBase.dtype`. + + .. attribute :: target_axis + + For objects (such as images) with more than one axis, *target_axis* + sets which of these indices is being targeted by this dimension. + Note that there may be multiple dim_tags with the same *target_axis*, + their contributions are combined additively. + + Note that "normal" arrays only have one *target_axis*. + """ + + def __init__(self, stride, target_axis=0): + _StrideArrayDimTagBase.__init__(self, stride=stride, target_axis=target_axis) + self.stride = stride + self.target_axis = target_axis + + def __str__(self): + return "stride:%s->%d" % (self.stride, self.target_axis) + + def map_expr(self, mapper): + return self.copy(stride=mapper(self.stride)) + + +class ComputedStrideArrayDimTag(_StrideArrayDimTagBase): + """ + :arg order: "C" or "F", indicating whether this argument dimension will be added + as faster-moving ("C") or more-slowly-moving ("F") than the previous + argument. + :arg pad_to: :attr:`ArrayBase.dtype` granularity to which to pad this dimension + + This type of stride arg dim gets converted to :class:`FixedStrideArrayDimTag` + on input to :class:`ArrayBase` subclasses. + """ + + def __init__(self, order, pad_to=None, target_axis=0): + order = order.upper() + if order not in "CF": + raise ValueError("'order' must be either 'C' or 'F'") + + _StrideArrayDimTagBase.__init__(self, order=order, pad_to=pad_to, + target_axis=target_axis) + + def __str__(self): + if self.pad_to is None: + return self.order + else: + return "%s(pad=%s)" % (self.order, self.pad_to) + + def map_expr(self, mapper): + raise TypeError("ComputedStrideArrayDimTag is a transient type only used " + "for construction of arrays. 
It should never have to map its " + "expressions.") + + +class SeparateArrayArrayDimTag(ArrayDimImplementationTag): + def __str__(self): + return "sep" + + def map_expr(self, mapper): + return self + + +class VectorArrayDimTag(ArrayDimImplementationTag): + def __str__(self): + return "vec" + + def map_expr(self, mapper): + return self + + +PADDED_STRIDE_TAG = re.compile(r"^([a-zA-Z]+)\(pad=(.*)\)$") +TARGET_AXIS_RE = re.compile(r"->([0-9])$") + + +def parse_array_dim_tag(tag): + if isinstance(tag, ArrayDimImplementationTag): + return tag + + if not isinstance(tag, str): + raise TypeError("arg dimension implementation tag must be " + "string or tag object") + + if tag.startswith("stride:"): + from loopy.symbolic import parse + return FixedStrideArrayDimTag(parse(tag[7:])) + elif tag == "sep": + return SeparateArrayArrayDimTag() + elif tag == "vec": + return VectorArrayDimTag() + + target_axis_match = TARGET_AXIS_RE.search(tag) + + if target_axis_match is not None: + target_axis = int(target_axis_match.group(1)) + tag = tag[:target_axis_match.start()] + else: + target_axis = 0 + + if tag in ["c", "C", "f", "F"]: + return ComputedStrideArrayDimTag(tag, target_axis=target_axis) + else: + padded_stride_match = PADDED_STRIDE_TAG.match(tag) + if padded_stride_match is None: + raise ValueError("invalid arg dim tag: '%s'" % tag) + + order = padded_stride_match.group(1) + pad = parse(padded_stride_match.group(2)) + + if order not in ["c", "C", "f", "F"]: + raise ValueError("invalid arg dim tag: '%s'" % tag) + + return ComputedStrideArrayDimTag(order, pad, target_axis=target_axis) + + +def parse_array_dim_tags(dim_tags): + if isinstance(dim_tags, str): + dim_tags = dim_tags.split(",") + + def parse_dim_tag_if_necessary(dt): + if isinstance(dt, str): + dt = parse_array_dim_tag(dt) + return dt + + return [parse_dim_tag_if_necessary(dt) for dt in dim_tags] + + +def convert_computed_to_fixed_dim_tags(name, num_user_axes, num_target_axes, + shape, dim_tags): + + # Just to clarify: + # + # - user axes are user-facing--what the user actually uses for indexing. + # + # - target axes are implementation facing. Normal in-memory arrays have one. + # 3D images have three. 
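# Illustrative aside (assuming the tag classes defined above): an argument
# declared with shape (3, 10, 20) and dim_tags "sep,c,c" has three user axes,
# but the "sep" axis is implemented as three separate arrays (name_s0,
# name_s1, name_s2), each with a single target axis addressed by strides
# (20, 1). A 3D image, by contrast, has three target axes, and each stride
# tag selects one of them through its target_axis attribute.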
+ + # {{{ pick apart arg dim tags into computed, fixed and vec + + vector_dim = None + + # one list of indices into dim_tags for each target axis + computed_stride_dim_tags = [[] for i in range(num_target_axes)] + fixed_stride_dim_tags = [[] for i in range(num_target_axes)] + + for i, dt in enumerate(dim_tags): + if isinstance(dt, VectorArrayDimTag): + if vector_dim is not None: + raise ValueError("arg '%s' may only have one vector-tagged " + "argument dimension" % name) + + vector_dim = i + + elif isinstance(dt, FixedStrideArrayDimTag): + fixed_stride_dim_tags[dt.target_axis].append(i) + + elif isinstance(dt, ComputedStrideArrayDimTag): + if dt.order in "cC": + computed_stride_dim_tags[dt.target_axis].insert(0, i) + elif dt.order in "fF": + computed_stride_dim_tags[dt.target_axis].append(i) + else: + raise ValueError("invalid value '%s' for " + "ComputedStrideArrayDimTag.order" % dt.order) + + elif isinstance(dt, SeparateArrayArrayDimTag): + pass + + else: + raise ValueError("invalid array dim tag") + + # }}} + + # {{{ convert computed to fixed stride dim tags + + new_dim_tags = dim_tags[:] + + for target_axis in range(num_target_axes): + if (computed_stride_dim_tags[target_axis] + and fixed_stride_dim_tags[target_axis]): + error_msg = "computed and fixed stride arg dim tags may " \ + "not be mixed for argument '%s'" % name + + if num_target_axes > 1: + error_msg += " (target axis %d)" % target_axis + + raise ValueError(error_msg) + + stride_so_far = 1 + + if fixed_stride_dim_tags[target_axis]: + for i in fixed_stride_dim_tags[target_axis]: + dt = dim_tags[i] + new_dim_tags[i] = dt + else: + for i in computed_stride_dim_tags[target_axis]: + dt = dim_tags[i] + new_dim_tags[i] = FixedStrideArrayDimTag(stride_so_far) + + if shape is None: + # unable to normalize without known shape + return None + + stride_so_far *= shape[i] + + if dt.pad_to is not None: + from pytools import div_ceil + stride_so_far = ( + div_ceil(stride_so_far, dt.pad_to) + * dt.pad_to) + + # }}} + + return new_dim_tags + +# }}} + + +# {{{ array base class (for arguments and temporary arrays) + +def _pymbolic_parse_if_necessary(x): + if isinstance(x, str): + from pymbolic import parse + return parse(x) + else: + return x + + +def _parse_shape_or_strides(x): + import loopy as lp + if x == "auto": + from warnings import warn + warn("use of 'auto' as a shape or stride won't work " + "any more--use loopy.auto instead", + stacklevel=3) + x = _pymbolic_parse_if_necessary(x) + if isinstance(x, lp.auto): + return x + if not isinstance(x, tuple): + assert x is not lp.auto + x = (x,) + + return tuple(_pymbolic_parse_if_necessary(xi) for xi in x) + + +class ArrayBase(Record): + """ + .. attribute:: name + + .. attribute:: dtype + + .. attribute:: shape + + .. attribute:: dim_tags + + a list of :class:`ArrayDimImplementationTag` instances, + or a list of strings that :func:`parse_array_dim_tag` understands, + or a comma-separated string of such tags. + + .. attribute:: offset + + """ + + # Note that order may also wind up in attributes, if the + # number of dimensions has not yet been determined. + + def __init__(self, name, dtype=None, shape=None, dim_tags=None, offset=0, + strides=None, order=None, **kwargs): + """ + All of the following are optional. Specify either strides or shape. + + :arg name: May contain multiple names separated by + commas, in which case multiple arguments, + each with identical properties, are created + for each name. + :arg dtype: the :class:`numpy.dtype` of the array.
+ If this is *None*, :mod:`loopy` will try to continue + without knowing the type of this array. + + Note that some operations, such as :func:`loopy.add_padding` + will not work without the *dtype*. + + :class:`loopy.CompiledKernel` will automatically compile a kernel + with the right dtype when called with a concrete array on a kernel + with an argument whose *dtype* is *None*. + :arg shape: May be one of the following: + + * *None*. In this case, no shape is intended to be specified, + only the strides will be used to access the array. Bounds checking + will not be performed. + + * :class:`loopy.auto`. The shape will be determined by finding the + access footprint. + + * a tuple like :attr:`numpy.ndarray.shape`. + + Each entry of the tuple is also allowed to be a :mod:`pymbolic` + expression involving kernel parameters, or a (potentially + comma-separated) string that can be parsed to such an expression. + + * A string which can be parsed into the previous form. + + :arg dim_tags: A comma-separated list of tags as understood by + :func:`parse_array_dim_tag`. + + :arg strides: May be one of the following: + + * None + + * :class:`loopy.auto`. The strides will be determined by *order* + and the access footprint. + + * a tuple like :attr:`numpy.ndarray.strides`, but in units of the + array's *dtype*. + + Each entry of the tuple is also allowed to be a :mod:`pymbolic` + expression involving kernel parameters, or a (potentially + comma-separated) string that can be parsed to such an expression. + + * A string which can be parsed into the previous form. + + :arg order: "C" for C (row major) or "F" for Fortran + (column major). Defaults to the *default_order* argument + passed to :func:`loopy.make_kernel`. + :arg offset: Offset from the beginning of the buffer to the point from + which the strides are counted. May be one of + + * 0 + * a string (that is interpreted as an argument name). + * :class:`loopy.auto`, in which case an offset argument + is added automatically, immediately following this argument.
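As a rough illustration (the argument name *a* and size *n* are made up; :class:`GlobalArg` is the subclass defined in :mod:`loopy.kernel.data`), the following two declarations describe the same row-major array::

    GlobalArg("a", np.float32, shape="n,n", order="C")
    GlobalArg("a", np.float32, shape="n,n", dim_tags="stride:n,stride:1")

Both normalize to a pair of :class:`FixedStrideArrayDimTag` instances with strides (n, 1), counted in units of the *dtype*.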
+ """ + + import loopy as lp + + if dtype is not None and dtype is not lp.auto: + dtype = np.dtype(dtype) + + strides_known = strides is not None and strides is not lp.auto + shape_known = shape is not None and shape is not lp.auto + + if strides_known: + strides = _parse_shape_or_strides(strides) + + if shape_known: + shape = _parse_shape_or_strides(shape) + + # {{{ convert strides to dim_tags (Note: strides override order) + + if dim_tags is not None and strides_known: + raise TypeError("may not specify both strides and dim_tags") + + if dim_tags is None and strides_known: + dim_tags = [FixedStrideArrayDimTag(s) for s in strides] + strides = None + + # }}} + + # {{{ determine number of user axes + + num_user_axes = None + if shape_known: + num_user_axes = len(shape) + if dim_tags is not None: + new_num_user_axes = len(dim_tags) + + if num_user_axes is None: + num_user_axes = new_num_user_axes + else: + if new_num_user_axes != num_user_axes: + raise ValueError("contradictory values for number of dimensions " + "from shape, strides, or dim_tags") + + del new_num_user_axes + + # }}} + + # {{{ convert order to dim_tags + + if dim_tags is None and num_user_axes is not None and order is not None: + dim_tags = num_user_axes*[order] + order = None + + # }}} + + if dim_tags is not None: + dim_tags = parse_array_dim_tags(dim_tags) + + # {{{ find number of target axes + + target_axes = set() + for dt in dim_tags: + if isinstance(dt, _StrideArrayDimTagBase): + target_axes.add(dt.target_axis) + + if target_axes != set(xrange(len(target_axes))): + raise ValueError("target axes for variable '%s' are non-" + "contiguous" % self.name) + + num_target_axes = len(target_axes) + del target_axes + + # }}} + + if not (self.min_target_axes <= num_target_axes <= self.max_target_axes): + raise ValueError("%s only supports between %d and %d target axes " + "('%s' has %d)" % (type(self).__name__, self.min_target_axes, + self.max_target_axes, self.name, num_target_axes)) + + new_dim_tags = convert_computed_to_fixed_dim_tags( + name, num_user_axes, num_target_axes, + shape, dim_tags) + + if new_dim_tags is not None: + # successfully normalized + dim_tags = new_dim_tags + del new_dim_tags + + if dim_tags is not None: + # for hashability + dim_tags = tuple(dim_tags) + order = None + + Record.__init__(self, + name=name, + dtype=dtype, + shape=shape, + dim_tags=dim_tags, + offset=offset, + order=order, + strides=strides, + **kwargs) + + def __str__(self): + import loopy as lp + + info_entries = [type(self).__name__, str(self.dtype)] + + if self.shape is None: + pass + elif self.shape is lp.auto: + info_entries.append("shape: auto") + else: + info_entries.append("shape: (%s)" + % ",".join(str(i) for i in self.shape)) + + if self.dim_tags is not None: + info_entries.append("dim_tags: (%s)" + % ",".join(str(i) for i in self.dim_tags)) + + if self.offset: + info_entries.append("offset: %s" % self.offset) + + return "%s: %s" % (self.name, ", ".join(info_entries)) + + def __repr__(self): + return "<%s>" % self.__str__() + + @property + @memoize_method + def numpy_strides(self): + return tuple(self.dtype.itemsize*s for s in self.strides) + + def num_target_axes(self): + target_axes = set() + for dt in self.dim_tags: + if isinstance(dt, _StrideArrayDimTagBase): + target_axes.add(dt.target_axis) + + return len(target_axes) + + def num_user_axes(self, require_answer=True): + if self.shape is not None: + return len(self.shape) + if self.dim_tags is not None: + return len(self.dim_tags) + if require_answer: + raise 
RuntimeError("number of user axes of array '%s' cannot be found" + % self.name) + else: + return None + + def map_exprs(self, mapper): + """Return a copy of self with all expressions replaced with what *mapper* + transformed them into. + """ + kwargs = {} + import loopy as lp + + if self.shape is not None and self.shape is not lp.auto: + kwargs["shape"] = tuple(mapper(s) for s in self.shape) + + if self.dim_tags is not None: + kwargs["dim_tags"] = [dt.map_expr(mapper) for dt in self.dim_tags] + + # offset is not an expression, do not map. + + return self.copy(**kwargs) + + def decl_info(self, is_written, index_dtype): + """Return a list of tuples ``(cgen_decl, arg_info)``, where + *cgen_decl* is a :mod:`cgen` argument declarations, *arg_info* + is a :class:`CLArgumentInfo` instance. + """ + + from loopy.codegen import CLArgumentInfo + + def gen_decls(name_suffix, shape, dtype, user_index): + if dtype is None: + dtype = self.dtype + + user_axis = len(user_index) + + num_user_axes = self.num_user_axes(require_answer=False) + + if num_user_axes is None or user_axis >= num_user_axes: + # implemented by various argument types + yield (self.get_arg_decl(name_suffix, shape, dtype, is_written), + CLArgumentInfo( + name=self.name + name_suffix, + base_name=self.name, + dtype=dtype, + shape=shape, + offset_for_name=None)) + + if self.offset: + from cgen import Const, POD + yield (Const(POD(index_dtype, + self.name+name_suffix+"_offset")), + CLArgumentInfo( + name=self.name + name_suffix, + base_name=self.name, + dtype=dtype, + shape=shape, + offset_for_name=None)) + + return + + dim_tag = self.dim_tags[user_axis] + + if isinstance(dim_tag, FixedStrideArrayDimTag): + if self.shape is None: + new_shape = shape + (None,) + else: + new_shape = shape + (self.shape[user_axis],) + + for res in gen_decls(name_suffix, new_shape, dtype, + user_index + (None,)): + yield res + + elif isinstance(dim_tag, SeparateArrayArrayDimTag): + shape_i = self.shape[user_axis] + if not isinstance(shape_i, int): + raise RuntimeError("shape of '%s' has non-constant " + "integer axis %d (0-based)" % ( + self.name, user_axis)) + + for i in xrange(shape_i): + for res in gen_decls(name_suffix + "_s%d" % i, + shape + (self.shape[user_axis],), dtype, + user_index + (i,)): + yield res + + elif isinstance(dim_tag, VectorArrayDimTag): + shape_i = self.shape[user_axis] + if not isinstance(shape_i, int): + raise RuntimeError("shape of '%s' has non-constant " + "integer axis %d (0-based)" % ( + self.name, user_axis)) + + for res in gen_decls(name_suffix, shape, + cl.array.vec.types[dtype, shape_i], + user_index + (None,)): + yield res + + else: + raise RuntimeError("unsupported array dim implementation tag '%s' " + "in array '%s'" % (dim_tag, self.name)) + + for res in gen_decls("", (), self.dtype, ()): + yield res + +# }}} + + +# {{{ access code generation + +class AccessInfo(Record): + """ + :ivar array_suffix: + :ivar vector_index: + :ivar subscripts: List of expressions, one for each target axis + """ + + +def get_access_info(ary, index, eval_expr): + """ + :arg ary: an object of type :class:`ArrayBase` + :arg index: a tuple of indices representing a subscript into ary + """ + if not isinstance(index, tuple): + index = (index,) + + if ary.shape is None: + return AccessInfo(subscripts=index, vector_index=0) + + if len(ary.shape) != len(index): + raise RuntimeError("subscript to '%s[%s]' has the wrong " + "number of indices (got: %d, expected: %d)" % ( + ary.name, index, len(index), len(ary.shape))) + + num_target_axes = 
ary.num_target_axes() + + array_suffix = "" + vector_index = None + subscripts = [0] * num_target_axes + + for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)): + if isinstance(dim_tag, FixedStrideArrayDimTag): + subscripts[dim_tag.target_axis] += dim_tag.stride*idx + elif isinstance(dim_tag, SeparateArrayArrayDimTag): + idx = eval_expr(idx) + if not isinstance(idx, int): + raise RuntimeError("subscript '%s[%s]' has non-constant " + "index for separate-array axis %d (0-based)" % ( + ary.name, index, i)) + array_suffix += "_s%d" % idx + elif isinstance(dim_tag, VectorArrayDimTag): + idx = eval_expr(idx) + + if not isinstance(idx, int): + raise RuntimeError("subscript '%s[%s]' has non-constant " + "index for separate-array axis %d (0-based)" % ( + ary.name, index, i)) + assert vector_index is None + vector_index = idx + else: + raise RuntimeError("unsupported array dim implementation tag '%s' " + "in array '%s'" % (dim_tag, ary.name)) + + from pymbolic import var + import loopy as lp + if ary.offset: + if num_target_axes > 1: + raise NotImplementedError("offsets for multiple image axes") + + offset_name = ary.offset + if offset_name is lp.auto: + offset_name = ary.name+array_suffix+"_offset" + + subscripts[0] = var(offset_name) + subscripts[0] + + return AccessInfo( + array_suffix=array_suffix, + vector_index=vector_index, + subscripts=subscripts) + +# }}} + +# vim: fdm=marker diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 18c95fe9881db1d80e2c96c9da1ddb038b9f58bd..cf136a9f4f04318fcc06038fdbc9882f206036ad 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -694,8 +694,7 @@ def check_for_reduction_inames_duplication_requests(kernel): # {{{ duplicate arguments and expand defines in shapes def dup_args_and_expand_defines_in_shapes(kernel, defines): - import loopy as lp - from loopy.kernel.data import ShapedArg + from loopy.kernel.array import ArrayBase from loopy.kernel.creation import expand_defines_in_expr processed_args = [] @@ -705,14 +704,9 @@ def dup_args_and_expand_defines_in_shapes(kernel, defines): continue new_arg = arg.copy(name=arg_name) - if isinstance(arg, ShapedArg): - if arg.shape is not None and arg.shape is not lp.auto: - new_arg = new_arg.copy( - shape=expand_defines_in_expr(arg.shape, defines)) - if arg.strides is not None and arg.strides is not lp.auto: - new_arg = new_arg.copy( - strides=expand_defines_in_expr( - arg.strides, defines)) + if isinstance(arg, ArrayBase): + new_arg = arg.map_exprs( + lambda expr: expand_defines_in_expr(expr, defines)) processed_args.append(new_arg) @@ -727,15 +721,14 @@ def guess_arg_shape_if_requested(kernel, default_order): new_args = [] import loopy as lp - from loopy.kernel.data import ShapedArg + from loopy.kernel.array import ArrayBase from loopy.symbolic import SubstitutionRuleExpander, AccessRangeMapper submap = SubstitutionRuleExpander(kernel.substitutions, kernel.get_var_name_generator()) for arg in kernel.args: - if isinstance(arg, ShapedArg) and ( - arg.shape is lp.auto or arg.strides is lp.auto): + if isinstance(arg, ArrayBase) and arg.shape is lp.auto: armap = AccessRangeMapper(kernel, arg.name) for insn in kernel.instructions: @@ -783,11 +776,11 @@ def guess_arg_shape_if_requested(kernel, default_order): # {{{ apply default_order to args def apply_default_order_to_args(kernel, default_order): - from loopy.kernel.data import ShapedArg + from loopy.kernel.array import ArrayBase processed_args = [] for arg in kernel.args: - if isinstance(arg, ShapedArg): + if isinstance(arg, 
ArrayBase): arg = arg.copy(order=default_order) processed_args.append(arg) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 1daf63747137bc9da2caa8c78056b7cab01945e9..61399640335f8fd719fa2ca0fe71fc83a1dfb95b 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -27,6 +27,7 @@ THE SOFTWARE. import numpy as np from pytools import Record, memoize_method +from loopy.kernel.array import ArrayBase # {{{ iname tags @@ -147,187 +148,61 @@ def parse_tag(tag): # {{{ arguments -def make_strides(shape, order): - from pyopencl.compyte.array import ( - f_contiguous_strides, - c_contiguous_strides) - - if order == "F": - return f_contiguous_strides(1, shape) - elif order == "C": - return c_contiguous_strides(1, shape) - else: - raise ValueError("invalid order: %s" % order) - class KernelArgument(Record): pass -class ShapedArg(KernelArgument): - def __init__(self, name, dtype=None, shape=None, strides=None, order=None, - offset=0): - """ - All of the following are optional. Specify either strides or shape. - - :arg name: May contain multiple names separated by - commas, in which case multiple arguments, - each with identical properties are created - for each name. - :arg dtype: the :class:`numpy.dtype` of the array. - If this is *None*, :mod:`loopy` will try to continue - without knowing the type of this array. - - Note that some operations, such as :func:`loopy.add_padding` - require this information to work. - - :class:`loopy.CompiledKernel` will automatically compile a kernel - with the right dtype when called with a concrete array on a kernel - with argument whose *dtype* is *None*. - :arg shape: like :attr:`numpy.ndarray.shape`. - Also allowed to be :class:`loopy.auto`, in - which case shape is determined by finding the - access footprint. - - This is also allowed to be an expression involving - kernel parameters, or a (potentially-comma separated) - string that can be parsed to such an expression. - :arg strides: like :attr:`numpy.ndarray.strides`, - but in multiples of data type size. - Also allowed to be :class:`loopy.auto`, in which - case strides are determined from shape and - *default_order* of :func:`loopy.make_kernel`. - - This is also allowed to be an expression involving - kernel parameters, or a (potentially-comma separated) - string that can be parsed to such an expression. - :arg order: "F" or "C" for C (row major) or Fortran - (column major) - :arg offset: Offset from the beginning of the buffer to the point from - which the strides are counted. May be one of - - * 0 - * a string (that is interpreted as an argument name). - * :class:`loopy.auto`, in which case an offset argument - is added automatically, immediately following this argument. - :class:`loopy.CompiledKernel` is even smarter in its treatment of - this case and will compile custom versions of the kernel based on - whether the passed arrays have offsets or not. 
- """ - if dtype is not None: - dtype = np.dtype(dtype) +class GlobalArg(ArrayBase, KernelArgument): + min_target_axes = 0 + max_target_axes = 1 - def parse_if_necessary(x): - if isinstance(x, str): - from pymbolic import parse - return parse(x) - else: - return x - - def process_tuple(x): - if x == "auto": - from warnings import warn - warn("use of 'auto' as a shape or stride won't work " - "any more--use loopy.auto instead", - stacklevel=3) - x = parse_if_necessary(x) - if isinstance(x, lp.auto): - return x - if not isinstance(x, tuple): - assert x is not lp.auto - x = (x,) - - return tuple(parse_if_necessary(xi) for xi in x) - - import loopy as lp - strides_known = strides is not None and strides is not lp.auto - shape_known = shape is not None and shape is not lp.auto - - if strides_known: - strides = process_tuple(strides) - - if shape_known: - shape = process_tuple(shape) - - if not strides_known and shape_known: - if len(shape) == 1: - # don't need order to know that - strides = (1,) - elif order is not None: - strides = make_strides(shape, order) + def get_arg_decl(self, name_suffix, shape, dtype, is_written): + from cgen import RestrictPointer, POD, Const + from cgen.opencl import CLGlobal - Record.__init__(self, - name=name, - dtype=dtype, - strides=strides, - offset=offset, - shape=shape) + arg_decl = RestrictPointer( + POD(dtype, self.name + name_suffix)) - @property - @memoize_method - def numpy_strides(self): - return tuple(self.dtype.itemsize*s for s in self.strides) + if not is_written: + arg_decl = Const(arg_decl) - @property - def dimensions(self): - return len(self.strides) + return CLGlobal(arg_decl) - def __str__(self): - import loopy as lp - if self.shape is None: - shape = "unknown" - elif self.shape is lp.auto: - shape = "auto" - else: - shape = ",".join(str(i) for i in self.shape) +class ConstantArg(ArrayBase, KernelArgument): + min_target_axes = 0 + max_target_axes = 1 - if self.strides is None: - strides = "unknown" - elif self.strides is lp.auto: - strides = "auto" + def get_arg_decl(self, name_suffix, shape, dtype, is_written): + if is_written: + mode = "w" else: - strides = ",".join(str(i) for i in self.strides) + mode = "r" - return "%s: %s, type: %s, shape: (%s), strides: (%s)" % ( - self.name, type(self).__name__, self.dtype, shape, - strides) + from cgen.opencl import CLImage + return CLImage(self.num_target_axes(), mode, self.name+name_suffix) - def __repr__(self): - return "<%s>" % self.__str__() +class ImageArg(ArrayBase, KernelArgument): + min_target_axes = 1 + max_target_axes = 3 -class GlobalArg(ShapedArg): - pass - - -class ConstantArg(ShapedArg): - pass + @property + def dimensions(self): + return len(self.dim_tags) + def get_arg_decl(self, name_suffix, shape, dtype, is_written): + from cgen import RestrictPointer, POD, Const + from cgen.opencl import CLConstant -class ImageArg(KernelArgument): - def __init__(self, name, dtype=None, dimensions=None, shape=None): - dtype = np.dtype(dtype) - if shape is not None: - if dimensions is not None and dimensions != len(shape): - raise RuntimeError("cannot specify both shape and " - "disagreeing dimensions in ImageArg") - dimensions = len(shape) - else: - if not isinstance(dimensions, int): - raise RuntimeError("ImageArg: dimensions must be an integer") + arg_decl = RestrictPointer( + POD(dtype, self.name + name_suffix)) - Record.__init__(self, - dimensions=dimensions, - shape=shape, - dtype=dtype, - name=name) + if not is_written: + arg_decl = Const(arg_decl) - def __str__(self): - return "%s: ImageArg, 
type %s" % (self.name, self.dtype) - - def __repr__(self): - return "<%s>" % self.__str__() + return CLConstant(arg_decl) class ValueArg(KernelArgument): @@ -349,26 +224,25 @@ class ValueArg(KernelArgument): # {{{ temporary variable -class TemporaryVariable(Record): - """ - :ivar name: - :ivar dtype: - :ivar shape: - :ivar storage_shape: - :ivar base_indices: - :ivar is_local: +class TemporaryVariable(ArrayBase): + __doc__ = ArrayBase.__doc__ + """ + .. attribute:: storage_shape + .. attribute:: base_indices + .. attribute:: is_local """ - def __init__(self, name, dtype, shape, is_local, base_indices=None, - storage_shape=None): + min_target_axes = 0 + max_target_axes = 1 + + def __init__(self, name, dtype, shape, is_local, + dim_tags=None, offset=0, strides=None, order=None, + base_indices=None, storage_shape=None): if base_indices is None: base_indices = (0,) * len(shape) - if shape is not None and not isinstance(shape, tuple): - shape = tuple(shape) - - Record.__init__(self, name=name, dtype=dtype, shape=shape, is_local=is_local, - base_indices=base_indices, + ArrayBase.__init__(self, name=name, dtype=dtype, shape=shape, + dim_tags=dim_tags, order="C", + base_indices=base_indices, is_local=is_local, storage_shape=storage_shape) @property @@ -376,6 +250,23 @@ class TemporaryVariable(Record): from pytools import product return product(si for si in self.shape)*self.dtype.itemsize + def get_arg_decl(self, name_suffix, shape, dtype, is_written): + from cgen import ArrayOf, POD + from cgen.opencl import CLLocal + + temp_var_decl = POD(self.dtype, self.name) + + # FIXME take into account storage_shape, or something like it + storage_shape = self.shape + + for l in storage_shape: + temp_var_decl = ArrayOf(temp_var_decl, l) + + if self.is_local: + temp_var_decl = CLLocal(temp_var_decl) + + return temp_var_decl + # }}} diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 1dfba0cb0dc2d3109d193e0de00b671f8ebeb5c0..09adb1278fb45bfafd91a2603c5253a163f421ff 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -480,7 +480,10 @@ def duplicate_private_temporaries_for_ilp(kernel): if shape is None: shape = () - new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape) + new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape, + # Forget what you knew about data layout, + # create from scratch. 
+ dim_tags=None) # }}} @@ -957,27 +960,6 @@ def adjust_local_temp_var_storage(kernel): # }}} -# {{{ add automatic offset arguments - -def add_auto_offset_args(kernel): - import loopy as lp - - vng = kernel.get_var_name_generator() - - new_args = [] - for arg in kernel.args: - if getattr(arg, "offset", None) is lp.auto: - offset_arg_name = vng(arg.name+"_offset") - new_args.append(arg.copy(offset=offset_arg_name)) - new_args.append(lp.ValueArg(offset_arg_name, kernel.index_dtype)) - else: - new_args.append(arg) - - return kernel.copy(args=new_args) - -# }}} - - def preprocess_kernel(kernel): from loopy.subst import expand_subst kernel = expand_subst(kernel) @@ -1005,7 +987,6 @@ def preprocess_kernel(kernel): kernel = add_boostability_and_automatic_dependencies(kernel) kernel = limit_boostability(kernel) kernel = adjust_local_temp_var_storage(kernel) - kernel = add_auto_offset_args(kernel) return kernel diff --git a/test/test_loopy.py b/test/test_loopy.py index afc1f0559b183c42be6d14713941f4db7e3cb0d8..5875912bacd96df81f443180f768276a603412ac 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -681,7 +681,7 @@ def test_dependent_loop_bounds_2(ctx_factory): [ lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto), lp.GlobalArg("a_indices", np.int32, shape=lp.auto), - lp.GlobalArg("a_values", dtype), + lp.GlobalArg("a_values", dtype, strides=(1,)), lp.GlobalArg("ax", dtype, shape=lp.auto), lp.ValueArg("n", np.int32), ], @@ -1117,7 +1117,7 @@ def test_array_with_offset(ctx_factory): a_full = cl.clrandom.rand(queue, (n, n), np.float64) a = a_full[3:10] - print cknl.get_highlighted_code({"a": a.dtype}, {"a": True, "b": False}) + print cknl.get_highlighted_code({"a": a.dtype}) evt, (b,) = cknl(queue, a=a) import numpy.linalg as la
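A quick usage sketch, not part of the patch above: it assumes the module layout introduced by this diff (loopy.kernel.array / loopy.kernel.data), an importable loopy with its pyopencl and pymbolic dependencies, and made-up names a, i, j.

    import numpy as np
    import loopy as lp
    from pymbolic import var
    from loopy.kernel.data import GlobalArg
    from loopy.kernel.array import get_access_info

    # A 16x16 row-major array with an automatically added offset argument.
    a = GlobalArg("a", np.float32, shape=(16, 16), order="C", offset=lp.auto)

    # order="C" was normalized into fixed strides, in units of the dtype.
    print [str(tag) for tag in a.dim_tags]  # e.g. ['stride:16->0', 'stride:1->0']

    # The two user indices are folded into a single linear subscript for the
    # one target axis, with the automatic offset variable added in front.
    info = get_access_info(a, (var("i"), var("j")), lambda expr: expr)
    print info.subscripts  # roughly [a_offset + 16*i + j]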