diff --git a/loopy/__init__.py b/loopy/__init__.py index d27a3225bfe9bcc30b5b8342d452b881692262f1..48dd2ac41ad7040f7a28f32edd553ae0d3d4da5b 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -22,6 +22,7 @@ register_mpz_with_pymbolic() +# TODO: Wrong 19 # TODO: Try, fix reg. prefetch # TODO: Divisibility # TODO: nD Texture access @@ -471,6 +472,16 @@ class ArrayArg: +class ImageArg: + def __init__(self, name, dtype, dimensions): + self.name = name + self.dtype = np.dtype(dtype) + self.dimensions = dimensions + + def __repr__(self): + return "<ImageArg '%s' of type %s>" % (self.name, self.dtype) + + class ScalarArg: def __init__(self, name, dtype, approximately): self.name = name @@ -1140,27 +1151,36 @@ class LoopyCCodeMapper(CCodeMapper): PREC_SUM)) for iname in pf.inames) - offset = 0 - if isinstance(expr.aggregate, Variable): arg = self.kernel.arg_dict[expr.aggregate.name] - offset = arg.offset - - index_expr = expr.index - if isinstance(expr.index, tuple): - ary_strides = arg.strides - if ary_strides is None: - raise RuntimeError("tuple-indexed variable '%s' does not " - "have stride information" % expr.aggregate.name) + + if isinstance(arg, ImageArg): + if arg.dtype != np.float32: + raise NotImplementedError( + "non-float32 images not supported for now") + + assert isinstance(expr.index, tuple) + return ("read_imagef(%s, loopy_sampler, (float%d)(%s)).x" + % (arg.name, arg.dimensions, + ", ".join(self.rec(idx, PREC_NONE) + for idx in expr.index))) else: - ary_strides = (1,) - index_expr = (index_expr,) + # ArrayArg + index_expr = expr.index + if isinstance(expr.index, tuple): + ary_strides = arg.strides + if ary_strides is None: + raise RuntimeError("tuple-indexed variable '%s' does not " + "have stride information" % expr.aggregate.name) + else: + ary_strides = (1,) + index_expr = (index_expr,) - from pymbolic.primitives import Subscript - return CCodeMapper.map_subscript(self, - Subscript(expr.aggregate, offset+sum( - stride*expr_i for stride, expr_i in zip( - ary_strides, index_expr))), enclosing_prec) + from pymbolic.primitives import Subscript + return CCodeMapper.map_subscript(self, + Subscript(expr.aggregate, arg.offset+sum( + stride*expr_i for stride, expr_i in zip( + ary_strides, index_expr))), enclosing_prec) return CCodeMapper.map_subscript(self, expr, enclosing_prec) @@ -1336,9 +1356,13 @@ def generate_prefetch_code(cgs, kernel, sched_index, implemented_domain): index_expr = (index_expr,) arg = kernel.arg_dict[pf.input_vector] - ary_strides = arg.strides - if ary_strides is None and len(index_expr) == 1: - ary_strides = (1,) + if isinstance(arg, ImageArg): + # arbitrary + ary_strides = (1, 1, 1)[:arg.dimensions] + else: + ary_strides = arg.strides + if ary_strides is None and len(index_expr) == 1: + ary_strides = (1,) iname_to_stride = {} for iexpr_i, stride in zip(index_expr, ary_strides): @@ -1803,9 +1827,10 @@ class CodeGenerationState(Record): def generate_code(kernel): from cgen import (FunctionBody, FunctionDeclaration, POD, Value, ArrayOf, Module, Block, - Define, Line, Const, LiteralLines) + Define, Line, Const, LiteralLines, Initializer) - from cgen.opencl import CLKernel, CLGlobal, CLRequiredWorkGroupSize, CLLocal + from cgen.opencl import (CLKernel, CLGlobal, CLRequiredWorkGroupSize, + CLLocal, CLImage) # {{{ assign names, dim storage lengths to prefetches @@ -1847,11 +1872,12 @@ def generate_code(kernel): mod = Module() + body = Block() + group_size = kernel.tag_type_lengths(TAG_WORK_ITEM_IDX) # {{{ examine arg list - has_double = False def restrict_ptr_if_not_nvidia(arg): from cgen import Pointer, RestrictPointer @@ -1861,6 +1887,9 @@ def generate_code(kernel): else: return RestrictPointer(arg) + has_double = False + has_image = False + args = [] for arg in kernel.args: if isinstance(arg, ArrayArg): @@ -1869,6 +1898,15 @@ def generate_code(kernel): if arg_decl.name in kernel.input_vectors(): arg_decl = Const(arg_decl) arg_decl = CLGlobal(arg_decl) + elif isinstance(arg, ImageArg): + if arg.name in kernel.input_vectors(): + mode = "r" + else: + mode = "w" + + arg_decl = CLImage(arg.dimensions, mode, arg.name) + + has_image = True else: arg_decl = Const(POD(arg.dtype, arg.name)) @@ -1882,6 +1920,11 @@ def generate_code(kernel): Line("#pragma OPENCL EXTENSION cl_khr_fp64: enable"), Line()]) + if has_image: + body.append(Initializer(Const(Value("sampler_t", "loopy_sampler")), + "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP " + "| CLK_FILTER_NEAREST")) + # }}} if kernel.preamble is not None: @@ -1905,8 +1948,6 @@ def generate_code(kernel): # }}} - body = Block() - # {{{ build lmem array declarators for prefetches for pf in kernel.prefetch.itervalues():