diff --git a/MEMO b/MEMO index ec9b4ee1766846b4f8be94303d97b18a6f344c22..ad6cd94d588232cfc5603ce16b2a68adaf5b38b8 100644 --- a/MEMO +++ b/MEMO @@ -44,12 +44,17 @@ To-do - What if no universally valid precompute base index expression is found? (test_intel_matrix_mul with n = 6*16, e.g.?) -- When duplicating, use iname aliases to relieve burden on isl +- When duplicating inames, use iname aliases to relieve burden on isl - Differentiate ilp.unr from ilp.seq - Expose iname-duplicate-and-rename as a primitive. +- String instructions? + +- Making parameters run-time varying, substituting values that + depend on other inames? + - Fix all tests Future ideas @@ -95,6 +100,8 @@ Future ideas Dealt with ^^^^^^^^^^ +- Allow complex-valued arithmetic, despite CL's best efforts. + - "No schedule found" debug help: - Find longest dead-end diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 62977a48485e51c881cb087c06757577956e4461..20132276edece9822ecf1fc2d94dedc4691b4b7f 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -195,8 +195,7 @@ def generate_code(kernel, with_annotation=False): from cgen.opencl import (CLKernel, CLGlobal, CLRequiredWorkGroupSize, CLLocal, CLImage, CLConstant) - from loopy.symbolic import LoopyCCodeMapper, pw_aff_to_expr - + from loopy.codegen.expression import LoopyCCodeMapper ccm = (LoopyCCodeMapper(kernel, with_annotation=with_annotation) .copy_and_assign_many(make_initial_assignments(kernel))) @@ -301,24 +300,6 @@ def generate_code(kernel, with_annotation=False): from loopy.codegen.loop import set_up_hw_parallel_loops gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state) - gen_code_str = str(gen_code) - - if "int_floor_div" in gen_code_str: - mod.extend(""" - #define int_floor_div(a,b) \ - (( (a) - \ - ( ( (a)<0 ) != ( (b)<0 )) \ - *( (b) + ( (b)<0 ) - ( (b)>=0 ) )) \ - / (b) ) - """) - - if "int_floor_div_pos_b" in gen_code_str: - mod.extend(""" - #define int_floor_div_pos_b(a,b) ( \ - ( (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b) \ - ) - """) - body.append(Line()) if isinstance(gen_code.ast, Block): @@ -326,6 +307,7 @@ def generate_code(kernel, with_annotation=False): else: body.append(gen_code.ast) + from loopy.symbolic import pw_aff_to_expr mod.append( FunctionBody( CLRequiredWorkGroupSize( @@ -334,7 +316,24 @@ def generate_code(kernel, with_annotation=False): Value("void", kernel.name), args))), body)) - # }}} + gen_code_str = str(gen_code) + + from cgen import LiteralLines + if "int_floor_div" in gen_code_str: + mod.extend(LiteralLines(""" + #define int_floor_div(a,b) \ + (( (a) - \ + ( ( (a)<0 ) != ( (b)<0 )) \ + *( (b) + ( (b)<0 ) - ( (b)>=0 ) )) \ + / (b) ) + """)) + + if "int_floor_div_pos_b" in gen_code_str: + mod.extend(LiteralLines(""" + #define int_floor_div_pos_b(a,b) ( \ + ( (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b) \ + ) + """)) from loopy.check import check_implemented_domains assert check_implemented_domains(kernel, gen_code.implemented_domains) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 07bb62c3c7ce1967f7bfb3c00186ca9abd5b68e9..9ed8da938ba9ded5204d4fb9c10a9799e37cc3d6 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -157,7 +157,7 @@ def build_loop_nest(kernel, sched_index, codegen_state): # {{{ pass 3: greedily group schedule items that share admissible inames def build_insn_group(sched_indices_and_cond_inames, codegen_state, done_group_lengths=set()): - # min_iname_count serves to prevent infinite recursion by imposing a + # done_group_lengths serves to prevent infinite recursion by imposing a # bigger and bigger minimum size on the group of shared inames found. if not sched_indices_and_cond_inames: @@ -226,7 +226,7 @@ def build_loop_nest(kernel, sched_index, codegen_state): # group only contains starting schedule item result = [generate_code_for_sched_index(kernel, sched_index, new_codegen_state)] else: - # recurse with a bigger minimum iname count + # recurse with a bigger done_group_lengths result = build_insn_group( sched_indices_and_cond_inames[0:group_length], new_codegen_state, diff --git a/loopy/codegen/expression.py b/loopy/codegen/expression.py new file mode 100644 index 0000000000000000000000000000000000000000..fc9e8a3b7ab3c3423d682e85602910c6e077623f --- /dev/null +++ b/loopy/codegen/expression.py @@ -0,0 +1,332 @@ +from __future__ import division + +import numpy as np + +from pymbolic.mapper.c_code import CCodeMapper as CCodeMapper +from pymbolic.mapper.stringifier import PREC_NONE +from pymbolic.mapper import CombineMapper + +# {{{ type inference + +class TypeInferenceMapper(CombineMapper): + def __init__(self, kernel): + self.kernel = kernel + + def combine(self, dtypes): + dtypes = list(dtypes) + + result = dtypes.pop() + while dtypes: + other = dtypes.pop() + + if result.isbuiltin and other.isbuiltin: + result = (np.empty(0, dtype=result) + np.empty(0, dtype=other)).dtype + elif result.isbuiltin and not other.isbuiltin: + # assume the non-natiev type takes over + result = other + elif not result.isbuiltin and other.isbuiltin: + # assume the non-natiev type takes over + pass + else: + if not result is other: + raise TypeError("nothing known about result of operation on " + "'%s' and '%s'" % (result, other)) + + return result + + def map_constant(self, expr): + return np.asarray(expr).dtype + + def map_subscript(self, expr): + return self.rec(expr.aggregate) + + def map_variable(self, expr): + try: + return self.kernel.arg_dict[expr.name].dtype + except KeyError: + pass + + try: + return self.kernel.temporary_variables[expr.name].dtype + except KeyError: + pass + + if expr.name in self.kernel.all_inames(): + return np.dtype(np.int16) # don't force single-precision upcast + + raise RuntimeError("type inference: nothing known about '%s'" % expr.name) + +# }}} + +# {{{ C code mapper + +class LoopyCCodeMapper(CCodeMapper): + def __init__(self, kernel, cse_name_list=[], var_subst_map={}, + with_annotation=False, ): + def constant_mapper(c): + if isinstance(c, float): + # FIXME: type-variable + return "%sf" % repr(c) + else: + return repr(c) + + CCodeMapper.__init__(self, constant_mapper=constant_mapper, + cse_name_list=cse_name_list) + self.kernel = kernel + self.infer_type = TypeInferenceMapper(kernel) + + self.with_annotation = with_annotation + self.var_subst_map = var_subst_map.copy() + + def copy(self, var_subst_map=None, cse_name_list=None): + if var_subst_map is None: + var_subst_map = self.var_subst_map + if cse_name_list is None: + cse_name_list = self.cse_name_list + return LoopyCCodeMapper(self.kernel, + cse_name_list=cse_name_list, var_subst_map=var_subst_map, + with_annotation=self.with_annotation) + + def copy_and_assign(self, name, value): + """Make a copy of self with variable *name* fixed to *value*.""" + var_subst_map = self.var_subst_map.copy() + var_subst_map[name] = value + return self.copy(var_subst_map=var_subst_map) + + def copy_and_assign_many(self, assignments): + """Make a copy of self with *assignments* included.""" + + var_subst_map = self.var_subst_map.copy() + var_subst_map.update(assignments) + return self.copy(var_subst_map=var_subst_map) + + def map_common_subexpression(self, expr, prec): + raise RuntimeError("common subexpressions are not allowed in loopy") + + def map_variable(self, expr, prec): + if expr.name in self.var_subst_map: + if self.with_annotation: + return " /* %s */ %s" % ( + expr.name, + self.rec(self.var_subst_map[expr.name], prec)) + else: + return str(self.rec(self.var_subst_map[expr.name], prec)) + else: + return CCodeMapper.map_variable(self, expr, prec) + + def map_subscript(self, expr, enclosing_prec): + from pymbolic.primitives import Variable + if not isinstance(expr.aggregate, Variable): + return CCodeMapper.map_subscript(self, expr, enclosing_prec) + + if expr.aggregate.name in self.kernel.arg_dict: + arg = self.kernel.arg_dict[expr.aggregate.name] + + from loopy.kernel import ImageArg + if isinstance(arg, ImageArg): + assert isinstance(expr.index, tuple) + + base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))" + % (arg.name, arg.dimensions, + ", ".join(self.rec(idx, PREC_NONE) + for idx in expr.index[::-1]))) + + if arg.dtype == np.float32: + return base_access+".x" + elif arg.dtype == np.float64: + return "as_double(%s.xy)" % base_access + else: + raise NotImplementedError( + "non-floating-point images not supported for now") + + else: + # ArrayArg + index_expr = expr.index + if isinstance(expr.index, tuple): + ary_strides = arg.strides + if ary_strides is None: + raise RuntimeError("tuple-indexed variable '%s' does not " + "have stride information" % expr.aggregate.name) + else: + ary_strides = (1,) + index_expr = (index_expr,) + + from pymbolic.primitives import Subscript + return CCodeMapper.map_subscript(self, + Subscript(expr.aggregate, arg.offset+sum( + stride*expr_i for stride, expr_i in zip( + ary_strides, index_expr))), enclosing_prec) + + + elif expr.aggregate.name in self.kernel.temporary_variables: + temp_var = self.kernel.temporary_variables[expr.aggregate.name] + if isinstance(expr.index, tuple): + index = expr.index + else: + index = (expr.index,) + + return (temp_var.name + "".join("[%s]" % self.rec(idx, PREC_NONE) + for idx in index)) + + else: + raise RuntimeError("nothing known about variable '%s'" % expr.aggregate.name) + + def map_floor_div(self, expr, prec): + from loopy.isl_helpers import is_nonnegative + num_nonneg = is_nonnegative(expr.numerator, self.kernel.domain) + den_nonneg = is_nonnegative(expr.denominator, self.kernel.domain) + + if den_nonneg: + if num_nonneg: + return CCodeMapper.map_floor_div(self, expr, prec) + else: + return ("int_floor_div_pos_b(%s, %s)" + % (self.rec(expr.numerator, PREC_NONE), + expr.denominator)) + else: + return ("int_floor_div(%s, %s)" + % (self.rec(expr.numerator, PREC_NONE), + self.rec(expr.denominator, PREC_NONE))) + + def map_min(self, expr, prec): + what = type(expr).__name__.lower() + + children = expr.children[:] + + result = self.rec(children.pop(), PREC_NONE) + while children: + result = "%s(%s, %s)" % (what, + self.rec(children.pop(), PREC_NONE), + result) + + return result + + map_max = map_min + + # {{{ deal with complex-valued variables + + def complex_type_name(self, dtype): + if dtype == np.complex64: + return "cfloat" + if dtype == np.complex128: + return "cdouble" + else: + raise RuntimeError + + def map_sum(self, expr, enclosing_prec): + tgt_dtype = self.infer_type(expr) + is_complex = tgt_dtype.kind == 'c' + + if not is_complex: + return CCodeMapper.map_sum(self, expr, enclosing_prec) + else: + tgt_name = self.complex_type_name(tgt_dtype) + + reals = [child for child in expr.children + if 'c' != self.infer_type(child).kind] + complexes = [child for child in expr.children + if 'c' == self.infer_type(child).kind] + + from pymbolic.mapper.stringifier import PREC_SUM + real_sum = self.join_rec(" + ", reals, PREC_SUM) + complex_sum = self.join_rec(" + ", complexes, PREC_SUM) + + if real_sum: + result = "%s_fromreal(%s) + %s" % (tgt_name, real_sum, complex_sum) + else: + result = complex_sum + + return self.parenthesize_if_needed(result, enclosing_prec, PREC_SUM) + + def map_product(self, expr, enclosing_prec): + tgt_dtype = self.infer_type(expr) + is_complex = 'c' == tgt_dtype.kind + + if not is_complex: + return CCodeMapper.map_product(self, expr, enclosing_prec) + else: + tgt_name = self.complex_type_name(tgt_dtype) + + reals = [child for child in expr.children + if 'c' != self.infer_type(child).kind] + complexes = [child for child in expr.children + if 'c' == self.infer_type(child).kind] + + from pymbolic.mapper.stringifier import PREC_PRODUCT + real_prd = self.join_rec("*", reals, PREC_PRODUCT) + + complex_prd = self.rec(complexes[0], PREC_NONE) + for child in complexes[1:]: + complex_prd = "%s_mul(%s, %s)" % ( + tgt_name, complex_prd, + self.rec(child, PREC_NONE)) + + if real_prd: + # elementwise semantics are correct + result = "%s * %s" % (real_prd, complex_prd) + else: + result = complex_prd + + return self.parenthesize_if_needed(result, enclosing_prec, PREC_PRODUCT) + + def map_quotient(self, expr, enclosing_prec): + n_complex = 'c' == self.infer_type(expr.numerator).kind + d_complex = 'c' == self.infer_type(expr.denominator).kind + + tgt_dtype = self.infer_type(expr) + + if not (n_complex or d_complex): + return CCodeMapper.map_quotient(self, expr, enclosing_prec) + elif n_complex and not d_complex: + # elementwise semnatics are correct + return CCodeMapper.map_quotient(self, expr, enclosing_prec) + elif not n_complex and d_complex: + return "%s_rdivide(%s, %s)" % ( + self.complex_type_name(tgt_dtype), + self.rec(expr.numerator, PREC_NONE), + self.rec(expr.denominator, PREC_NONE)) + else: + return "%s_divide(%s, %s)" % ( + self.complex_type_name(tgt_dtype), + self.rec(expr.numerator, PREC_NONE), + self.rec(expr.denominator, PREC_NONE)) + + def map_remainder(self, expr, enclosing_prec): + tgt_dtype = self.infer_type(expr) + if 'c' == tgt_dtype.kind: + raise RuntimeError("complex remainder not defined") + + return CCodeMapper.map_remainder(self, expr, enclosing_prec) + + def map_power(self, expr, enclosing_prec): + from pymbolic.mapper.stringifier import PREC_NONE + + tgt_dtype = self.infer_type(expr) + if 'c' == tgt_dtype.kind: + if expr.exponent in [2, 3, 4]: + value = expr.base + for i in range(expr.exponent-1): + value = value * expr.base + return self.rec(value, enclosing_prec) + else: + b_complex = 'c' == self.infer_type(expr.base).kind + e_complex = 'c' == self.infer_type(expr.exponent).kind + + if b_complex and not e_complex: + return "%s_powr(%s, %s)" % ( + self.complex_type_name(tgt_dtype), + self.rec(expr.base, PREC_NONE), + self.rec(expr.exponent, PREC_NONE)) + else: + return "%s_pow(%s, %s)" % ( + self.complex_type_name(tgt_dtype), + self.rec(expr.base, PREC_NONE), + self.rec(expr.exponent, PREC_NONE)) + + return CCodeMapper.map_power(self, expr, enclosing_prec) + + # }}} + +# }}} + +# vim: fdm=marker diff --git a/loopy/compiled.py b/loopy/compiled.py index 07bfab181954cdda0042ea5578fd216c1a530525..164e7db09c0b28799fe4f574a4e5d4fa6b0be3f2 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -134,6 +134,19 @@ def drive_timing_run(kernel_generator, queue, launch, flop_count=None, # {{{ automatic testing +def fill_rand(ary): + from pyopencl.clrandom import fill_rand + if ary.dtype.kind == "c": + real_dtype = ary.dtype.type(0).real.dtype + real_ary = ary.view(real_dtype) + + fill_rand(real_ary, luxury=0) + else: + fill_rand(ary, luxury=0) + + + + def make_ref_args(kernel, queue, parameters, fill_value): from loopy.kernel import ScalarArg, ArrayArg, ImageArg @@ -185,8 +198,7 @@ def make_ref_args(kernel, queue, parameters, output_arrays.append(ary) result.append(ary.data) else: - from pyopencl.clrandom import fill_rand - fill_rand(ary, luxury=2) + fill_rand(ary) input_arrays.append(ary) if isinstance(arg, ImageArg): result.append(cl.image_from_array(queue.context, ary.get(), 1)) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 44650bac88f76aae8a5908755c76ace3e3749bbd..66f946efe1486f40e97946096e3d5808d514e275 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -12,8 +12,6 @@ from pymbolic.mapper import ( WalkMapper as WalkMapperBase, CallbackMapper as CallbackMapperBase, ) -from pymbolic.mapper.c_code import CCodeMapper -from pymbolic.mapper.stringifier import PREC_NONE from pymbolic.mapper.substitutor import \ SubstitutionMapper as SubstitutionMapperBase from pymbolic.mapper.stringifier import \ @@ -23,7 +21,6 @@ from pymbolic.mapper.dependency import \ from pymbolic.mapper.unifier import UnidirectionalUnifier \ as UnidirectionalUnifierBase -import numpy as np import islpy as isl from islpy import dim_type @@ -293,149 +290,6 @@ class ArrayAccessFinder(CombineMapper): # }}} -# {{{ C code mapper - -class LoopyCCodeMapper(CCodeMapper): - def __init__(self, kernel, cse_name_list=[], var_subst_map={}, - with_annotation=False): - def constant_mapper(c): - if isinstance(c, float): - # FIXME: type-variable - return "%sf" % repr(c) - else: - return repr(c) - - CCodeMapper.__init__(self, constant_mapper=constant_mapper, - cse_name_list=cse_name_list) - self.kernel = kernel - - self.with_annotation = with_annotation - self.var_subst_map = var_subst_map.copy() - - def copy(self, var_subst_map=None, cse_name_list=None): - if var_subst_map is None: - var_subst_map = self.var_subst_map - if cse_name_list is None: - cse_name_list = self.cse_name_list - return LoopyCCodeMapper(self.kernel, - cse_name_list=cse_name_list, var_subst_map=var_subst_map, - with_annotation=self.with_annotation) - - def copy_and_assign(self, name, value): - var_subst_map = self.var_subst_map.copy() - var_subst_map[name] = value - return self.copy(var_subst_map=var_subst_map) - - def copy_and_assign_many(self, assignments): - var_subst_map = self.var_subst_map.copy() - var_subst_map.update(assignments) - return self.copy(var_subst_map=var_subst_map) - - def map_common_subexpression(self, expr, prec): - raise RuntimeError("common subexpressions are not allowed in loopy") - - def map_variable(self, expr, prec): - if expr.name in self.var_subst_map: - if self.with_annotation: - return " /* %s */ %s" % ( - expr.name, - self.rec(self.var_subst_map[expr.name], prec)) - else: - return str(self.rec(self.var_subst_map[expr.name], prec)) - else: - return CCodeMapper.map_variable(self, expr, prec) - - def map_subscript(self, expr, enclosing_prec): - from pymbolic.primitives import Variable - if not isinstance(expr.aggregate, Variable): - return CCodeMapper.map_subscript(self, expr, enclosing_prec) - - if expr.aggregate.name in self.kernel.arg_dict: - arg = self.kernel.arg_dict[expr.aggregate.name] - - from loopy.kernel import ImageArg - if isinstance(arg, ImageArg): - assert isinstance(expr.index, tuple) - - base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))" - % (arg.name, arg.dimensions, - ", ".join(self.rec(idx, PREC_NONE) - for idx in expr.index[::-1]))) - - if arg.dtype == np.float32: - return base_access+".x" - elif arg.dtype == np.float64: - return "as_double(%s.xy)" % base_access - else: - raise NotImplementedError( - "non-floating-point images not supported for now") - - else: - # ArrayArg - index_expr = expr.index - if isinstance(expr.index, tuple): - ary_strides = arg.strides - if ary_strides is None: - raise RuntimeError("tuple-indexed variable '%s' does not " - "have stride information" % expr.aggregate.name) - else: - ary_strides = (1,) - index_expr = (index_expr,) - - from pymbolic.primitives import Subscript - return CCodeMapper.map_subscript(self, - Subscript(expr.aggregate, arg.offset+sum( - stride*expr_i for stride, expr_i in zip( - ary_strides, index_expr))), enclosing_prec) - - - elif expr.aggregate.name in self.kernel.temporary_variables: - temp_var = self.kernel.temporary_variables[expr.aggregate.name] - if isinstance(expr.index, tuple): - index = expr.index - else: - index = (expr.index,) - - return (temp_var.name + "".join("[%s]" % self.rec(idx, PREC_NONE) - for idx in index)) - - else: - raise RuntimeError("nothing known about variable '%s'" % expr.aggregate.name) - - def map_floor_div(self, expr, prec): - from loopy.isl_helpers import is_nonnegative - num_nonneg = is_nonnegative(expr.numerator, self.kernel.domain) - den_nonneg = is_nonnegative(expr.denominator, self.kernel.domain) - - if den_nonneg: - if num_nonneg: - return CCodeMapper.map_floor_div(self, expr, prec) - else: - return ("int_floor_div_pos_b(%s, %s)" - % (self.rec(expr.numerator, PREC_NONE), - expr.denominator)) - else: - return ("int_floor_div(%s, %s)" - % (self.rec(expr.numerator, PREC_NONE), - self.rec(expr.denominator, PREC_NONE))) - - def map_min(self, expr, prec): - what = type(expr).__name__.lower() - - children = expr.children[:] - - result = self.rec(children.pop(), PREC_NONE) - while children: - result = "%s(%s, %s)" % (what, - self.rec(children.pop(), PREC_NONE), - result) - - return result - - map_max = map_min - -# }}} - # {{{ aff <-> expr conversion def aff_to_expr(aff, except_name=None, error_on_name=None): diff --git a/test/test_linalg.py b/test/test_linalg.py index 96cbb869c38cfb6291516e0b9d55465d14186141..44b28490d021440c9620dfa5040d44f61c11e7c1 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -99,6 +99,7 @@ def test_axpy(ctx_factory): vec = cl_array.vec for dtype, check, a, b in [ + (np.complex64, None, 5, 7), (vec.float4, check_float4, vec.make_float4(1, 2, 3, 4), vec.make_float4(6, 7, 8, 9)), (np.float32, None, 5, 7), @@ -116,7 +117,10 @@ def test_axpy(ctx_factory): lp.ArrayArg("z", dtype, shape="n,"), lp.ScalarArg("n", np.int32, approximately=n), ], - name="axpy", assumptions="n>=1") + name="axpy", assumptions="n>=1", + preamble=""" + #include <pyopencl-complex.h> + """) seq_knl = knl