diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 3c85060dacf03b52f6e0b1faf05ad4697b6a5d07..1a883e26530dd820fba9b2411dec78ca26439ac9 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -547,7 +547,7 @@ Consider this example: >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... - for (int i_outer = 0; i_outer <= -1 + ((15 + n) / 16); ++i_outer) + for (int i_outer = 0; i_outer <= -1 + (15 + n) / 16; ++i_outer) for (int i_inner = 0; i_inner <= (-16 + n + -16 * i_outer >= 0 ? 15 : -1 + n + -16 * i_outer); ++i_inner) a[16 * i_outer + i_inner] = 0.0f; ... @@ -579,7 +579,7 @@ relation to loop nesting. For example, it's perfectly possible to request #define lid(N) ((int) get_local_id(N)) ... for (int i_inner = 0; i_inner <= (-17 + n >= 0 ? 15 : -1 + n); ++i_inner) - for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) + for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + (15 + n + 15 * i_inner) / 16; ++i_outer) a[16 * i_outer + i_inner] = 0.0f; ... @@ -603,8 +603,8 @@ commonly called 'loop tiling': >>> evt, (out,) = knl(queue, a=a_mat_dev) #define lid(N) ((int) get_local_id(N)) ... - for (int i_outer = 0; i_outer <= ((-16 + n) / 16); ++i_outer) - for (int j_outer = 0; j_outer <= ((-16 + n) / 16); ++j_outer) + for (int i_outer = 0; i_outer <= (-16 + n) / 16; ++i_outer) + for (int j_outer = 0; j_outer <= (-16 + n) / 16; ++j_outer) for (int i_inner = 0; i_inner <= 15; ++i_inner) for (int j_inner = 0; j_inner <= 15; ++j_inner) out[n * (16 * i_outer + i_inner) + 16 * j_outer + j_inner] = a[n * (16 * j_outer + j_inner) + 16 * i_outer + i_inner]; @@ -767,7 +767,7 @@ assumption: >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... - for (int i_outer = 0; i_outer <= -1 + ((3 + n) / 4); ++i_outer) + for (int i_outer = 0; i_outer <= -1 + (3 + n) / 4; ++i_outer) { a[4 * i_outer] = 0.0f; if (-2 + -4 * i_outer + n >= 0) @@ -797,7 +797,7 @@ enabling some cost savings: #define lid(N) ((int) get_local_id(N)) ... /* bulk slab for 'i_outer' */ - for (int i_outer = 0; i_outer <= -2 + ((3 + n) / 4); ++i_outer) + for (int i_outer = 0; i_outer <= -2 + (3 + n) / 4; ++i_outer) { a[4 * i_outer] = 0.0f; a[1 + 4 * i_outer] = 0.0f; @@ -806,7 +806,7 @@ enabling some cost savings: } /* final slab for 'i_outer' */ { - int const i_outer = -1 + n + -1 * (3 * n / 4); + int const i_outer = -1 + n + -1 * ((3 * n) / 4); if (-1 + n >= 0) { @@ -1297,7 +1297,7 @@ The kernel translates into two OpenCL kernels. int tmp; tmp = tmp_save_slot[16 * gid(0) + lid(0)]; - arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp; + arr[(1 + lid(0) + gid(0) * 16) % n] = tmp; } Now we can execute the kernel. diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 8ef921e447bf10d85ac60460f904d528ac64da19..fc8745b70cd8ddae09213bf487a705658b489f85 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -28,7 +28,8 @@ from six.moves import range import numpy as np from pymbolic.mapper import RecursiveMapper, IdentityMapper -from pymbolic.mapper.stringifier import (PREC_NONE, PREC_CALL, PREC_PRODUCT, +from pymbolic.mapper.stringifier import (PREC_NONE, PREC_CALL, + PREC_SUM, PREC_PRODUCT, PREC_POWER, PREC_SHIFT, PREC_UNARY, PREC_LOGICAL_OR, PREC_LOGICAL_AND, PREC_BITWISE_AND, PREC_BITWISE_OR) @@ -771,12 +772,22 @@ class CExpressionToCodeMapper(RecursiveMapper): enclosing_prec, PREC_CALL) def map_floor_div(self, expr, enclosing_prec): - # parenthesize to avoid negative signs being dragged in from the - # outside by associativity - return "(%s / %s)" % ( - self.rec(expr.numerator, PREC_PRODUCT), - # analogous to ^{-1} - self.rec(expr.denominator, PREC_POWER)) + num = self.rec(expr.numerator, PREC_PRODUCT) + + # analogous to ^{-1} + denom = self.rec(expr.denominator, PREC_POWER) + + # (-1) * ((-1)*x / 5) should not reassociate. + # Need to shield against surrounding products. + # (and divisions) + return self.parenthesize_if_needed( + "%s / %s" % ( + # Space is necessary--otherwise '/*' + # (i.e. divide-dereference) becomes + # start-of-comment in C. + num, + denom), + enclosing_prec, PREC_SUM) def map_min(self, expr, enclosing_prec): what = type(expr).__name__.lower() @@ -874,9 +885,14 @@ class CExpressionToCodeMapper(RecursiveMapper): def map_product(self, expr, enclosing_prec): # Spaces prevent '**z' (times dereference z), which # is hard to read. + # Use PREC_SUM to enforce correct order of operations + # in a series of products, quotients, and remainders. + # Without it the mapper leaves out parentheses + # and the series evaluates left to right. + return self.parenthesize_if_needed( - self.join_rec(" * ", expr.children, PREC_PRODUCT), - enclosing_prec, PREC_PRODUCT) + self.join_rec(" * ", expr.children, PREC_PRODUCT), + enclosing_prec, PREC_SUM) def map_quotient(self, expr, enclosing_prec): num = self.rec(expr.numerator, PREC_PRODUCT) @@ -884,20 +900,32 @@ class CExpressionToCodeMapper(RecursiveMapper): # analogous to ^{-1} denom = self.rec(expr.denominator, PREC_POWER) + # (-1) * ((-1)*x / 5) should not reassociate. + # Need to shield against surrounding products. + # (and divisions) return self.parenthesize_if_needed( "%s / %s" % ( # Space is necessary--otherwise '/*' - # (i.e. divide-dererference) becomes + # (i.e. divide-dereference) becomes # start-of-comment in C. num, denom), - enclosing_prec, PREC_PRODUCT) + enclosing_prec, PREC_SUM) def map_remainder(self, expr, enclosing_prec): - return "(%s %% %s)" % ( - self.rec(expr.numerator, PREC_PRODUCT), - # PREC_POWER analogous to ^{-1} - self.rec(expr.denominator, PREC_POWER)) + num = self.rec(expr.numerator, PREC_PRODUCT) + + # analogous to ^{-1} + denom = self.rec(expr.denominator, PREC_POWER) + + # (-1) * ((-1)*x % 5) should not reassociate. + # Need to shield against surrounding products. + # (and divisions) + return self.parenthesize_if_needed( + "%s %% %s" % ( + num, + denom), + enclosing_prec, PREC_SUM) def map_power(self, expr, enclosing_prec): return "pow(%s, %s)" % ( diff --git a/test/test_isl.py b/test/test_isl.py index bbd4a813ea12884fede753f428267038cc9b435a..12e7c767082005f34ef7112ce26f3d2c308fd390 100644 --- a/test/test_isl.py +++ b/test/test_isl.py @@ -48,7 +48,7 @@ def test_pw_aff_to_conditional_expr(): from loopy.symbolic import pw_aff_to_expr cond = isl.PwAff("[i] -> { [(0)] : i = 0; [(-1 + i)] : i > 0 }") expr = pw_aff_to_expr(cond) - assert str(expr) == "If(i == 0, 0, -1 + i)" + assert str(expr) == "0 if i == 0 else -1 + i" if __name__ == "__main__": diff --git a/test/test_loopy.py b/test/test_loopy.py index 231b70bf71d865e5b9832332c90f3228a0a26b82..74e2e34e1dad0d724ff52cfb60f1ce104ecda8ed 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -3003,6 +3003,25 @@ def test_shape_mismatch_check(ctx_factory): prg(queue, a=a, b=b) +def test_divide_precedence(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{:}", + """ + x[0] = c*(a/b) + y[0] = c*(a%b) + """, + [lp.ValueArg('a, b, c', np.int32), lp.GlobalArg('x, y', np.int32)]) + print(lp.generate_code_v2(knl).device_code()) + + evt, (x_out, y_out) = knl(queue, c=2, b=2, a=5) + evt.wait() + assert x_out.get() == 4 + assert y_out.get() == 2 + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])