From 957948faa4ba80165ca80c4d59b23b612ee3f2c8 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 16 Nov 2016 10:36:45 -0600 Subject: [PATCH 1/9] Improve generation of loop bounds for sequential loops by generating non-static loop bounds. This is intended to help the case when one loop's bounds depend on another loop, such as the domain { [i,j]: 0 <= i <= n and i <= j <= i } In this case we want the generated code to look like for (i = 1; i <= n; ++i) for (j = i; j <= i; ++j) ... rather than "for (j = 1; j <= n; ++j)" which are the static loop bounds for j. Additional changes that help realize this are as follows: * Enable code generation of piecewise affine loop bounds, so that piecewise affine constraints on the "j" loop may be expressed as conditional expressions. * Additionally, if "i" is marked local-parallel, we allow the "j" loop bounds to depend on "i" as long as there are no barriers inside the "j" loop. --- loopy/codegen/bounds.py | 15 +++--------- loopy/codegen/loop.py | 31 +++++++++++------------ loopy/isl_helpers.py | 10 ++++---- loopy/symbolic.py | 31 ++++++++++++++++++++--- loopy/target/c/__init__.py | 8 +++--- loopy/target/ispc.py | 7 +++--- loopy/target/python.py | 21 ++++++++++++---- test/test_isl.py | 7 ++++++ test/test_loopy.py | 50 +++++++++++++++++++++++++++++++++++++- 9 files changed, 128 insertions(+), 52 deletions(-) diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index fb254bd54..da4d133e4 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -62,18 +62,11 @@ def get_bounds_checks(domain, check_inames, implemented_domain, # {{{ on which inames may a conditional depend? 
def get_usable_inames_for_conditional(kernel, sched_index): - from loopy.schedule import EnterLoop, LeaveLoop + from loopy.schedule import find_active_inames_at, has_barrier_within from loopy.kernel.data import ParallelTag, LocalIndexTagBase, IlpBaseTag - result = set() - - for i, sched_item in enumerate(kernel.schedule): - if i >= sched_index: - break - if isinstance(sched_item, EnterLoop): - result.add(sched_item.iname) - elif isinstance(sched_item, LeaveLoop): - result.remove(sched_item.iname) + result = find_active_inames_at(kernel, sched_index) + crosses_barrier = has_barrier_within(kernel, sched_index) for iname in kernel.all_inames(): tag = kernel.iname_to_tag.get(iname) @@ -87,7 +80,7 @@ def get_usable_inames_for_conditional(kernel, sched_index): if ( isinstance(tag, ParallelTag) - and not isinstance(tag, LocalIndexTagBase) + and not (isinstance(tag, LocalIndexTagBase) and crosses_barrier) and not isinstance(tag, IlpBaseTag) ): result.add(iname) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 648c3fe6f..4d75533c5 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -386,38 +386,29 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname] - from loopy.isl_helpers import ( - static_min_of_pw_aff, - static_max_of_pw_aff) - lbound = ( kernel.cache_manager.dim_min( dom_and_slab, loop_iname_idx) .gist(kernel.assumptions) + .gist(dom_and_slab.params()) .coalesce()) ubound = ( kernel.cache_manager.dim_max( dom_and_slab, loop_iname_idx) .gist(kernel.assumptions) + .gist(dom_and_slab.params()) .coalesce()) - static_lbound = static_min_of_pw_aff( - lbound, - constants_only=False) - static_ubound = static_max_of_pw_aff( - ubound, - constants_only=False) - # }}} # {{{ find implemented slab, build inner code - from loopy.isl_helpers import make_slab_from_bound_pwaffs + from loopy.isl_helpers import make_loop_bound_from_pwaffs # impl_slab may be overapproximated - 
impl_slab = make_slab_from_bound_pwaffs( + impl_slab = make_loop_bound_from_pwaffs( dom_and_slab.space, - loop_iname, static_lbound, static_ubound) + loop_iname, lbound, ubound) for iname in moved_inames: dt, idx = impl_slab.get_var_dict()[iname] @@ -442,13 +433,19 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): astb = codegen_state.ast_builder - if (static_ubound - static_lbound).plain_is_zero(): + zero = isl.PwAff.zero_on_domain( + isl.LocalSpace.from_space( + lbound.get_space())) + + from loopy.symbolic import pw_aff_to_expr + + if (ubound - lbound).plain_is_equal(zero): # single-trip, generate just a variable assignment, not a loop result.append(merge_codegen_results(codegen_state, [ astb.emit_initializer( codegen_state, kernel.index_dtype, loop_iname, - ecm(aff_to_expr(static_lbound), PREC_NONE, "i"), + ecm(pw_aff_to_expr(lbound), PREC_NONE, "i"), is_const=True), astb.emit_blank_line(), inner, @@ -461,7 +458,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): codegen_state, astb.emit_sequential_loop( codegen_state, loop_iname, kernel.index_dtype, - static_lbound, static_ubound, inner_ast))) + pw_aff_to_expr(lbound), pw_aff_to_expr(ubound), inner_ast))) return merge_codegen_results(codegen_state, result) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index e657beecb..82441eb7f 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -102,7 +102,7 @@ def make_slab(space, iname, start, stop): return result -def make_slab_from_bound_pwaffs(space, iname, lbound, ubound): +def make_loop_bound_from_pwaffs(space, iname, lbound, ubound): dt, pos = space.get_var_dict()[iname] iname_pwaff = isl.PwAff.var_on_domain(space, dt, pos) @@ -111,10 +111,10 @@ def make_slab_from_bound_pwaffs(space, iname, lbound, ubound): assert iname_pwaff.space == lbound.space assert iname_pwaff.space == ubound.space - return convexify( - iname_pwaff.ge_set(lbound) - & - iname_pwaff.le_set(ubound)) + return ( + 
iname_pwaff.ge_set(lbound) + & + iname_pwaff.le_set(ubound)) # }}} diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 5b5b24776..74bb5c1d1 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1142,12 +1142,35 @@ def pw_aff_to_expr(pw_aff, int_ok=False): return pw_aff pieces = pw_aff.get_pieces() + last_expr = aff_to_expr(pieces[-1][1]) - if len(pieces) != 1: - raise NotImplementedError("pw_aff_to_expr for multi-piece PwAff instances") + # {{{ make exprs from set constraints - (set, aff), = pieces - return aff_to_expr(aff) + from pymbolic.primitives import LogicalAnd, LogicalOr + + def set_to_expr(isl_set): + constrs = [] + for isl_basicset in isl_set.get_basic_sets(): + constrs.append(basic_set_to_expr(isl_basicset)) + return LogicalOr(tuple(constrs)) + + def basic_set_to_expr(isl_basicset): + constrs = [] + for constr in isl_basicset.get_constraints(): + constrs.append(constraint_to_expr(constr)) + return LogicalAnd(tuple(constrs)) + + # }}} + + pairs = [(set_to_expr(constr_set), aff_to_expr(aff)) + for constr_set, aff in pieces[:-1]] + + from pymbolic.primitives import If + expr = last_expr + for condition, then_expr in reversed(pairs): + expr = If(condition, then_expr, expr) + + return expr # }}} diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 6cfd8f0c7..e28da7453 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -703,11 +703,9 @@ class CASTBuilder(ASTBuilderBase): CExpression(self.get_c_expression_to_code_mapper(), result)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - static_lbound, static_ubound, inner): + lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper - from loopy.symbolic import aff_to_expr - from pymbolic import var from pymbolic.primitives import Comparison from pymbolic.mapper.stringifier import PREC_NONE @@ -716,12 +714,12 @@ class CASTBuilder(ASTBuilderBase): return For( InlineInitializer( POD(self, iname_dtype, iname), - 
ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), + ecm(lbound, PREC_NONE, "i")), ecm( Comparison( var(iname), "<=", - aff_to_expr(static_ubound)), + ubound), PREC_NONE, "i"), "++%s" % iname, inner) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 4194d445f..f07d5f20a 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -469,10 +469,9 @@ class ISPCASTBuilder(CASTBuilder): return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - static_lbound, static_ubound, inner): + lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper - from loopy.symbolic import aff_to_expr from loopy.target.c import POD from pymbolic.mapper.stringifier import PREC_NONE @@ -483,9 +482,9 @@ class ISPCASTBuilder(CASTBuilder): return For( InlineInitializer( ISPCUniform(POD(self, iname_dtype, iname)), - ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), + ecm(lbound, PREC_NONE, "i")), ecm( - p.Comparison(var(iname), "<=", aff_to_expr(static_ubound)), + p.Comparison(var(iname), "<=", ubound), PREC_NONE, "i"), "++%s" % iname, inner) diff --git a/loopy/target/python.py b/loopy/target/python.py index 036e60ab1..a348cba83 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -129,6 +129,19 @@ class ExpressionToPythonMapper(StringifyMapper): def map_local_hw_index(self, expr, enclosing_prec): raise LoopyError("plain Python does not have local hw axes") + def map_if(self, expr, enclosing_prec): + # Synthesize PREC_IFTHENELSE, make sure it is in the right place in the + # operator precedence hierarchy (right above "or"). 
+ from pymbolic.mapper.stringifier import PREC_LOGICAL_OR, PREC_NONE + PREC_IFTHENELSE = PREC_LOGICAL_OR - 1 + + return self.parenthesize_if_needed( + "{then} if {cond} else {else_}".format( + then=self.rec(expr.then, PREC_IFTHENELSE), + cond=self.rec(expr.condition, PREC_IFTHENELSE), + else_=self.rec(expr.else_, PREC_IFTHENELSE)), + enclosing_prec, PREC_NONE) + # }}} @@ -223,11 +236,9 @@ class PythonASTBuilderBase(ASTBuilderBase): return Suite def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - static_lbound, static_ubound, inner): + lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper - from loopy.symbolic import aff_to_expr - from pymbolic.mapper.stringifier import PREC_NONE from genpy import For @@ -235,8 +246,8 @@ class PythonASTBuilderBase(ASTBuilderBase): (iname,), "range(%s, %s + 1)" % ( - ecm(aff_to_expr(static_lbound), PREC_NONE, "i"), - ecm(aff_to_expr(static_ubound), PREC_NONE, "i"), + ecm(lbound, PREC_NONE, "i"), + ecm(ubound, PREC_NONE, "i"), ), inner) diff --git a/test/test_isl.py b/test/test_isl.py index 3bd3d221e..f793b1fa9 100644 --- a/test/test_isl.py +++ b/test/test_isl.py @@ -44,6 +44,13 @@ def test_aff_to_expr_2(): assert aff_to_expr(x) == (-1)*i0 + 2*(i0 // 2) +def test_pw_aff_to_conditional_expr(): + from loopy.symbolic import pw_aff_to_expr + cond = isl.PwAff("[i] -> { [(0)] : i = 0; [(-1 + i)] : i > 0 }") + expr = pw_aff_to_expr(cond) + assert str(expr) == "If(i == 0, 0, -1 + i)" + + if __name__ == "__main__": import sys if len(sys.argv) > 1: diff --git a/test/test_loopy.py b/test/test_loopy.py index 347c08d0d..3d5894b4d 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1601,6 +1601,54 @@ def test_scalars_with_base_storage(ctx_factory): knl(queue, out_host=True) +def test_tight_loop_bounds(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + ["{ [i] : 0 <= i <= 5 }", + "[i] -> { [j] : 2 * i - 2 < j <= 2 * i and 0 <= j <= 9 }"], + """ + for i + for j + 
out[j] = j + end + end + """, + silenced_warnings="write_race(insn)") + + knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0") + + evt, (out,) = knl(queue, out_host=True) + + assert (out == np.arange(10)).all() + + +def test_tight_loop_bounds_codegen(): + knl = lp.make_kernel( + ["{ [i] : 0 <= i <= 5 }", + "[i] -> { [j] : 2 * i - 2 <= j <= 2 * i and 0 <= j <= 9 }"], + """ + for i + for j + out[j] = j + end + end + """, + silenced_warnings="write_race(insn)", + target=lp.OpenCLTarget()) + + knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0") + + cgr = lp.generate_code_v2(knl) + #print(cgr.device_code()) + + for_loop = \ + "for (int j = (lid(0) == 0 && gid(0) == 0 ? 0 : -2 + 10 * gid(0) + 2 * lid(0)); " \ + "j <= (lid(0) == 0 && -1 + gid(0) == 0 ? 9 : 2 * lid(0)); ++j)" + + assert for_loop in cgr.device_code() + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) @@ -1608,4 +1656,4 @@ if __name__ == "__main__": from py.test.cmdline import main main([__file__]) -# vim: foldmethod=marker \ No newline at end of file +# vim: foldmethod=marker -- GitLab From b80059c1d63f2acb82011c12b1eddba05ca67d5f Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 16 Nov 2016 11:46:19 -0600 Subject: [PATCH 2/9] Remove unused import. 
--- loopy/codegen/loop.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 4d75533c5..2339fe60f 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -429,8 +429,6 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): if cmt is not None: result.append(codegen_state.ast_builder.emit_comment(cmt)) - from loopy.symbolic import aff_to_expr - astb = codegen_state.ast_builder zero = isl.PwAff.zero_on_domain( -- GitLab From f8fcf65180d58ca61bd607009a918234eee867c9 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 16 Nov 2016 11:46:36 -0600 Subject: [PATCH 3/9] get_usable_inames_for_conditional(): Parallel inames are only defined within a subkernel. --- loopy/codegen/bounds.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index da4d133e4..61a825fb9 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -62,16 +62,33 @@ def get_bounds_checks(domain, check_inames, implemented_domain, # {{{ on which inames may a conditional depend? def get_usable_inames_for_conditional(kernel, sched_index): - from loopy.schedule import find_active_inames_at, has_barrier_within + from loopy.schedule import ( + find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within) from loopy.kernel.data import ParallelTag, LocalIndexTagBase, IlpBaseTag result = find_active_inames_at(kernel, sched_index) crosses_barrier = has_barrier_within(kernel, sched_index) - for iname in kernel.all_inames(): + # Find our containing subkernel, grab inames for all insns from there. 
+ + subkernel_index = sched_index + from loopy.schedule import CallKernel + + while not isinstance(kernel.schedule[subkernel_index], CallKernel): + subkernel_index -= 1 + + insn_ids_for_subkernel = get_insn_ids_for_block_at( + kernel.schedule, subkernel_index) + + inames_for_subkernel = ( + iname + for insn in insn_ids_for_subkernel + for iname in kernel.insn_inames(insn)) + + for iname in inames_for_subkernel: tag = kernel.iname_to_tag.get(iname) - # Parallel inames are always defined, BUT: + # Parallel inames are defined within a subkernel, BUT: # # - local indices may not be used in conditionals that cross barriers. # -- GitLab From 782df4280341a92999402b6f6fbc01c4e972c10e Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 16 Nov 2016 11:53:10 -0600 Subject: [PATCH 4/9] Fix doctest for non-static loop bounds. --- doc/tutorial.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index fa6fcc950..951a21426 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -532,9 +532,8 @@ Consider this example: #define lid(N) ((int) get_local_id(N)) ... for (int i_outer = 0; i_outer <= -1 + ((15 + n) / 16); ++i_outer) - for (int i_inner = 0; i_inner <= 15; ++i_inner) - if (-1 + -1 * i_inner + -16 * i_outer + n >= 0) - a[16 * i_outer + i_inner] = 0.0f; + for (int i_inner = 0; i_inner <= (-16 + n + -16 * i_outer >= 0 ? 15 : -1 + n + -16 * i_outer); ++i_inner) + a[16 * i_outer + i_inner] = 0.0f; ... By default, the new, split inames are named *OLD_outer* and *OLD_inner*, -- GitLab From 59a04afbf4ef7a4aa3bec66a3b84e46af4a40683 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 16 Nov 2016 11:55:38 -0600 Subject: [PATCH 5/9] Fix line length. 
--- test/test_loopy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 3d5894b4d..ae41779eb 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1644,7 +1644,8 @@ def test_tight_loop_bounds_codegen(): #print(cgr.device_code()) for_loop = \ - "for (int j = (lid(0) == 0 && gid(0) == 0 ? 0 : -2 + 10 * gid(0) + 2 * lid(0)); " \ + "for (int j = " \ + "(lid(0) == 0 && gid(0) == 0 ? 0 : -2 + 10 * gid(0) + 2 * lid(0)); " \ "j <= (lid(0) == 0 && -1 + gid(0) == 0 ? 9 : 2 * lid(0)); ++j)" assert for_loop in cgr.device_code() -- GitLab From c33f7b06f9c522f676bb3de1bb278d6e5f583acb Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 16 Nov 2016 12:06:34 -0600 Subject: [PATCH 6/9] Fix another doctest. --- doc/tutorial.rst | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 951a21426..e9e06769c 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -561,12 +561,11 @@ relation to loop nesting. For example, it's perfectly possible to request >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) - ... - for (int i_inner = 0; i_inner <= 15; ++i_inner) - if (-1 + -1 * i_inner + n >= 0) - for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) - a[16 * i_outer + i_inner] = 0.0f; - ... + ... + for (int i_inner = 0; i_inner <= (-17 + n >= 0 ? 15 : -1 + n); ++i_inner) + for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) + a[16 * i_outer + i_inner] = 0.0f; + ... Notice how loopy has automatically generated guard conditionals to make sure the bounds on the old iname are obeyed. -- GitLab From f93bdb9ef7809b2b576aef356ccecfaa0f796855 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 16 Nov 2016 12:09:00 -0600 Subject: [PATCH 7/9] Fix spacing. 
--- doc/tutorial.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index e9e06769c..a5a73c8d3 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -561,11 +561,11 @@ relation to loop nesting. For example, it's perfectly possible to request >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) - ... - for (int i_inner = 0; i_inner <= (-17 + n >= 0 ? 15 : -1 + n); ++i_inner) - for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) - a[16 * i_outer + i_inner] = 0.0f; - ... + ... + for (int i_inner = 0; i_inner <= (-17 + n >= 0 ? 15 : -1 + n); ++i_inner) + for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) + a[16 * i_outer + i_inner] = 0.0f; + ... Notice how loopy has automatically generated guard conditionals to make sure the bounds on the old iname are obeyed. -- GitLab From 37be4710f1864ee42607ed7959714f3d3538dc4e Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 16 Nov 2016 12:39:32 -0600 Subject: [PATCH 8/9] Fix domain for zero. --- loopy/codegen/loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 2339fe60f..d0d68edec 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -433,7 +433,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): zero = isl.PwAff.zero_on_domain( isl.LocalSpace.from_space( - lbound.get_space())) + lbound.get_space()).domain()) from loopy.symbolic import pw_aff_to_expr -- GitLab From b968d0209072c39110fd58bd1288f6e78f2c16b7 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 16 Nov 2016 13:05:34 -0600 Subject: [PATCH 9/9] Fix another tutorial doctest changed by non-static loop bounds. 
--- doc/tutorial.rst | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index a5a73c8d3..6c3175dc0 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -789,17 +789,18 @@ enabling some cost savings: a[4 * i_outer + 3] = 0.0f; } /* final slab for 'i_outer' */ - for (int i_outer = -1 + n + -1 * (3 * n / 4); i_outer <= -1 + ((3 + n) / 4); ++i_outer) - if (-1 + n >= 0) - { - a[4 * i_outer] = 0.0f; - if (-2 + -4 * i_outer + n >= 0) - a[4 * i_outer + 1] = 0.0f; - if (-3 + -4 * i_outer + n >= 0) - a[4 * i_outer + 2] = 0.0f; - if (4 + 4 * i_outer + -1 * n == 0) - a[4 * i_outer + 3] = 0.0f; - } + int const i_outer = -1 + n + -1 * (3 * n / 4); + + if (-1 + n >= 0) + { + a[4 * i_outer] = 0.0f; + if (-2 + -4 * i_outer + n >= 0) + a[4 * i_outer + 1] = 0.0f; + if (-3 + -4 * i_outer + n >= 0) + a[4 * i_outer + 2] = 0.0f; + if (4 + 4 * i_outer + -1 * n == 0) + a[4 * i_outer + 3] = 0.0f; + } ... .. }}} -- GitLab