From 215912efda44f4a71a62d8ceeb035c3db4926c8c Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 8 Jul 2011 00:47:55 -0400
Subject: [PATCH] Uneven loop splits work, plus other improvements.

---
 .gitignore             |   2 +
 examples/matrix-ops.py |   9 +-
 loopy/__init__.py      | 420 +++++++++++++++++++++++++++--------------
 3 files changed, 288 insertions(+), 143 deletions(-)

diff --git a/.gitignore b/.gitignore
index d6e3e906b..0ed3a9bc0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,5 @@ setuptools*egg
 setuptools.pth
 distribute*egg
 distribute*tar.gz
+*.log
+*profiler.conf
diff --git a/examples/matrix-ops.py b/examples/matrix-ops.py
index 86ec3532d..8338d421f 100644
--- a/examples/matrix-ops.py
+++ b/examples/matrix-ops.py
@@ -67,8 +67,7 @@ def plain_matrix_mul(ctx_factory=cl.create_some_context):
 
 
 def fancy_matrix_mul(ctx_factory=cl.create_some_context):
-    dtype = np.float64
-    #ctx = cl.create_some_context()
+    dtype = np.float32
     ctx = cl.create_some_context(answers=[1])
     queue = cl.CommandQueue(ctx,
             properties=cl.command_queue_properties.PROFILING_ENABLE)
@@ -92,9 +91,9 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context):
             lp.ScalarArg("n", np.uint32, approximately=1000),
         ], name="fancy_matmul")
 
-    knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1", is_even_split=True)
-    knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0", is_even_split=True)
-    knl = lp.split_dimension(knl, "k", 16, is_even_split=True)
+    knl = lp.split_dimension(knl, "i", 13, outer_tag="g.0", inner_tag="l.1", is_even_split=False)
+    knl = lp.split_dimension(knl, "j", 17, outer_tag="g.1", inner_tag="l.0", is_even_split=False)
+    knl = lp.split_dimension(knl, "k", 19, is_even_split=False)
     knl = lp.add_prefetch_dims(knl, 'a', ["i_inner", "k_inner"])
     knl = lp.add_prefetch_dims(knl, 'b', ["k_inner", "j_inner"])
     assert knl.get_invalid_reason() is None
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 3f4975be3..a37782913 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -14,6 +14,7 @@ import pyopencl as cl
 
 # TODO: Multi-D array access
 # TODO: Non-multiple loop splits
+#       FIXME: Splitting an uneven-split loop?
 # TODO: nD Texture access
 # TODO: Functions
 # TODO: Common subexpressions
@@ -31,9 +32,28 @@ import pyopencl as cl
 
 
 
+NEG_RELATION = {
+        "==" : "!=",
+        "!=" : "==",
+        "<" : ">+",
+        "<=" : ">",
+        ">" : "<=",
+        ">=" : "<",
+        }
 
 
 
+def generate_condition_code(ccm, condition, negate=False, expr_map=None):
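+    """Return C source for *condition*, a (value, comparison_op, other_value)
+    triple, optionally negating the comparison and mapping both sides through
+    *expr_map* before they are rendered by *ccm*."""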
+    a, rel, b = condition
+
+    if negate:
+        rel = NEG_RELATION[rel]
+
+    if expr_map is not None:
+        a = expr_map(a)
+        b = expr_map(b)
+
+    return "%s %s %s" % (ccm(a, PREC_NONE), rel, ccm(b, PREC_NONE),)
 
 
 
@@ -92,10 +112,31 @@ def parse_tag(tag):
 # {{{ loop dim, loop domain, kernel
 
 class LoopDimension(Record):
-    __slots__ = ["name", "length", "tag"]
+    __slots__ = ["name", "length", "last_cond",  "tag", 
+            "end_cond", "end_cond_if_last_of"]
+
+    def __init__(self, name, length=None, last_cond=None, end_cond=None, tag=None, 
+            end_cond_if_last_of=set()):
+        """
+        One of two end conditions governs a loop, its *length* or its *last_cond*:
+
+        :arg length: The number of iterations of this loop, if known.
+        :arg last_cond: If not None, generate separate code for the 'last iteration'
+            of this loop, i.e. the iteration for which *last_cond* holds.
 
-    def __init__(self, name, length, tag=None):
-        Record.__init__(self, name=name, length=length, tag=tag)
+        :arg end_cond: A condition indicating whether the loop has ended.
+            This is not used for loop termination, but to check in nested
+            blocks whether actions relating to this loop should be performed.
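+
+        :arg end_cond_if_last_of: A set of dimension names; *end_cond* is only
+            checked in code paths known to be the 'last iteration' of every
+            dimension in this set.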
+
+        Any 'condition' above is a (value, comparison_op, other_value) triple.
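+        For example, ``((i_outer + 1)*16, ">=", n)`` holds exactly for the last
+        value of ``i_outer`` when a loop of length *n* is split with an inner
+        length of 16.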
+
+        All arguments except *name* should be passed by keyword.
+        """
+
+        # FIXME: Not sure what combinations of end conditions make sense
+
+        Record.__init__(self, name=name, length=length, last_cond=last_cond,
+                end_cond=end_cond, tag=tag, end_cond_if_last_of=end_cond_if_last_of)
 
         if tag is not None:
             assert isinstance(tag, IndexTag)
@@ -103,11 +144,8 @@ class LoopDimension(Record):
     def __hash__(self):
         return hash(self.name)
 
-    def __repr__(self):
-        if self.tag is not None:
-            return "LD(%r, %d, %s)" % (self.name, self.length, self.tag)
-        else:
-            return "LD(%r, %d)" % (self.name, self.length)
+
+
 
 
 class LoopDomain(Record):
@@ -144,7 +182,7 @@ class LoopDomain(Record):
         return [dim for dim in self.dims
                 if isinstance(dim.tag, tag_type)]
 
-    def ordered_dim_by_tag_type(self, tag_type):
+    def ordered_dims_by_tag_type(self, tag_type):
         result = []
         from itertools import count
         for i in count():
@@ -155,8 +193,6 @@ class LoopDomain(Record):
             else:
                 result.append(dim)
 
-        return result
-
     def dims_by_tag(self, tag):
         return [dim for dim in self.dims if dim.tag == tag]
 
@@ -302,11 +338,11 @@ class LoopKernel(LoopDomain):
         return [dim for dim in self.dims if dim.name not in self.output_indices()]
 
     def group_dims(self):
-        dims = self.ordered_dim_by_tag_type(GROUP_IDX_TAG)
+        dims = self.ordered_dims_by_tag_type(GROUP_IDX_TAG)
         return tuple(dim.length for dim in dims)
 
     def local_dims(self):
-        dims = self.ordered_dim_by_tag_type(WORK_ITEM_IDX_TAG)
+        dims = self.ordered_dims_by_tag_type(WORK_ITEM_IDX_TAG)
         return tuple(dim.length for dim in dims)
 
     def group_size(self):
@@ -425,6 +461,9 @@ class LoopKernel(LoopDomain):
                 raise RuntimeError("repeated tag: %s" % d.tag)
 
         dim = self.dims[idx]
+        if dim.end_cond is not None or dim.last_cond is not None:
+            raise NotImplementedError("don't yet know how to split "
+                    "last_cond or end_cond loops")
 
         if dim.tag:
             raise ValueError("cannot split already-tagged dimension")
@@ -432,33 +471,50 @@ class LoopKernel(LoopDomain):
         if new_tags and dim.name not in self.output_indices():
             raise NotImplementedError("cannot yet tag a non-output dimension")
 
+        if is_even_split is not False and dim.length % inner_length == 0:
+            is_even_split = True
+
         if outer_name is None:
             outer_name = dim.name+"_outer"
         if inner_name is None:
             inner_name = dim.name+"_inner"
-
-        if is_even_split != False and dim.length % inner_length == 0:
-            is_even_split = True
-
-        assert is_even_split
-
         from pymbolic import var
-        tgt_expr = var(inner_name) + var(outer_name)*inner_length
-
-        return self \
-                .substitute(dim.name, tgt_expr) \
-                .copy(dims=
-                        self.dims[:idx] + [
-                            LoopDimension(
-                                name=outer_name,
-                                length=dim.length//inner_length,
-                                tag=outer_tag),
-                            LoopDimension(
-                                name=inner_name,
-                                length=inner_length,
-                                tag=inner_tag),
-                            ]
-                        + self.dims[(idx+1):]), tgt_expr
+        outer = var(outer_name)
+        inner = var(inner_name)
+
+        new_loop_index = inner + outer*inner_length
+
+        if is_even_split:
+            new_dims = [
+                    LoopDimension(
+                        name=outer_name,
+                        length=dim.length//inner_length,
+                        tag=outer_tag),
+                    LoopDimension(
+                        name=inner_name,
+                        length=inner_length,
+                        tag=inner_tag),
+                    ]
+        else:
+            from pytools import div_ceil
+            new_dims = [
+                    LoopDimension(
+                        name=outer_name,
+                        length=div_ceil(dim.length, inner_length),
+                        last_cond=((outer+1)*inner_length, ">=", dim.length),
+                        tag=outer_tag),
+                    LoopDimension(
+                        name=inner_name,
+                        length=inner_length,
+                        end_cond=(new_loop_index, ">=", dim.length),
+                        tag=inner_tag,
+                        end_cond_if_last_of=dim.end_cond_if_last_of | set([outer_name])),
+                    ]
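+            # For example, splitting a loop of length 1000 with inner_length=13
+            # gives an outer loop of length div_ceil(1000, 13) == 77; the outer
+            # dimension's last_cond ((outer+1)*13 >= 1000) marks its final
+            # iteration, and the inner dimension's end_cond
+            # (inner + outer*13 >= 1000) guards against overrunning the
+            # original loop bound there.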
+
+        return (self
+                .substitute(dim.name, new_loop_index)
+                .copy(dims=self.dims[:idx] + new_dims + self.dims[(idx+1):]), 
+                new_loop_index)
 
     def get_invalid_reason(self):
         gdims = self.group_dims()
@@ -820,27 +876,25 @@ class RegisterPrefetch(Record):
 
 
 
-def generate_prefetch_code(ccm, kernel, schedule, sched_index, inner):
+def generate_prefetch_code(ccm, kernel, sched_index, last_of):
     from pymbolic import var
 
-    from cgen import (POD, Block,
-            Initializer, Assign, Statement as S, ArrayOf,
-            For, If, Line, Comment, MaybeUnused)
-
-    from cgen.opencl import CLLocal
+    from cgen import (Block,
+            Assign, Statement as S,
+            For, If, Line, Comment)
 
     # find surrounding schedule items
-    if sched_index > 0:
-        next_inner_sched_item = schedule[sched_index-1]
+    if sched_index-1 >= 0:
+        next_outer_sched_item = kernel.schedule[sched_index-1]
     else:
-        next_inner_sched_item = None
+        next_outer_sched_item = None
 
-    if sched_index+1 < len(schedule):
-        next_outer_sched_item = schedule[sched_index+1]
+    if sched_index+1 < len(kernel.schedule):
+        next_inner_sched_item = kernel.schedule[sched_index+1]
     else:
-        next_outer_sched_item = None
+        next_inner_sched_item = None
 
-    scheduled_pf = schedule[sched_index]
+    scheduled_pf = kernel.schedule[sched_index]
     pf = kernel.prefetch[
             scheduled_pf.input_vector, scheduled_pf.index_expr]
 
@@ -863,8 +917,7 @@ def generate_prefetch_code(ccm, kernel, schedule, sched_index, inner):
 
     # {{{ first, fix the user-specified fetch dims
 
-    knl_work_item_dims = sorted(kernel.dims_by_tag_type(WORK_ITEM_IDX_TAG),
-            key=lambda dim: dim.tag.axis)
+    knl_work_item_dims = kernel.ordered_dims_by_tag_type(WORK_ITEM_IDX_TAG)
 
     for realization_dim_idx, loc_fetch_axis_list in \
             getattr(pf, "loc_fetch_axes", {}).iteritems():
@@ -903,6 +956,13 @@ def generate_prefetch_code(ccm, kernel, schedule, sched_index, inner):
         # for size reasons
         realization_dims[least_stride_pf_dim_idx] = [knl_work_item_dims.pop(0)]
 
+    if knl_work_item_dims:
+        # FIXME
+        from warnings import warn
+        warn("There were leftover work group dimensions in prefetch "
+                "assignment. For now, this won't lead to wrong code, "
+                "but it will lead to unnecessary memory bandwidth use.")
+
     # }}}
 
     # }}}
@@ -912,8 +972,8 @@ def generate_prefetch_code(ccm, kernel, schedule, sched_index, inner):
     def make_fetch_loop_nest(pf_dim_idx, pf_dim_exprs=[], pf_idx_subst_map={}):
         # may mutate kernel for prefetch dim enlargement
 
+        from pymbolic.mapper.substitutor import substitute
         if pf_dim_idx >= len(pf.dims):
-            from pymbolic.mapper.substitutor import substitute
             # done, return
             return Assign(
                     pf.name + "".join("[%s]" % ccm(dexpr, PREC_NONE)
@@ -955,6 +1015,14 @@ def generate_prefetch_code(ccm, kernel, schedule, sched_index, inner):
                             "%s < %s" % (ccm(pf_dim_expr, PREC_NONE), pf_dim.length),
                             inner)
 
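+                # In the 'last iteration' code path, additionally guard the
+                # fetch against indexing past the end of the original
+                # (unevenly split) loop.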
+                if (pf_dim.end_cond is not None
+                        and pf_dim.end_cond_if_last_of <= last_of):
+                    inner = If(
+                            generate_condition_code(ccm,
+                                pf_dim.end_cond, negate=True,
+                                expr_map=lambda expr: substitute(expr, pf_idx_subst_map)),
+                            inner)
+
                 if result is None:
                     result = inner
                 elif isinstance(result, Block):
@@ -964,6 +1032,7 @@ def generate_prefetch_code(ccm, kernel, schedule, sched_index, inner):
 
                 start_index += total_realiz_size
 
+
             return result
 
             # }}}
@@ -991,15 +1060,6 @@ def generate_prefetch_code(ccm, kernel, schedule, sched_index, inner):
 
     # }}}
 
-    # {{{ build lmem array declarator
-
-    smem_pf_array = POD(kernel.arg_dict[pf.input_vector].dtype, pf.name)
-    for l in pf.dim_storage_lengths:
-        smem_pf_array = ArrayOf(smem_pf_array, l)
-    smem_pf_array = CLLocal(smem_pf_array)
-
-    # }}}
-
     new_block = Block([
             Comment(("prefetch %s dim: " % pf.input_vector) + ", ".join(
                 "%s[%d]" % (pfdim.name, pfdim.length) for pfdim in pf.dims)),
@@ -1014,10 +1074,7 @@ def generate_prefetch_code(ccm, kernel, schedule, sched_index, inner):
             "no sync needed"))
 
     new_block.extend([
-        Line(),
-        smem_pf_array,
         fetch_block,
-        Line(),
         ])
 
     # omit tail sync primitive if we're headed into another prefetch
@@ -1027,24 +1084,152 @@ def generate_prefetch_code(ccm, kernel, schedule, sched_index, inner):
         new_block.append(Comment("next inner schedule item is a prefetch: "
             "no sync needed"))
 
-    new_block.extend([Line(),inner])
+    new_block.extend([Line(), build_loop_nest(ccm, kernel, sched_index+1, last_of)])
 
     return new_block
 
 
 
 
-def generate_code(kernel):
-    from cgen import FunctionBody, FunctionDeclaration, \
-            POD, Value, RestrictPointer, Module, Block, \
-            Initializer, Assign, Statement, For, \
-            Define, Line, Const, LiteralLines
 
-    from cgen.opencl import CLKernel, CLGlobal, CLRequiredWorkGroupSize
+def generate_loop_dim_code(ccm, kernel, sched_index, last_of):
+    from cgen import (POD, Block, Initializer,
+            For, If, Line, Comment, add_comment)
+
+    dim = kernel.schedule[sched_index]
 
-    S = Statement
+    if dim.tag is None:
+        # regular loop
+        if dim.last_cond is not None:
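+            # Emitted code, schematically:
+            #   <unsigned> NAME = 0;
+            #   for ( ; <negated last_cond>; ++NAME) { body }
+            #   /* last iteration of NAME, with its bounds checks enabled */
+            #   body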
+            return Block([
+                    Initializer(POD(np.uint32, dim.name), 0),
+                    For(
+                        "",
+                        generate_condition_code(ccm, dim.last_cond, negate=True),
+                        "++%s" % dim.name, 
+                        build_loop_nest(ccm, kernel, sched_index+1, last_of)),
+                    Line(),
+                    Comment("last iteration of %s loop, with added bounds checks" % dim.name),
+                    build_loop_nest(ccm, kernel, sched_index+1, 
+                        last_of=last_of | set([dim.name]))
+                    ])
+
+        elif dim.length is not None:
+            if dim.end_cond is not None and dim.end_cond_if_last_of <= last_of:
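+                # In the 'last iteration' of the enclosing split loop, bound
+                # this loop by end_cond instead of its full length.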
+                return For(
+                        "int %s = 0" % dim.name,
+                        generate_condition_code(ccm, dim.end_cond, negate=True),
+                        "++%s" % dim.name, 
+                        build_loop_nest(ccm, kernel, sched_index+1, last_of))
+            else:
+                return For(
+                        "int %s = 0" % dim.name,
+                        "%s < %s" % (dim.name, ccm(dim.length, PREC_NONE)),
+                        "++%s" % dim.name, 
+                        build_loop_nest(ccm, kernel, sched_index+1, last_of))
+        else:
+            raise RuntimeError("unsupported loop ending condition")
+    else:
+        if dim.last_cond is not None:
+            return If(generate_condition_code(ccm, dim.last_cond, negate=True),
+                add_comment(
+                    "not the last entry along the '%s' work group axis" % dim.name,
+                    build_loop_nest(ccm, kernel, sched_index+1, last_of)),
+                add_comment(
+                    "last entry along the '%s' work group axis" % dim.name,
+                    build_loop_nest(ccm, kernel, sched_index+1, 
+                        last_of=last_of | set([dim.name]))))
+        else:
+            return build_loop_nest(ccm, kernel, sched_index+1, last_of)
+
+
+
+
+def get_parallel_dim_bounds_checks(ccm, kernel, last_of, stmt):
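+    """Wrap *stmt* in guards against the *end_cond* of any parallel (group or
+    work-item) dimension whose *end_cond_if_last_of* set is contained in
+    *last_of*."""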
+    from cgen import If
+
+    for dim in (
+            kernel.dims_by_tag_type(GROUP_IDX_TAG)
+            + kernel.dims_by_tag_type(WORK_ITEM_IDX_TAG)):
+        if (dim.end_cond is not None
+                and dim.end_cond_if_last_of <= last_of):
+            stmt = If(
+                    generate_condition_code(ccm, dim.end_cond, negate=True),
+                    stmt)
+
+    return stmt
+
+
+
+
+def build_loop_nest(ccm, kernel, sched_index, last_of=set()):
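+    """Recursively generate code for ``kernel.schedule[sched_index:]``.
+
+    *last_of* is the set of dimension names for which we are currently
+    emitting 'last iteration' code, enabling their associated bounds checks.
+    """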
+    from cgen import (POD, Block, Initializer, Assign, Statement as S,
+            block_if_necessary)
+
+    if sched_index >= len(kernel.schedule):
+        # {{{ write innermost loop body
+
+        from pymbolic.primitives import Subscript
+
+        insns = []
+        for lvalue, expr in kernel.instructions:
+            assert isinstance(lvalue, Subscript)
+            name = lvalue.aggregate.name
+            insns.append(S("tmp_%s += %s"
+                % (name, ccm(expr, PREC_NONE))))
+
+        return get_parallel_dim_bounds_checks(ccm, kernel, last_of, 
+                block_if_necessary(insns))
+
+        # }}}
+
+    sched_item = kernel.schedule[sched_index]
+
+    if isinstance(sched_item, LoopDimension):
+        return generate_loop_dim_code(ccm, kernel, sched_index, last_of)
+
+    elif isinstance(sched_item, WriteOutput):
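+        # Declare one accumulator per instruction, emit the enclosed loops,
+        # then store the accumulated results, guarded by any applicable
+        # bounds checks.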
+        return Block(
+                [Initializer(POD(kernel.arg_dict[lvalue.aggregate.name].dtype,
+                    "tmp_"+lvalue.aggregate.name), 0)
+                    for lvalue, expr in kernel.instructions]
+                +[build_loop_nest(ccm, kernel, sched_index+1, last_of)]+
+                [get_parallel_dim_bounds_checks(ccm, kernel, last_of,
+                    block_if_necessary([
+                        Assign(
+                            ccm(lvalue, PREC_NONE),
+                            "tmp_"+lvalue.aggregate.name)
+                        for lvalue, expr in kernel.instructions]))])
+
+    elif isinstance(sched_item, PrefetchDescriptor):
+        return generate_prefetch_code(ccm, kernel, sched_index, last_of)
+
+    elif isinstance(sched_item, RegisterPrefetch):
+        agg_name = sched_item.subscript_expr.aggregate.name
+        return Block([
+            get_parallel_dim_bounds_checks(ccm, kernel, last_of,
+                Initializer(POD(kernel.arg_dict[agg_name].dtype,
+                    sched_item.new_name),
+                    "%s[%s]"
+                    % (agg_name,
+                        ccm(sched_item.subscript_expr.index, PREC_NONE)))),
+
+            build_loop_nest(ccm, kernel, sched_index+1, last_of)])
+
+    else:
+        raise ValueError("invalid schedule item encountered")
+
+
+
+
+
+def generate_code(kernel):
+    from cgen import (FunctionBody, FunctionDeclaration,
+            POD, Value, RestrictPointer, ArrayOf, Module, Block,
+            Define, Line, Const, LiteralLines)
+
+    from cgen.opencl import CLKernel, CLGlobal, CLRequiredWorkGroupSize, CLLocal
 
-    from pymbolic.primitives import Subscript
 
     # {{{ assign names, dim storage lengths to prefetches
 
@@ -1079,71 +1264,13 @@ def generate_code(kernel):
 
     ccm = LoopyCCodeMapper(kernel)
 
-    # {{{ write innermost loop body
-
-    inner = Block([])
-    for lvalue, expr in kernel.instructions:
-        assert isinstance(lvalue, Subscript)
-        name = lvalue.aggregate.name
-        inner.append(S("tmp_%s += %s"
-            % (name, ccm(expr, PREC_NONE))))
-
-    group_size = kernel.group_size()
-
-    # }}}
-
-    # {{{ nest loop bodies around existing code
-
-    # we're progressing from the innermost (last in the schedule)
-    # to the outermost loop
-
-    schedule = kernel.schedule[::-1]
-    for sched_index, sched_item in enumerate(schedule):
-        # write code for loops
-        if isinstance(sched_item, LoopDimension):
-            dim = sched_item
-            if dim.tag is None:
-                inner = For(
-                        "int %s = 0" % dim.name,
-                        "%s < %s" % (dim.name, ccm(dim.length, PREC_NONE)),
-                        "++%s" % dim.name, inner)
-
-        # write code for output writes
-        elif isinstance(sched_item, WriteOutput):
-            inner = Block(
-                    [Initializer(POD(kernel.arg_dict[lvalue.aggregate.name].dtype,
-                        "tmp_"+lvalue.aggregate.name), 0)
-                        for lvalue, expr in kernel.instructions]
-                    +[inner]+
-                    [Assign(
-                        ccm(lvalue, PREC_NONE),
-                        "tmp_"+lvalue.aggregate.name)
-                        for lvalue, expr in kernel.instructions])
-
-        # write code for prefetches
-        elif isinstance(sched_item, PrefetchDescriptor):
-            inner = generate_prefetch_code(
-                    ccm, kernel, schedule, sched_index, inner)
-
-        elif isinstance(sched_item, RegisterPrefetch):
-            agg_name = sched_item.subscript_expr.aggregate.name
-            inner = Block([
-                Initializer(POD(kernel.arg_dict[agg_name].dtype,
-                    sched_item.new_name),
-                    "%s[%s]"
-                    % (agg_name,
-                        ccm(sched_item.subscript_expr.index, PREC_NONE))),
-                inner])
-
-        else:
-            raise ValueError("invalid schedule item encountered")
-
-    # }}}
 
     # {{{ build top-level
 
     mod = Module()
 
+    group_size = kernel.group_size()
+
     # {{{ examine arg list
 
     has_double = False
@@ -1175,23 +1302,40 @@ def generate_code(kernel):
 
     # {{{ symbolic names for group and local indices
 
-    mod.extend([Define(dim.name, "get_group_id(%d) /* 0..%s */"
+    mod.extend([Define(dim.name, "get_group_id(%d) /* 0..(%s) */"
                 % (dim.tag.axis, ccm(dim.length-1, PREC_NONE)))
-                for dim in kernel.dims_by_tag_type(GROUP_IDX_TAG)]
-            + [Define(dim.name, "get_local_id(%d) /* 0..%s */"
+                for dim in kernel.ordered_dims_by_tag_type(GROUP_IDX_TAG)]
+            + [Define(dim.name, "get_local_id(%d) /* 0..(%s) */"
                 % (dim.tag.axis, ccm(dim.length-1, PREC_NONE)))
-                for dim in kernel.dims_by_tag_type(WORK_ITEM_IDX_TAG)]
+                for dim in kernel.ordered_dims_by_tag_type(WORK_ITEM_IDX_TAG)]
             + [Line()])
 
     # }}}
 
+    body = Block()
+
+    # {{{ build lmem array declarators for prefetches
+
+    for pf in kernel.prefetch.itervalues():
+        smem_pf_array = POD(kernel.arg_dict[pf.input_vector].dtype, pf.name)
+        for l in pf.dim_storage_lengths:
+            smem_pf_array = ArrayOf(smem_pf_array, l)
+        body.append(CLLocal(smem_pf_array))
+
+    # }}}
+
+    body.extend([
+        Line(),
+        build_loop_nest(ccm, kernel, 0)])
+
     mod.append(
         FunctionBody(
             CLRequiredWorkGroupSize(
-                tuple(dim.length for dim in kernel.dims_by_tag_type(WORK_ITEM_IDX_TAG)),
+                tuple(dim.length 
+                    for dim in kernel.ordered_dims_by_tag_type(WORK_ITEM_IDX_TAG)),
                 CLKernel(FunctionDeclaration(
                     Value("void", kernel.name), args))),
-            Block([inner])))
+            body))
 
     # }}}
 
-- 
GitLab