External (vector) function call

(There are actually two issues here, but I'm putting it together as they have significant overlaps.)

Using inline-kernel branch, which builds on top of kernel_callable_v3 branch, I can generate the following code for assembling two-forms in Firedrake (and I think two-form assembly is the last hurdle before fully migrating to Loopy backend). This is after inlining the local assembly kernel.

#include <math.h>
#include <petsc.h>


static double const child_t0[3 * 3] __attribute__ ((aligned (64))) = { 0.6666666666666669, 0.16666666666666663, 0.16666666666666666, 0.16666666666666674, 0.16666666666666663, 0.6666666666666665, 0.16666666666666669, 0.6666666666666666, 0.16666666666666663 };
static double const child_t4[3] __attribute__ ((aligned (64))) = { 0.16666666666666666, 0.16666666666666666, 0.16666666666666666 };

void wrap_form00_cell_integral_otherwise(int const start, int const end, unsigned long const mat0, double const *__restrict__ dat0, int const *__restrict__ map0)
{
  double child_t1 __attribute__ ((aligned (64)));
  double child_t2 __attribute__ ((aligned (64)));
  double child_t3 __attribute__ ((aligned (64)));
  double child_t5 __attribute__ ((aligned (64)));
  double child_t6 __attribute__ ((aligned (64)));
  double t2[3 * 3] __attribute__ ((aligned (64)));
  double t3[3 * 2] __attribute__ ((aligned (64)));

  for (int n = start; n <= -1 + end; ++n)
  {
    for (int i13 = 0; i13 <= 2; ++i13)
      for (int i14 = 0; i14 <= 1; ++i14)
        t3[2 * i13 + i14] = dat0[2 * map0[3 * n + i13] + i14];
    for (int i1 = 0; i1 <= 2; ++i1)
      for (int i4 = 0; i4 <= 2; ++i4)
        t2[3 * i1 + i4] = 0.0;
    child_t1 = -1.0 * t3[0];
    child_t2 = -1.0 * t3[1];
    child_t3 = fabs((child_t1 + t3[2]) * (child_t2 + t3[5]) + -1.0 * (child_t1 + t3[4]) * (child_t2 + t3[3]));
    for (int child_ip = 0; child_ip <= 2; ++child_ip)
    {
      child_t5 = child_t4[child_ip] * child_t3;
      for (int child_j = 0; child_j <= 2; ++child_j)
      {
        child_t6 = child_t0[3 * child_ip + child_j] * child_t5;
        for (int child_k = 0; child_k <= 2; ++child_k)
          t2[3 * child_j + child_k] = t2[3 * child_j + child_k] + child_t0[3 * child_ip + child_k] * child_t6;
      }
    }
    mat0 = MatSetValuesBlockedLocal(3, &(map0[3 * n]), 3, &(map0[3 * n]), &(t2[0]), ADD_VALUES);
  }
}

Good news is that we are almost there. The only problem here is external types. The argument mat0, should be of type Mat (which is basically a pointer to matrix), defined by PETSc. So the first question is

What's the best way to introduce external data types? I can think of a) adding a new target for which we extend the types, or b) introducing a opaque data type to loopy type system.

I guess if I use the correct type, the function call will become MatSetValuesBlockedLocal(mat0, 3, ...), which is exactly what we want.

Now I can try transformations. If I split n and tag n_inner as ilp.seq for vectorization, I got this code:

#include <math.h>
#include <petsc.h>


static double const child_t0[3 * 3] __attribute__ ((aligned (64))) = { 0.6666666666666669, 0.16666666666666663, 0.16666666666666666, 0.16666666666666674, 0.16666666666666663, 0.6666666666666665, 0.16666666666666669, 0.6666666666666666, 0.16666666666666663 };
static double const child_t4[3] __attribute__ ((aligned (64))) = { 0.16666666666666666, 0.16666666666666666, 0.16666666666666666 };

void wrap_form00_cell_integral_otherwise(int const start, int const end, unsigned long const mat0, double const *__restrict__ dat0, int const *__restrict__ map0)
{
  double child_t1[4] __attribute__ ((aligned (64)));
  double child_t2[4] __attribute__ ((aligned (64)));
  double child_t3[4] __attribute__ ((aligned (64)));
  double child_t5[4] __attribute__ ((aligned (64)));
  double child_t6[4] __attribute__ ((aligned (64)));
  double t2[3 * 3 * 4] __attribute__ ((aligned (64)));
  double t3[3 * 2 * 4] __attribute__ ((aligned (64)));

  for (int n_outer = (start / 4); n_outer <= ((-4 + end) / 4); ++n_outer)
  {
    for (int i13 = 0; i13 <= 2; ++i13)
      for (int i14 = 0; i14 <= 1; ++i14)
        for (int n_inner = 0; n_inner <= 3; ++n_inner)
          t3[8 * i13 + 4 * i14 + n_inner] = dat0[2 * map0[3 * (4 * n_outer + n_inner) + i13] + i14];
    for (int i1 = 0; i1 <= 2; ++i1)
      for (int i4 = 0; i4 <= 2; ++i4)
        for (int n_inner = 0; n_inner <= 3; ++n_inner)
          t2[12 * i1 + 4 * i4 + n_inner] = 0.0;
    for (int n_inner = 0; n_inner <= 3; ++n_inner)
    {
      child_t1[n_inner] = -1.0 * t3[n_inner];
      child_t2[n_inner] = -1.0 * t3[4 + n_inner];
      child_t3[n_inner] = fabs((child_t1[n_inner] + t3[8 + n_inner]) * (child_t2[n_inner] + t3[20 + n_inner]) + -1.0 * (child_t1[n_inner] + t3[16 + n_inner]) * (child_t2[n_inner] + t3[12 + n_inner]));
    }
    for (int child_ip = 0; child_ip <= 2; ++child_ip)
    {
      for (int n_inner = 0; n_inner <= 3; ++n_inner)
        child_t5[n_inner] = child_t4[child_ip] * child_t3[n_inner];
      for (int child_j = 0; child_j <= 2; ++child_j)
      {
        for (int n_inner = 0; n_inner <= 3; ++n_inner)
          child_t6[n_inner] = child_t0[3 * child_ip + child_j] * child_t5[n_inner];
        for (int child_k = 0; child_k <= 2; ++child_k)
          for (int n_inner = 0; n_inner <= 3; ++n_inner)
            t2[12 * child_j + 4 * child_k + n_inner] = t2[12 * child_j + 4 * child_k + n_inner] + child_t0[3 * child_ip + child_k] * child_t6[n_inner];
      }
    }
    for (int n_inner = 0; n_inner <= 3; ++n_inner)
      mat0 = MatSetValuesBlockedLocal(3, &(map0[3 * (4 * n_outer + n_inner)]), 3, &(map0[3 * (4 * n_outer + n_inner)]), &(t2[n_inner]), ADD_VALUES);
  }
}

This is almost correct (!) except for in the function call MatSetValuesBlockedLocal, the input data is passed in as &(t2[n_inner]).

Recall that what we are doing here is computing the array t2, and use it to update mat0. Originally t2 has shape (3, 3). In SubArrayRef notation, it is [i, j]:t2[i, j]. After ilp.seq transformation, it is vector expanded to (3, 3, 4), and it should become [i, j]: t2[i, j, n_inner]. The problem is that MatSetValuesBlockedLocal expects an contiguous array for t2, i.e. the swept inames should be the innermost dimensions. So I think we either need to "transpose" t2 to (4, 3, 3), or (probably better) extract t2[:,:,n_inner] to a temporary array before calling the function. So the second question is:

For external vector function calls on temporaries, do we need to ensure that the temporaries have the swept inames as the innermost dimension? This might require explicit repacking of data.

Hopefully it is clear. Thanks very much!

-TJ

Edited Apr 26, 2018 by tj-sun