From 4c46e82206a4d89863d72056166ec7e082f3e3b0 Mon Sep 17 00:00:00 2001
From: Rio Yokota <yokota@yeager.bu.edu>
Date: Thu, 20 Jan 2011 21:19:21 -0500
Subject: [PATCH] Reorganize files a bit.

---
 .gitignore             |   1 +
 read-only-dump/fmm.py  | 270 +++++++++++++++++++++++++++++++++++
 read-only-dump/fmm2.py | 141 +++++++++++++++++++
 read-only-dump/tree.py | 140 +++++++++++++++++++
 rio/kernel.c           | 100 +++++++++++++
 rio/tree.c             | 205 +++++++++++++++++++++++++++
 rio/tree.cu            | 309 +++++++++++++++++++++++++++++++++++++++++
 7 files changed, 1166 insertions(+)
 create mode 100644 read-only-dump/fmm.py
 create mode 100644 read-only-dump/fmm2.py
 create mode 100644 read-only-dump/tree.py
 create mode 100644 rio/kernel.c
 create mode 100644 rio/tree.c
 create mode 100644 rio/tree.cu

diff --git a/.gitignore b/.gitignore
index d6e3e906..64192954 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@ setuptools*egg
 setuptools.pth
 distribute*egg
 distribute*tar.gz
+a.out
diff --git a/read-only-dump/fmm.py b/read-only-dump/fmm.py
new file mode 100644
index 00000000..bba0c1ab
--- /dev/null
+++ b/read-only-dump/fmm.py
@@ -0,0 +1,270 @@
+from __future__ import division
+
+import pyopencl as cl
+import pyopencl.array as cl_array
+import numpy as np
+import sympy as sp
+import sympy.printing.ccode
+import numpy.linalg as la
+
+# TODO:
+# - Data layout, float4s bad
+# - Make side-effect-free
+# - Exclude self-interaction if source and target are same
+
+# LATER:
+# - Optimization for source = target (postpone)
+
+
+DIRECT_KERNEL = """
+    __kernel void sum_direct(
+      __global float *potential_g,
+      __global const float4 *target_g,
+      __global const float4 *source_g,
+      ulong nsource,
+      ulong ntarget)
+    {
+      int itarget = get_global_id(0);
+      if (itarget >= ntarget) return;
+
+      float p=0;
+      for(int isource=0; isource<nsource; isource++ )
+      {
+        float4 dist = target_g[itarget] - source_g[isource];
+        float4 dist_sq = dist*dist;
+        p += source_g[isource].w * rsqrt(dist_sq.x + dist_sq.y + dist_sq.z);
+      }
+      potential_g[itarget] = p;
+    }
+    """
+
+
+
+def make_sym_vector(name, components):
+    return sp.Matrix(
+            [sp.Symbol("%s%d" % (name, i)) for i in range(components)])
+
+
+
+class CLCodePrinter(sp.printing.codeprinter.CodePrinter):
+    def __init__(self, vectors=set()):
+        sp.printing.codeprinter.CodePrinter.__init__(self)
+        self.vectors = vectors
+ 
+    def _print_Pow(self, expr):
+        from sympy.core import S
+        from sympy.printing.precedence import precedence
+        PREC = precedence(expr)
+        if expr.exp is S.NegativeOne:
+            return '1.0/%s'%(self.parenthesize(expr.base, PREC))
+        elif expr.exp == 0.5:
+            return 'sqrt(%s)' % self._print(expr.base)
+        elif expr.exp == -0.5:
+            return 'rsqrt(%s)' % self._print(expr.base)
+        elif expr.exp == 2 and isinstance(expr.base, sp.Symbol):
+            return '%s*%s' % (expr.base.name, expr.base.name)
+        else:
+            return 'pow(%s, %s)'%(self._print(expr.base),
+                                 self._print(expr.exp))
+
+
+
+
+class SympyMapper(object):
+    def __call__(self, expr, *args, **kwargs):
+        return self.rec(expr, *args, **kwargs)
+
+    def rec(self, expr, *args, **kwargs):
+        mro = list(type(expr).__mro__)
+
+        while mro:
+            method_name = "map_"+mro.pop(0).__name__
+
+            try:
+                method = getattr(self, method_name)
+            except AttributeError:
+                pass
+            else:
+                return method(expr, *args, **kwargs)
+
+        raise NotImplementedError(
+                "%s does not know how to map type '%s'"
+                % (type(self).__name__,
+                    type(expr).__name__))
+
+
+
+
+
+class IdentityMapper(SympyMapper):
+    def map_Add(self, expr):
+        return type(expr)(*tuple(self.rec(arg) for arg in expr.args))
+
+    map_Mul = map_Add
+    map_Pow = map_Add
+    map_Function = map_Add
+
+    def map_Integer(self, expr):
+        return expr
+
+    map_Symbol = map_Integer
+    map_Real = map_Integer
+
+
+
+
+class SquareRewriter(IdentityMapper):
+    def __init__(self, symbol_gen, expr_to_var={}):
+        self.assignments = []
+        self.symbol_gen = iter(symbol_gen)
+        self.expr_to_var = expr_to_var
+
+    def get_var_for(self, expr):
+        try:
+            return self.expr_to_var[expr]
+        except KeyError:
+            sym = self.symbol_gen.next()
+            self.assignments.append((sym, expr))
+            self.expr_to_var[expr] = sym
+            return sym
+
+    def __call__(self, var_name, expr):
+        self.assignments.append((var_name, self.rec(expr)))
+
+    def map_Pow(self, expr):
+        if expr.exp == 2:
+            new_base = self.get_var_for(expr.base)
+            return new_base**2
+        else:
+            return IdentityMapper.map_Pow(self, expr)
+
+
+
+
+
+def generate_cl_statements_from_assignments(assignments):
+    """Return a list of "name = expr" statement strings for *assignments*.
+
+    :param assignments: a list of tuples *(var_name, expr)*
+    """
+    # {{{ perform CSE
+    # One generator of "cse"-prefixed names is shared by CSE and the square rewriter.
+    from sympy.utilities.iterables import  numbered_symbols
+    sym_gen = numbered_symbols("cse")
+
+    new_assignments = []
+    for var_name, expr in assignments:
+        print 'Initial expression for',var_name
+        print expr
+        from sympy.simplify.cse_main import cse
+        replacements, reduced = cse([expr], sym_gen)
+        print 'replacements', replacements
+        print 'reduced', reduced
+        new_assignments.extend(
+                (sym.name, expr) for sym, expr in replacements)
+        print 'new_assignments', new_assignments
+        new_assignments.append((var_name, reduced[0]))
+        print new_assignments
+
+    assignments = new_assignments
+    print 'After CSE',assignments
+    # }}}
+
+    # {{{ rewrite squares
+    # Expressions that already have a name are pre-registered so squares of them reuse it.
+    sq_rewriter = SquareRewriter(sym_gen, expr_to_var=dict(
+        (expr, sp.Symbol(var_name)) for var_name, expr in assignments))
+
+    for var_name, expr in assignments:
+        sq_rewriter(var_name, expr)
+    print 'After SqRewrite',sq_rewriter.assignments
+
+    # }}}
+
+    # {{{ print code
+    # One string per entry of sq_rewriter.assignments, in that order.
+    ccp = CLCodePrinter()
+    return ["%s = %s" % (var_name, ccp.doprint(expr))
+            for var_name, expr in sq_rewriter.assignments]
+
+    # }}}
+
+def gen_direct_sum_for_kernel(expr):  # print the generated statements for *expr*
+    lines = generate_cl_statements_from_assignments(
+            [("result", expr)])
+    # The statements are only printed, one per line; nothing is returned.
+    print "\n".join(lines)
+    # NOTE(review): the next line always raises ZeroDivisionError -- looks like a debugging stop.
+    1/0
+
+
+
+def make_coulomb_kernel(dimensions=3):
+    tgt = make_sym_vector("t", dimensions)
+    src = make_sym_vector("s", dimensions)
+
+    return 1/sp.sqrt(((tgt-src).T*(tgt-src))[0,0])
+
+
+
+
+
+def test_direct():  # compare the OpenCL direct sum against a NumPy reference
+    target = np.random.rand(5000, 4).astype(np.float32)
+    source = np.random.rand(5000, 4).astype(np.float32)
+    # NOTE(review): the second device of the first platform is hard-coded below.
+    dev = cl.get_platforms()[0].get_devices()[1]
+    print dev.name
+    ctx = cl.Context([dev])
+    queue = cl.CommandQueue(ctx)
+
+    target_dev = cl_array.to_device(ctx, queue, target)
+    source_dev = cl_array.to_device(ctx, queue, source)
+    # NOTE(review): gen_direct_sum_for_kernel prints its statements and then hits
+    # its "1/0" line, so as written the build below is never reached.
+
+    prg = cl.Program(ctx,
+            gen_direct_sum_for_kernel(
+                    make_coulomb_kernel()
+                    .diff(sp.Symbol("t0"))
+                    )).build()
+
+    sum_direct = prg.sum_direct
+    sum_direct.set_scalar_arg_dtypes([None, None, None, np.uintp, np.uintp])
+    # The global size below is padded up to a multiple of grp_size.
+    potential_dev = cl_array.empty(ctx, len(target), np.float32, queue=queue)
+    grp_size = 128
+    sum_direct(queue, ((len(target) + grp_size) // grp_size * grp_size,), (grp_size,),
+        potential_dev.data, target_dev.data, source_dev.data, len(source), len(target))
+
+    potential = potential_dev.get()
+    potential_host = np.empty_like(potential)
+    # Host reference: column 3 of source divided by the distance built from columns 0..2.
+    for itarg in xrange(len(target)):
+        potential_host[itarg] = np.sum(
+                source[:,3]
+                /
+                np.sum((target[itarg,:3] - source[:,:3])**2, axis=-1)**0.5)
+
+    #print potential[:100]
+    #print potential_host[:100]
+    assert la.norm(potential - potential_host)/la.norm(potential_host) < 1e-6
+
+def test_symbolic():
+    from sympy.utilities.iterables import numbered_symbols
+    dim         = 3
+    potKernel   = make_coulomb_kernel()
+    fieldKernel = sp.Matrix([potKernel.diff(s) for d,s in zip(range(dim), numbered_symbols('t'))])
+    print 'Kernels:'
+    print potKernel
+    print fieldKernel
+    print 'Kernel OpenCL code:'
+    lines = generate_cl_statements_from_assignments([('potential', potKernel)])
+    print '\n'.join(lines)
+    return
+
+if __name__ == "__main__":
+    test_symbolic()
+    #test_direct()
+
+# vim: foldmethod=marker
diff --git a/read-only-dump/fmm2.py b/read-only-dump/fmm2.py
new file mode 100644
index 00000000..ef720213
--- /dev/null
+++ b/read-only-dump/fmm2.py
@@ -0,0 +1,141 @@
+from __future__ import division
+
+import pyopencl as cl
+import pyopencl.array as cl_array
+import numpy as np
+import numpy.linalg as la
+import sympy as sp
+
+DIRECT_KERNEL = """
+    __kernel void sum_direct(
+      __global float *potential_g,
+      __global const float4 *target_g,
+      __global const float4 *source_g,
+      ulong nsource,
+      ulong ntarget)
+    {
+      int itarget = get_global_id(0);
+      if (itarget >= ntarget) return;
+
+      float p=0;
+      for(int isource=0; isource<nsource; isource++ )
+      {
+        float4 dist = target_g[itarget] - source_g[isource];
+        float4 dist_sq = dist*dist;
+        p += source_g[isource].w * rsqrt(dist_sq.x + dist_sq.y + dist_sq.z);
+      }
+      potential_g[itarget] = p;
+    }
+    """
+
+
+
+
+def generate_derivatives(dimensions, max_order):
+    from sumpy.fmm import make_sym_vector
+    x = make_sym_vector("x", dimensions)
+    func = 1/sp.sqrt((x.T*x)[0,0])
+
+    yield func
+
+    derivative_cache = {
+            dimensions*(0,): func
+            }
+    for order in range(max_order+1):
+        from pytools import (
+                generate_nonnegative_integer_tuples_summing_to_at_most
+                as gnitstam)
+        for idx in gnitstam(order, dimensions):
+
+
+
+
+
+
+
+def test_direct():
+    target = -np.random.rand(50, 4).astype(np.float32)
+    source = np.random.rand(50, 4).astype(np.float32)
+    multip = np.zeros((10,1))
+
+    xc = yc = zc = 0.5
+    for j in range(len(source)):
+        dx = xc-source[j,0]
+        dy = yc-source[j,1]
+        dz = zc-source[j,2]
+        multip[0] += source[j,3]
+        multip[1] += source[j,3] * dx
+        multip[2] += source[j,3] * dy
+        multip[3] += source[j,3] * dz
+        multip[4] += source[j,3] * dx * dx / 2
+        multip[5] += source[j,3] * dy * dy / 2
+        multip[6] += source[j,3] * dz * dz / 2
+        multip[7] += source[j,3] * dx * dy / 2
+        multip[8] += source[j,3] * dy * dz / 2
+        multip[9] += source[j,3] * dz * dx / 2
+        # this one is \vec(x)^n / n!
+
+    print "CTX"
+    dev = cl.get_platforms()[0].get_devices()[1]
+    print dev.name
+    ctx = cl.Context([dev])
+    queue = cl.CommandQueue(ctx)
+    print "CTX END"
+
+    target_dev = cl_array.to_device(ctx, queue, target)
+    source_dev = cl_array.to_device(ctx, queue, source)
+
+
+    prg = cl.Program(ctx,DIRECT_KERNEL).build()
+
+    sum_direct = prg.sum_direct
+    sum_direct.set_scalar_arg_dtypes([None, None, None, np.uintp, np.uintp])
+
+    potential_dev = cl_array.empty(ctx, len(target), np.float32, queue=queue)
+    grp_size = 128
+    sum_direct(queue, ((len(target) + grp_size) // grp_size * grp_size,), (grp_size,),
+        potential_dev.data, target_dev.data, source_dev.data, len(source), len(target))
+
+    potential = potential_dev.get()
+    potential_host = np.empty_like(potential)
+
+    for i in range(len(target)):
+        p = 0
+        X = target[i,0] - xc
+        Y = target[i,1] - yc
+        Z = target[i,2] - zc
+        R = (X * X + Y * Y + Z * Z)**0.5
+        R3 = R * R * R
+        R5 = R3 * R * R
+        p += multip[0] / R
+        p += multip[1] * (-X / R3)
+        p += multip[2] * (-Y / R3)
+        p += multip[3] * (-Z / R3)
+        p += multip[4] * (3 * X * X / R5 - 1 / R3)
+        p += multip[5] * (3 * Y * Y / R5 - 1 / R3)
+        p += multip[6] * (3 * Z * Z / R5 - 1 / R3)
+        p += multip[7] * (3 * X * Y / R5)
+        p += multip[8] * (3 * Y * Z / R5)
+        p += multip[9] * (3 * Z * X / R5)
+        # this one is grad^n 1/R
+        # ok -- i'll go play with sympy on screen 1
+        potential[i] = p
+    for itarg in xrange(len(target)):
+        potential_host[itarg] = np.sum(
+                source[:,3]
+                /
+                np.sum((target[itarg,:3] - source[:,:3])**2, axis=-1)**0.5)
+
+    print potential[:10]
+    print potential_host[:10]
+    print la.norm(potential - potential_host)/la.norm(potential_host)
+
+
+
+
+
+
+if __name__ == "__main__":
+    test_direct()
+
+# vim: foldmethod=marker
diff --git a/read-only-dump/tree.py b/read-only-dump/tree.py
new file mode 100644
index 00000000..0b359d6d
--- /dev/null
+++ b/read-only-dump/tree.py
@@ -0,0 +1,140 @@
+from __future__ import division
+
+import pyopencl as cl
+import pyopencl.array as cl_array
+import numpy as np
+import numpy.linalg as la
+
+DIRECT_KERNEL = """
+    __kernel void sum_direct(
+      __global float *potential_g,
+      __global const float4 *target_g,
+      __global const float4 *source_g,
+      ulong nsource,
+      ulong ntarget)
+    {
+      int itarget = get_global_id(0);
+      if (itarget >= ntarget) return;
+
+      float p=0;
+      for(int isource=0; isource<nsource; isource++ )
+      {
+        float4 dist = target_g[itarget] - source_g[isource];
+        float4 dist_sq = dist*dist;
+        p += source_g[isource].w * rsqrt(dist_sq.x + dist_sq.y + dist_sq.z);
+      }
+      potential_g[itarget] = p;
+    }
+    """
+
+
+def test_tree():
+    import logging
+    from math import log
+    logging.basicConfig(filename = 'fmm.log', level = logging.DEBUG)
+
+    logging.debug("CTX")
+    dev = cl.get_platforms()[0].get_devices()[1]
+    logging.debug(dev.name)
+    ctx = cl.Context([dev])
+    queue = cl.CommandQueue(ctx)
+    logging.debug("CTX END")
+
+    logging.debug('This message should go to the log file')
+    order           = 3
+    numCoefficients = order*(order+1)*(order+2)/6
+    
+    separationMax  = 10
+    base           = 1.55
+    targetOffsets  = base**np.linspace(log(1, base), log(separationMax, base), 20)
+    res            = []
+    np.random.seed(1)
+    # Convergence study
+    for target_offset in targetOffsets:
+        # (x,y,z,phi) for each target
+        xt = yt = zt = 0.5 - target_offset
+        target = np.random.rand(50, 4).astype(np.float32)
+        target[:,0] += xt - 0.5
+        target[:,1] += yt - 0.5
+        target[:,2] += zt - 0.5
+        # (x,y,z,q) for each source
+        source = np.random.rand(50, 4).astype(np.float32)
+        # M_i: coefficients of multipole expansion
+        multip = np.zeros((numCoefficients,1))
+
+        # Expand around source box center
+        xc = yc = zc = 0.5
+        for j in range(len(source)):
+            dx = xc-source[j,0]
+            dy = yc-source[j,1]
+            dz = zc-source[j,2]
+            multip[0] += source[j,3]
+            multip[1] += source[j,3] * dx
+            multip[2] += source[j,3] * dy
+            multip[3] += source[j,3] * dz
+            multip[4] += source[j,3] * dx * dx / 2
+            multip[5] += source[j,3] * dy * dy / 2
+            multip[6] += source[j,3] * dz * dz / 2
+            multip[7] += source[j,3] * dx * dy / 2
+            multip[8] += source[j,3] * dy * dz / 2
+            multip[9] += source[j,3] * dz * dx / 2
+
+        target_dev = cl_array.to_device(queue, target)
+        source_dev = cl_array.to_device(queue, source)
+
+        prg = cl.Program(ctx,DIRECT_KERNEL).build()
+
+        sum_direct = prg.sum_direct
+        sum_direct.set_scalar_arg_dtypes([None, None, None, np.uintp, np.uintp])
+
+        potential_dev = cl_array.empty(queue, len(target), np.float32)
+        grp_size = 1
+        sum_direct(queue, ((len(target) + grp_size) // grp_size * grp_size,), (grp_size,),
+                   potential_dev.data, target_dev.data, source_dev.data, len(source), len(target))
+
+        potential = potential_dev.get()
+        potential_host = np.empty_like(potential)
+
+        for i in range(len(target)):
+            p = 0
+            X = target[i,0] - xc
+            Y = target[i,1] - yc
+            Z = target[i,2] - zc
+            R = (X * X + Y * Y + Z * Z)**0.5
+            R3 = R * R * R
+            R5 = R3 * R * R
+            p += multip[0] / R
+            p += multip[1] * (-X / R3)
+            p += multip[2] * (-Y / R3)
+            p += multip[3] * (-Z / R3)
+            p += multip[4] * (3 * X * X / R5 - 1 / R3)
+            p += multip[5] * (3 * Y * Y / R5 - 1 / R3)
+            p += multip[6] * (3 * Z * Z / R5 - 1 / R3)
+            p += multip[7] * (3 * X * Y / R5)
+            p += multip[8] * (3 * Y * Z / R5)
+            p += multip[9] * (3 * Z * X / R5)
+            potential[i] = p
+        for itarg in xrange(len(target)):
+            potential_host[itarg] = np.sum(
+                source[:,3]
+                /
+                np.sum((target[itarg,:3] - source[:,:3])**2, axis=-1)**0.5)
+
+        logging.debug(potential[:10])
+        logging.debug(potential_host[:10])
+        residual = la.norm(potential - potential_host)/la.norm(potential_host)
+        logging.debug('Potential Residual: %g' % residual)
+        res.append(residual)
+    res = np.array(res)
+    logging.debug(res)
+    dist = np.sqrt(3*targetOffsets**2)
+    intercept, slope = np.polyfit(np.log(dist), np.log(res), 1)
+    if abs(slope + order+1) > 1.0e-1:
+        import sys
+        sys.exit('Order of approximation should be %d' % order+1)
+    return
+
+if __name__ == "__main__":
+    test_tree()
+
+# vim: foldmethod=marker
diff --git a/rio/kernel.c b/rio/kernel.c
new file mode 100644
index 00000000..3c989d47
--- /dev/null
+++ b/rio/kernel.c
@@ -0,0 +1,100 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+int main() {
+  int N = 10;
+  float xi[N], yi[N], zi[N], pi[N];
+  float xj[N], yj[N], zj[N], mj[N];
+// Initialize: targets in (-1,0]^3, sources in [0,1)^3, equal masses summing to 1
+  for( int i=0; i<N; i++ ) {
+    xi[i] = -rand() / (1. + RAND_MAX);
+    yi[i] = -rand() / (1. + RAND_MAX);
+    zi[i] = -rand() / (1. + RAND_MAX);
+    xj[i] =  rand() / (1. + RAND_MAX);
+    yj[i] =  rand() / (1. + RAND_MAX);
+    zj[i] =  rand() / (1. + RAND_MAX);
+    mj[i] = 1.0 / N;
+  }
+// Direct summation (softened by eps2), used as the reference
+  float dx, dy, dz, r, eps2=0.0001;
+  for( int i=0; i<N; i++ ) {
+    float p = 0;
+    for( int j=0; j<N; j++ ) {
+      dx = xi[i] - xj[j];
+      dy = yi[i] - yj[j];
+      dz = zi[i] - zj[j];
+      r = sqrtf(dx * dx + dy * dy + dz * dz + eps2);
+      p += mj[j] / r;
+    }
+    pi[i] = p;
+  }
+// P2M: cells 0..7 are the octants of the unit box, cell 8 is the box itself
+  float xc[9], yc[9], zc[9];
+  float multipole[9][10];
+  for( int i=0; i<8; i++ ) {
+    xc[i] = ( i      % 2) * 0.5 + 0.25;
+    yc[i] = ((i / 2) % 2) * 0.5 + 0.25;
+    zc[i] = ( i / 4     ) * 0.5 + 0.25;
+    for( int j=0; j<10; j++ ) multipole[i][j] = 0;
+  }
+  xc[8] = yc[8] = zc[8] = 0.5;
+  for( int j=0; j<10; j++ ) multipole[8][j] = 0;
+  for( int j=0; j<N; j++ ) {
+    int i = (xj[j] > xc[8]) + ((yj[j] > yc[8]) << 1) + ((zj[j] > zc[8]) << 2);
+    dx = xc[i] - xj[j];
+    dy = yc[i] - yj[j];
+    dz = zc[i] - zj[j];
+    multipole[i][0] += mj[j];
+    multipole[i][1] += mj[j] * dx;
+    multipole[i][2] += mj[j] * dy;
+    multipole[i][3] += mj[j] * dz;
+    multipole[i][4] += mj[j] * dx * dx / 2;
+    multipole[i][5] += mj[j] * dy * dy / 2;
+    multipole[i][6] += mj[j] * dz * dz / 2;
+    multipole[i][7] += mj[j] * dx * dy / 2;
+    multipole[i][8] += mj[j] * dy * dz / 2;
+    multipole[i][9] += mj[j] * dz * dx / 2;
+  }
+// M2M: shift the octant moments to the box center and accumulate them in cell 8
+  for( int i=0; i<8; i++ ) {
+    dx = xc[8] - xc[i];
+    dy = yc[8] - yc[i];
+    dz = zc[8] - zc[i];
+    multipole[8][0] += multipole[i][0];
+    multipole[8][1] += multipole[i][1] +  dx*multipole[i][0];
+    multipole[8][2] += multipole[i][2] +  dy*multipole[i][0];
+    multipole[8][3] += multipole[i][3] +  dz*multipole[i][0];
+    multipole[8][4] += multipole[i][4] +  dx*multipole[i][1] + dx * dx * multipole[i][0] / 2;
+    multipole[8][5] += multipole[i][5] +  dy*multipole[i][2] + dy * dy * multipole[i][0] / 2;
+    multipole[8][6] += multipole[i][6] +  dz*multipole[i][3] + dz * dz * multipole[i][0] / 2;
+    multipole[8][7] += multipole[i][7] + (dx*multipole[i][2] + dy * multipole[i][1] + dx * dy * multipole[i][0]) / 2;
+    multipole[8][8] += multipole[i][8] + (dy*multipole[i][3] + dz * multipole[i][2] + dy * dz * multipole[i][0]) / 2;
+    multipole[8][9] += multipole[i][9] + (dz*multipole[i][1] + dx * multipole[i][3] + dz * dx * multipole[i][0]) / 2;
+  }
+// M2P: evaluate the box expansion at every target and compare with the direct sum
+  float X, Y, Z, R, R3, R5, err=0,rel=0;
+  for( int i=0; i<N; i++ ) {
+    float p = 0;
+    X = xi[i] - xc[8];
+    Y = yi[i] - yc[8];
+    Z = zi[i] - zc[8];
+    R = sqrtf(X * X + Y * Y + Z * Z);
+    R3 = R * R * R;
+    R5 = R3 * R * R;
+    p += multipole[8][0] / R;
+    p += multipole[8][1] * (-X / R3);
+    p += multipole[8][2] * (-Y / R3);
+    p += multipole[8][3] * (-Z / R3);
+    p += multipole[8][4] * (3 * X * X / R5 - 1 / R3);
+    p += multipole[8][5] * (3 * Y * Y / R5 - 1 / R3);
+    p += multipole[8][6] * (3 * Z * Z / R5 - 1 / R3);
+    p += multipole[8][7] * (6 * X * Y / R5);  // moments 7..9 hold m*dx*dy/2, so the
+    p += multipole[8][8] * (6 * Y * Z / R5);  // mixed derivative 3XY/R^5 enters twice
+    p += multipole[8][9] * (6 * Z * X / R5);
+    err += (pi[i] - p) * (pi[i] - p);
+    rel += pi[i] * pi[i];
+    printf("%d %f %f\n",i,pi[i],p);
+  }
+  printf("error : %f\n",sqrtf(err/rel));
+}
diff --git a/rio/tree.c b/rio/tree.c
new file mode 100644
index 00000000..252aebd0
--- /dev/null
+++ b/rio/tree.c
@@ -0,0 +1,205 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+int   const NCRIT = 10;
+float const THETA = 0.5;
+float const EPS2  = 0.0001;
+
+struct cell {                      // one node of the octree
+  int nleaf, nchild, leaf[NCRIT];  // particle counter, bitmask of existing children (bit = octant), particle indices
+  float xc, yc, zc, r;             // cell center and half-width (add_child gives children r/2)
+  float multipole[10];             // [0] total mass, [1..3] first-order, [4..9] second-order moments
+  cell *parent, *child[8];         // tree links; child[] is indexed by octant
+};
+
+void initialize(cell *C) {
+  // Reset a cell: no particles, no children, no parent, all moments zero.
+  C->nleaf = 0; C->nchild = 0; C->parent = NULL;
+  for( int c=0; c<8; ++c ) C->child[c] = NULL;
+  for( int k=0; k<10; ++k ) C->multipole[k] = 0;
+}
+
+void add_child(int octant, cell *C, cell *&CN) {
+  cell *kid = ++CN;                 // advance CN to the next cell and use it for the child
+  initialize(kid);
+  kid->r  = C->r/2;
+  kid->xc = C->xc + kid->r * ((octant & 1) ? 1 : -1);
+  kid->yc = C->yc + kid->r * ((octant & 2) ? 1 : -1);
+  kid->zc = C->zc + kid->r * ((octant & 4) ? 1 : -1);
+  kid->parent = C;
+  C->child[octant] = kid;
+  C->nchild |= (1 << octant);
+}
+
+void split_cell(float *x, float *y, float *z, cell *C, cell *&CN) {
+  for( int slot=0; slot<NCRIT; ++slot ) {  // push each particle held by C into its child octant
+    const int body = C->leaf[slot];
+    const int octant = (x[body] > C->xc) | ((y[body] > C->yc) << 1) | ((z[body] > C->zc) << 2);
+    if( (C->nchild & (1 << octant)) == 0 ) add_child(octant,C,CN);
+    cell *kid = C->child[octant];
+    kid->leaf[kid->nleaf++] = body;
+    if( kid->nleaf >= NCRIT ) split_cell(x,y,z,kid,CN);
+  }
+}
+
+void getMultipole(cell *C, float *x, float *y, float *z, float *m, cell **twig, int &ntwig) {
+  if( C->nleaf >= NCRIT ) {                 // split cell: only descend into existing children
+    for( int octant=0; octant<8; ++octant )
+      if( C->nchild & (1 << octant) ) getMultipole(C->child[octant],x,y,z,m,twig,ntwig);
+    return;
+  }
+  // Twig cell: accumulate its particles' moments about the cell center, then record it.
+  float *M = C->multipole;
+  for( int l=0; l<C->nleaf; ++l ) {
+    const int j = C->leaf[l];
+    const float q = m[j];
+    const float dx = C->xc-x[j], dy = C->yc-y[j], dz = C->zc-z[j];
+    M[0] += q;
+    M[1] += q * dx;
+    M[2] += q * dy;
+    M[3] += q * dz;
+    M[4] += q * dx * dx / 2;
+    M[5] += q * dy * dy / 2;
+    M[6] += q * dz * dz / 2;
+    M[7] += q * dx * dy / 2;
+    M[8] += q * dy * dz / 2;
+    M[9] += q * dz * dx / 2;
+  }
+  twig[ntwig] = C;
+  ntwig++;
+}
+
+void upwardSweep(cell *C, cell *P) {
+  // Shift the moments of C from its own center to P's center and add them to P.
+  const float dx = P->xc - C->xc, dy = P->yc - C->yc, dz = P->zc - C->zc;
+  const float *Mc = C->multipole;
+  float       *Mp = P->multipole;
+  Mp[0] += Mc[0];
+  Mp[1] += Mc[1] +  dx*Mc[0];
+  Mp[2] += Mc[2] +  dy*Mc[0];
+  Mp[3] += Mc[3] +  dz*Mc[0];
+  Mp[4] += Mc[4] +  dx*Mc[1] + dx * dx * Mc[0] / 2;
+  Mp[5] += Mc[5] +  dy*Mc[2] + dy * dy * Mc[0] / 2;
+  Mp[6] += Mc[6] +  dz*Mc[3] + dz * dz * Mc[0] / 2;
+  Mp[7] += Mc[7] + (dx*Mc[2] + dy * Mc[1] + dx * dy * Mc[0]) / 2;
+  Mp[8] += Mc[8] + (dy*Mc[3] + dz * Mc[2] + dy * dz * Mc[0]) / 2;
+  Mp[9] += Mc[9] + (dz*Mc[1] + dx * Mc[3] + dz * dx * Mc[0]) / 2;
+}
+
+// Add to p[] the potential that the particles under CJ induce on the particles of CI.
+void evaluate(cell *CI, cell *CJ, float *x, float *y, float *z, float *m, float *p) {
+  if( CJ->nleaf >= NCRIT ) {
+    for( int c=0; c<8; c++ ) {
+      if( CJ->nchild & (1 << c) ) {
+        cell *CC = CJ->child[c];
+        float dx = CI->xc - CC->xc;
+        float dy = CI->yc - CC->yc;
+        float dz = CI->zc - CC->zc;
+        float r = sqrtf(dx * dx + dy * dy + dz * dz);
+        if( CI->r+CC->r > THETA*r ) {          // too close: open the child cell
+          evaluate(CI,CC,x,y,z,m,p);
+        } else {                               // far enough: use the child's expansion
+          for( int l=0; l<CI->nleaf; l++ ) {
+            int i = CI->leaf[l];
+            float X = x[i] - CC->xc;
+            float Y = y[i] - CC->yc;
+            float Z = z[i] - CC->zc;
+            float R = sqrtf(X * X + Y * Y + Z * Z);
+            float R3 = R * R * R;
+            float R5 = R3 * R * R;
+            p[i] += CC->multipole[0] / R;
+            p[i] += CC->multipole[1] * (-X / R3);
+            p[i] += CC->multipole[2] * (-Y / R3);
+            p[i] += CC->multipole[3] * (-Z / R3);
+            p[i] += CC->multipole[4] * (3 * X * X / R5 - 1 / R3);
+            p[i] += CC->multipole[5] * (3 * Y * Y / R5 - 1 / R3);
+            p[i] += CC->multipole[6] * (3 * Z * Z / R5 - 1 / R3);
+            p[i] += CC->multipole[7] * (6 * X * Y / R5);  // moments 7..9 hold m*dx*dy/2, so the
+            p[i] += CC->multipole[8] * (6 * Y * Z / R5);  // mixed derivative 3XY/R^5 enters twice
+            p[i] += CC->multipole[9] * (6 * Z * X / R5);
+          }
+        }
+      }
+    }
+  } else {                                     // CJ is a twig: direct (softened) summation
+    for( int li=0; li<CI->nleaf; li++ ) {
+      int i = CI->leaf[li];
+      for( int lj=0; lj<CJ->nleaf; lj++ ) {
+        int j = CJ->leaf[lj];
+        float dx = x[i] - x[j];
+        float dy = y[i] - y[j];
+        float dz = z[i] - z[j];
+        float r = sqrtf(dx * dx + dy * dy + dz * dz + EPS2);
+        p[i] += m[j] / r;
+      }
+    }
+  }
+}
+
+int main() {
+  int N = 50;
+  float x[N], y[N], z[N], m[N], p[N], pd[N];
+// Initialize: positions uniform in [-1,1)^3, equal masses summing to 1
+  for( int i=0; i<N; i++ ) {
+    x[i] = rand() / (1. + RAND_MAX) * 2 - 1;
+    y[i] = rand() / (1. + RAND_MAX) * 2 - 1;
+    z[i] = rand() / (1. + RAND_MAX) * 2 - 1;
+    m[i] = 1.0 / N;
+  }
+// Direct summation (reference); the softened self term is removed up front
+  float dx, dy, dz, r;
+  for( int i=0; i<N; i++ ) {
+    float pp = - m[i] / sqrtf(EPS2);
+    for( int j=0; j<N; j++ ) {
+      dx = x[i] - x[j];
+      dy = y[i] - y[j];
+      dz = z[i] - z[j];
+      r = sqrtf(dx * dx + dy * dy + dz * dz + EPS2);
+      pp += m[j] / r;
+    }
+    pd[i] = pp;
+  }
+
+// Set root cell: it has to enclose every particle, i.e. the cube [-1,1)^3
+  cell C0[N];
+  initialize(C0);
+  C0->xc = C0->yc = C0->zc = 0;  C0->r = 1;
+// Build tree
+  cell *CN = C0;
+  for( int i=0; i<N; i++ ) {
+    cell *C = C0;
+    while( C->nleaf >= NCRIT ) {
+      C->nleaf++;
+      int octant = (x[i] > C->xc) + ((y[i] > C->yc) << 1) + ((z[i] > C->zc) << 2);
+      if( !(C->nchild & (1 << octant)) ) add_child(octant,C,CN);
+      C = C->child[octant];
+    }
+    C->leaf[C->nleaf++] = i;
+    if( C->nleaf >= NCRIT ) split_cell(x,y,z,C,CN);
+  }
+// Multipole expansion
+  int ntwig=0;
+  cell *twig[N];
+  getMultipole(C0,x,y,z,m,twig,ntwig);
+// Upward translation: cells were created parent-first, so walk them back to front
+  for( cell *C=CN; C!=C0; --C ) {
+    cell *P = C->parent;
+    upwardSweep(C,P);
+  }
+// Evaluate expansion
+  float err=0,rel=0;
+  for( int i=0; i<N; i++ ) p[i] = -m[i] / sqrtf(EPS2);
+  for( int t=0; t<ntwig; t++ ) {
+    cell *CI = twig[t];
+    cell *CJ = C0;
+    evaluate(CI,CJ,x,y,z,m,p);
+    for( int l=0; l<CI->nleaf; l++ ) {
+      int i = CI->leaf[l];
+      err += (pd[i] - p[i]) * (pd[i] - p[i]);
+      rel += pd[i] * pd[i];
+      printf("%d %f %f\n",i,pd[i],p[i]);
+    }
+  }
+  printf("error : %f\n",sqrtf(err/rel));
+}
diff --git a/rio/tree.cu b/rio/tree.cu
new file mode 100644
index 00000000..9d8cbc48
--- /dev/null
+++ b/rio/tree.cu
@@ -0,0 +1,309 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+const int   THREADS = 8;        // threads per block and entries per shared-memory tile
+const int   NCRIT   = THREADS;  // a cell holding NCRIT or more particles is split
+const float THETA   = 0.5f;     // evaluate() opens a cell when CI->r + CC->r > THETA * distance
+const float EPS2    = 0.0001f;  // softening added to the squared distance in the direct sums
+
+// Add to target.w the potential at target due to entry i of multipShrd: 13 floats
+// per entry, [0..2] the expansion centre and [3..12] the ten values of cell::multipole.
+__device__ void multipole(int i, float4 &target, float *multipShrd) {
+  const float *M = multipShrd + i*13;
+  const float dx = target.x - M[0], dy = target.y - M[1], dz = target.z - M[2];
+  const float R  = rsqrtf(dx * dx + dy * dy + dz * dz);   // 1/|d|, no EPS2 softening here
+  const float R3 = R * R * R;
+  const float R5 = R3 * R * R;
+  target.w += M[ 3] * R;
+  target.w += M[ 4] * (-dx * R3);
+  target.w += M[ 5] * (-dy * R3);
+  target.w += M[ 6] * (-dz * R3);
+  target.w += M[ 7] * (3 * dx * dx * R5 - 1 * R3);
+  target.w += M[ 8] * (3 * dy * dy * R5 - 1 * R3);
+  target.w += M[ 9] * (3 * dz * dz * R5 - 1 * R3);
+// Mixed terms: the Taylor sum counts xy and yx while the moments hold m*dx*dy/2, hence 6, not 3.
+  target.w += M[10] * (6 * dx * dy * R5);
+  target.w += M[11] * (6 * dy * dz * R5);
+  target.w += M[12] * (6 * dz * dx * R5);
+}
+
+// Potential of all targets of one twig. Launch with one block per twig and
+// blockDim.x == THREADS (one thread per target slot); static shared memory only.
+//   offSrcGlob, offMtpGlob : gridDim.x+1 offsets, block b owns entries [off[b], off[b+1])
+//   sourceGlob : direct-sum sources, xyz = position, w = mass
+//   multipGlob : far cells, 13 floats per entry as read by multipole()
+//                (the first 3 floats are the cell centre, the other 10 its moments)
+//   targetGlob : THREADS slots per block; in: xyz = position, w = mass, out: w = potential
+// Every slot is processed and written back, including those the host did not fill.
+__global__ void kernel(int *offSrcGlob, float4 *sourceGlob, int *offMtpGlob, float *multipGlob, float4 *targetGlob) {
+  const int tid = threadIdx.x;   // also the shared-memory slot this thread fills per tile
+  __shared__ float4 sourceShrd[THREADS];
+  __shared__ float  multipShrd[13*THREADS];
+  float4 target = targetGlob[blockIdx.x * THREADS + tid];
+// w becomes minus the softened self term m/sqrt(EPS2); the direct sum below adds the
+// same term back when the target is listed among its own sources.
+  target.w *= -rsqrtf(EPS2);
+// Near field, THREADS sources per tile. begin/end depend on blockIdx only, so all
+// threads of the block run the same number of tiles and reach every barrier.
+// A partial last tile loads and sums only the entries that exist; an empty list is skipped.
+  int begin = offSrcGlob[blockIdx.x];
+  int end   = offSrcGlob[blockIdx.x+1];
+  for( int base=begin; base<end; base+=THREADS ) {
+    const int count = (end - base < THREADS) ? end - base : THREADS;
+    __syncthreads();                                  // previous tile fully consumed
+    if( tid < count ) sourceShrd[tid] = sourceGlob[base + tid];   // stay inside [begin,end)
+    __syncthreads();                                  // tile visible to all threads
+    for( int i=0; i<count; i++ ) {
+      float3 d;
+      d.x = target.x - sourceShrd[i].x;
+      d.y = target.y - sourceShrd[i].y;
+      d.z = target.z - sourceShrd[i].z;
+      target.w += sourceShrd[i].w * rsqrtf(d.x * d.x + d.y * d.y + d.z * d.z + EPS2);
+    }
+  }
+// Far field: the same tiling over the multipole entries.
+  begin = offMtpGlob[blockIdx.x];
+  end   = offMtpGlob[blockIdx.x+1];
+  for( int base=begin; base<end; base+=THREADS ) {
+    const int count = (end - base < THREADS) ? end - base : THREADS;
+    __syncthreads();
+    if( tid < count ) {
+      for( int i=0; i<13; i++ )
+        multipShrd[tid*13+i] = multipGlob[(base + tid)*13+i];
+    }
+    __syncthreads();
+    for( int i=0; i<count; i++ ) {
+      multipole(i,target,multipShrd);
+    }
+  }
+  targetGlob[blockIdx.x * THREADS + tid] = target;
+}
+
+struct cell {
+  int nleaf, nchild, leaf[NCRIT];   // particle count (>= NCRIT marks a split cell), bitmask of existing children (bit = octant), particle indices
+  float xc, yc, zc, r;              // centre and size; add_child() gives a child r/2 and shifts its centre by that amount per axis
+  float multipole[10];              // [0] mass, [1..3] first moments, [4..6] xx,yy,zz and [7..9] xy,yz,zx second moments (each /2)
+  cell *parent, *child[8];          // tree links; child[] is indexed by octant
+};
+
+// Reset a cell to the empty state: no leaves, no children, no parent and all ten
+// multipole coefficients zero. Centre and size (xc, yc, zc, r) are left untouched.
+void initialize(cell *C) {
+  C->parent = NULL; C->nleaf = 0; C->nchild = 0;
+  for( int k=0; k<10; k++ ) { C->multipole[k] = 0; if( k<8 ) C->child[k] = NULL; }
+}
+
+// Take the cell following CN (advancing CN) and link it under C as the child for `octant`.
+void add_child(int octant, cell *C, cell *&CN) {
+  cell *kid = ++CN;
+  initialize(kid);
+  kid->r  = C->r/2;                                      // half the parent's size
+  kid->xc = C->xc + ((octant & 1) ? kid->r : -kid->r);   // bit 0 set: upper half in x
+  kid->yc = C->yc + ((octant & 2) ? kid->r : -kid->r);   // bit 1 set: upper half in y
+  kid->zc = C->zc + ((octant & 4) ? kid->r : -kid->r);   // bit 2 set: upper half in z
+  kid->parent = C; C->child[octant] = kid;               // link both ways
+  C->nchild |= (1 << octant);
+}
+
+void split_cell(float *x, float *y, float *z, cell *C, cell *&CN) {   // hand the NCRIT particles listed in C->leaf down to C's children
+  for( int i=0; i<NCRIT; i++ ) {
+    int l = C->leaf[i];   // particle index
+    int octant = (x[l] > C->xc) + ((y[l] > C->yc) << 1) + ((z[l] > C->zc) << 2);   // bit 0/1/2 set when x/y/z lies above the centre
+    if( !(C->nchild & (1 << octant)) ) add_child(octant,C,CN);   // create the child on first use (advances CN)
+    cell *CC = C->child[octant];
+    CC->leaf[CC->nleaf++] = l;
+    if( CC->nleaf >= NCRIT ) split_cell(x,y,z,CC,CN);   // child is full: split it too. NOTE(review): no depth limit if NCRIT particles always share an octant
+  }
+}   // C->nleaf and C->leaf[] are left as they were
+
+// Walk the tree below C. Every cell holding fewer than NCRIT particles is a twig:
+// accumulate its ten moments about the cell centre and append the cell to twig[].
+void getMultipole(cell *C, float *x, float *y, float *z, float *m, cell **twig, int &ntwig) {
+  if( C->nleaf >= NCRIT ) {                       // split cell: visit the children that exist
+    for( int c=0; c<8; c++ )
+      if( C->nchild & (1 << c) ) getMultipole(C->child[c],x,y,z,m,twig,ntwig);
+    return;
+  }
+  float *M = C->multipole;
+  for( int l=0; l<C->nleaf; l++ ) {
+    const int j = C->leaf[l];
+    // offsets are taken as centre minus particle
+    const float w = m[j], dx = C->xc - x[j], dy = C->yc - y[j], dz = C->zc - z[j];
+    M[0] += w;
+    M[1] += w * dx;
+    M[2] += w * dy;
+    M[3] += w * dz;
+    M[4] += w * dx * dx / 2;
+    M[5] += w * dy * dy / 2;
+    M[6] += w * dz * dz / 2;
+    M[7] += w * dx * dy / 2;
+    M[8] += w * dy * dz / 2;
+    M[9] += w * dz * dx / 2;
+  }
+  twig[ntwig++] = C;
+}
+
+// Add the ten moments of child C to parent P, re-expressed about P's centre
+// (the shift is the offset from C's centre to P's centre).
+void upwardSweep(cell *C, cell *P) {
+  const float dx = P->xc - C->xc, dy = P->yc - C->yc, dz = P->zc - C->zc;
+  float *src = C->multipole, *dst = P->multipole;
+  dst[0] += src[0];
+  dst[1] += src[1] +  dx*src[0];
+  dst[2] += src[2] +  dy*src[0];
+  dst[3] += src[3] +  dz*src[0];
+  dst[4] += src[4] +  dx*src[1] + dx * dx * src[0] / 2;
+  dst[5] += src[5] +  dy*src[2] + dy * dy * src[0] / 2;
+  dst[6] += src[6] +  dz*src[3] + dz * dz * src[0] / 2;
+  dst[7] += src[7] + (dx*src[2] + dy * src[1] + dx * dy * src[0]) / 2;
+  dst[8] += src[8] + (dy*src[3] + dz * src[2] + dy * dz * src[0]) / 2;
+  dst[9] += src[9] + (dz*src[1] + dx * src[3] + dz * dx * src[0]) / 2;
+}
+
+// Build the interaction lists of target twig CI against source cell CJ: particles of twigs
+// go to sourceHost, accepted cells to multipHost (13 floats each). p is not used here.
+void evaluate(cell *CI, cell *CJ, float *x, float *y, float *z, float *m, float *p,
+              int &offSrc, float4 *sourceHost, int &offMtp, float *multipHost) {
+  if( CJ->nleaf < NCRIT ) {                        // CJ is a twig: queue its particles
+    for( int lj=0; lj<CJ->nleaf; lj++ ) {
+      const int j = CJ->leaf[lj];
+      float4 &src = sourceHost[offSrc++];
+      src.x = x[j];
+      src.y = y[j];
+      src.z = z[j];
+      src.w = m[j];
+    }
+    return;
+  }
+  for( int c=0; c<8; c++ ) {
+    if( !(CJ->nchild & (1 << c)) ) continue;       // this octant has no child
+    cell *CC = CJ->child[c];
+    const float dx = CI->xc - CC->xc;
+    const float dy = CI->yc - CC->yc;
+    const float dz = CI->zc - CC->zc;
+    const float dist = sqrtf(dx * dx + dy * dy + dz * dz);
+    if( CI->r + CC->r > THETA*dist ) {             // too close: descend into the child
+      evaluate(CI,CC,x,y,z,m,p,offSrc,sourceHost,offMtp,multipHost);
+      continue;
+    }
+    // accepted: record the centre followed by the ten moments
+    float *rec = &multipHost[offMtp++ * 13];
+    rec[0] = CC->xc;
+    rec[1] = CC->yc;
+    rec[2] = CC->zc;
+    for( int i=0; i<10; i++ ) rec[i + 3] = CC->multipole[i];
+  }
+}
+
+// Driver: host-side direct sum as reference, then the tree code with every twig's
+// near- and far-field sums evaluated on the GPU; prints both potentials per particle.
+int main() {
+  const int N = 50;
+  float x[N],y[N],z[N],m[N],p[N],pd[N];
+// Initialize: equal masses at random positions in [0,1)^3
+  for( int i=0; i<N; i++ ) {
+    x[i] = rand() / (1. + RAND_MAX);
+    y[i] = rand() / (1. + RAND_MAX);
+    z[i] = rand() / (1. + RAND_MAX);
+    m[i] = 1.0 / N;
+  }
+// Direct summation: j runs over i as well, so start from minus the softened self term
+  for( int i=0; i<N; i++ ) {
+    float pp = - m[i] / sqrtf(EPS2);
+    for( int j=0; j<N; j++ ) {
+      float dx = x[i] - x[j];
+      float dy = y[i] - y[j];
+      float dz = z[i] - z[j];
+      pp += m[j] / sqrtf(dx * dx + dy * dy + dz * dz + EPS2);
+    }
+    pd[i] = pp;
+  }
+
+// Set root cell (C0[0]); the following entries of C0 are handed out by add_child()
+// NOTE(review): nothing checks that the tree needs at most N cells
+  cell C0[N];
+  initialize(C0);
+  C0->xc = C0->yc = C0->zc = C0->r = 0.5;
+// Build tree: walk down through split cells (nleaf >= NCRIT), store, split when full
+  cell *CN = C0;
+  for( int i=0; i<N; i++ ) {
+    cell *C = C0;
+    while( C->nleaf >= NCRIT ) {
+      C->nleaf++;
+      int octant = (x[i] > C->xc) + ((y[i] > C->yc) << 1) + ((z[i] > C->zc) << 2);
+      if( !(C->nchild & (1 << octant)) ) add_child(octant,C,CN);
+      C = C->child[octant];
+    }
+    C->leaf[C->nleaf++] = i;
+    if( C->nleaf >= NCRIT ) split_cell(x,y,z,C,CN);
+  }
+// Multipole expansion
+  int ntwig=0;
+  cell *twig[N];
+  getMultipole(C0,x,y,z,m,twig,ntwig);
+// Upward translation: cells are created after their parent, so walking the pool
+// backwards completes each cell's moments before they are added to its parent
+  for( cell *C=CN; C!=C0; --C ) upwardSweep(C,C->parent);
+// Evaluate expansion: THREADS target slots per twig, Nlist source slots and Nlist
+// multipole slots of 13 floats each
+  int Nround = ntwig * THREADS;
+  int Nlist  = ntwig * Nround;
+  int Mround =    13 * Nlist;
+  int *offSrcDevc, *offMtpDevc; float4 *sourceDevc, *targetDevc; float *multipDevc;
+// Allocate memory on host and device
+  int    *offSrcHost = (int   *) malloc( (ntwig+1)*sizeof(int) );
+  int    *offMtpHost = (int   *) malloc( (ntwig+1)*sizeof(int) );
+  float4 *sourceHost = (float4*) malloc(  Nlist*sizeof(float4) );
+  float4 *targetHost = (float4*) malloc( Nround*sizeof(float4) );
+  float  *multipHost = (float *) malloc( Mround*sizeof(float ) );
+  cudaMalloc(  (void**) &offSrcDevc, (ntwig+1)*sizeof(int) );
+  cudaMalloc(  (void**) &offMtpDevc, (ntwig+1)*sizeof(int) );
+  cudaMalloc(  (void**) &sourceDevc,  Nlist*sizeof(float4) );
+  cudaMalloc(  (void**) &targetDevc, Nround*sizeof(float4) );
+  cudaMalloc(  (void**) &multipDevc, Mround*sizeof(float ) );
+  for( int i=0; i<N; i++ ) p[i] = 0;
+  int offSrc = 0, offMtp = 0;
+  for( int t=0; t<ntwig; t++ ) {
+    cell *CI = twig[t];
+    offSrcHost[t] = offSrc;
+    offMtpHost[t] = offMtp;
+    for( int l=0; l<CI->nleaf; l++ ) {   // slots l >= nleaf stay unset and are never read back
+      int i = CI->leaf[l];
+      targetHost[t * THREADS + l].x = x[i];
+      targetHost[t * THREADS + l].y = y[i];
+      targetHost[t * THREADS + l].z = z[i];
+      targetHost[t * THREADS + l].w = m[i];
+    }
+    evaluate(CI,C0,x,y,z,m,p,offSrc,sourceHost,offMtp,multipHost);
+  }
+  offSrcHost[ntwig] = offSrc;
+  offMtpHost[ntwig] = offMtp;
+// Direct summation on device
+  cudaMemcpy(offSrcDevc,offSrcHost,(ntwig+1)*sizeof(int),cudaMemcpyHostToDevice);
+  cudaMemcpy(offMtpDevc,offMtpHost,(ntwig+1)*sizeof(int),cudaMemcpyHostToDevice);
+  cudaMemcpy(sourceDevc,sourceHost, Nlist*sizeof(float4),cudaMemcpyHostToDevice);
+  cudaMemcpy(multipDevc,multipHost,Mround*sizeof(float ),cudaMemcpyHostToDevice);   // whole buffer: 13 floats per slot
+  cudaMemcpy(targetDevc,targetHost,Nround*sizeof(float4),cudaMemcpyHostToDevice);
+  kernel<<< ntwig, THREADS >>>(offSrcDevc,sourceDevc,offMtpDevc,multipDevc,targetDevc);
+  cudaError_t status = cudaGetLastError();             // launch errors
+  if( status == cudaSuccess )                          // execution errors surface at the blocking copy
+    status = cudaMemcpy(targetHost,targetDevc,Nround*sizeof(float4),cudaMemcpyDeviceToHost);
+  if( status != cudaSuccess ) { fprintf(stderr,"CUDA error : %s\n",cudaGetErrorString(status)); return 1; }
+// Compare results
+  float err=0, rel=0;
+  for( int t=0; t<ntwig; t++ ) {
+    cell *CI = twig[t];
+    for( int l=0; l<CI->nleaf; l++ ) {
+      int i = CI->leaf[l];
+      p[i] += targetHost[t * THREADS + l].w;
+      err += (pd[i] - p[i]) * (pd[i] - p[i]);
+      rel += pd[i] * pd[i];
+      printf("%d %f %f\n",i,pd[i],p[i]);
+    }
+  }
+  printf("error : %f\n",sqrtf(err/rel));
+  free(offSrcHost); free(offMtpHost); free(sourceHost); free(targetHost); free(multipHost);
+  cudaFree(offSrcDevc); cudaFree(offMtpDevc); cudaFree(sourceDevc); cudaFree(targetDevc); cudaFree(multipDevc);
+}
-- 
GitLab