From 4c46e82206a4d89863d72056166ec7e082f3e3b0 Mon Sep 17 00:00:00 2001 From: Rio Yokota Date: Thu, 20 Jan 2011 21:19:21 -0500 Subject: [PATCH] Reorganize files a bit. --- .gitignore | 1 + read-only-dump/fmm.py | 270 +++++++++++++++++++++++++++++++++++ read-only-dump/fmm2.py | 141 +++++++++++++++++++ read-only-dump/tree.py | 140 +++++++++++++++++++ rio/kernel.c | 100 +++++++++++++ rio/tree.c | 205 +++++++++++++++++++++++++++ rio/tree.cu | 309 +++++++++++++++++++++++++++++++++++++++++ 7 files changed, 1166 insertions(+) create mode 100644 read-only-dump/fmm.py create mode 100644 read-only-dump/fmm2.py create mode 100644 read-only-dump/tree.py create mode 100644 rio/kernel.c create mode 100644 rio/tree.c create mode 100644 rio/tree.cu diff --git a/.gitignore b/.gitignore index d6e3e906..64192954 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ setuptools*egg setuptools.pth distribute*egg distribute*tar.gz +a.out diff --git a/read-only-dump/fmm.py b/read-only-dump/fmm.py new file mode 100644 index 00000000..bba0c1ab --- /dev/null +++ b/read-only-dump/fmm.py @@ -0,0 +1,270 @@ +from __future__ import division + +import pyopencl as cl +import pyopencl.array as cl_array +import numpy as np +import sympy as sp +import sympy.printing.ccode +import numpy.linalg as la + +# TODO: +# - Data layout, float4s bad +# - Make side-effect-free +# - Exclude self-interaction if source and target are same + +# LATER: +# - Optimization for source = target (postpone) + + +DIRECT_KERNEL = """ + __kernel void sum_direct( + __global float *potential_g, + __global const float4 *target_g, + __global const float4 *source_g, + ulong nsource, + ulong ntarget) + { + int itarget = get_global_id(0); + if (itarget >= ntarget) return; + + float p=0; + for(int isource=0; isource= ntarget) return; + + float p=0; + for(int isource=0; isource= ntarget) return; + + float p=0; + for(int isource=0; isource 1.0e-1: + import sys + sys.exit('Order of approximation should be %d' % order+1) + return + +if __name__ == "__main__": + test_tree() + +# vim: foldmethod=marker diff --git a/rio/kernel.c b/rio/kernel.c new file mode 100644 index 00000000..3c989d47 --- /dev/null +++ b/rio/kernel.c @@ -0,0 +1,100 @@ +#include +#include +#include + +int main() { + int N = 10; + float xi[N], yi[N], zi[N], pi[N]; + float xj[N], yj[N], zj[N], mj[N]; +// Initialize + for( int i=0; i xc[8]) + ((yj[j] > yc[8]) << 1) + ((zj[j] > zc[8]) << 2); + dx = xc[i] - xj[j]; + dy = yc[i] - yj[j]; + dz = zc[i] - zj[j]; + multipole[i][0] += mj[j]; + multipole[i][1] += mj[j] * dx; + multipole[i][2] += mj[j] * dy; + multipole[i][3] += mj[j] * dz; + multipole[i][4] += mj[j] * dx * dx / 2; + multipole[i][5] += mj[j] * dy * dy / 2; + multipole[i][6] += mj[j] * dz * dz / 2; + multipole[i][7] += mj[j] * dx * dy / 2; + multipole[i][8] += mj[j] * dy * dz / 2; + multipole[i][9] += mj[j] * dz * dx / 2; + } +// M2M + for( int i=0; i<8; i++ ) { + dx = xc[8] - xc[i]; + dy = yc[8] - yc[i]; + dz = zc[8] - zc[i]; + multipole[8][0] += multipole[i][0]; + multipole[8][1] += multipole[i][1] + dx*multipole[i][0]; + multipole[8][2] += multipole[i][2] + dy*multipole[i][0]; + multipole[8][3] += multipole[i][3] + dz*multipole[i][0]; + multipole[8][4] += multipole[i][4] + dx*multipole[i][1] + dx * dx * multipole[i][0] / 2; + multipole[8][5] += multipole[i][5] + dy*multipole[i][2] + dy * dy * multipole[i][0] / 2; + multipole[8][6] += multipole[i][6] + dz*multipole[i][3] + dz * dz * multipole[i][0] / 2; + multipole[8][7] += multipole[i][7] + (dx*multipole[i][2] + dy * multipole[i][1] + dx * dy * multipole[i][0]) / 2; + multipole[8][8] += multipole[i][8] + (dy*multipole[i][3] + dz * multipole[i][2] + dy * dz * multipole[i][0]) / 2; + multipole[8][9] += multipole[i][9] + (dz*multipole[i][1] + dx * multipole[i][3] + dz * dx * multipole[i][0]) / 2; + } +// M2P + float X, Y, Z, R, R3, R5, err=0,rel=0; + for( int i=0; i +#include +#include + +int const NCRIT = 10; +float const THETA = 0.5; +float const EPS2 = 0.0001; + +struct cell { + int nleaf, nchild, leaf[NCRIT]; + float xc, yc, zc, r; + float multipole[10]; + cell *parent, *child[8]; +}; + +void initialize(cell *C) { + C->nleaf = C->nchild = 0; + C->parent = NULL; + for( int i=0; i<8; i++ ) C->child[i] = NULL; + for( int i=0; i<10; i++ ) C->multipole[i] = 0; +} + +void add_child(int octant, cell *C, cell *&CN) { + ++CN; + initialize(CN); + CN->r = C->r/2; + CN->xc = C->xc + CN->r * ((octant & 1) * 2 -1); + CN->yc = C->yc + CN->r * ((octant & 2) - 1 ); + CN->zc = C->zc + CN->r * ((octant & 4) / 2 - 1); + CN->parent = C; + C->child[octant] = CN; + C->nchild |= (1 << octant); +} + +void split_cell(float *x, float *y, float *z, cell *C, cell *&CN) { + for( int i=0; ileaf[i]; + int octant = (x[l] > C->xc) + ((y[l] > C->yc) << 1) + ((z[l] > C->zc) << 2); + if( !(C->nchild & (1 << octant)) ) add_child(octant,C,CN); + cell *CC = C->child[octant]; + CC->leaf[CC->nleaf++] = l; + if( CC->nleaf >= NCRIT ) split_cell(x,y,z,CC,CN); + } +} + +void getMultipole(cell *C, float *x, float *y, float *z, float *m, cell **twig, int &ntwig) { + float dx, dy, dz; + if( C->nleaf >= NCRIT ) { + for( int c=0; c<8; c++ ) + if( C->nchild & (1 << c) ) getMultipole(C->child[c],x,y,z,m,twig,ntwig); + } else { + for( int l=0; lnleaf; l++ ) { + int j = C->leaf[l]; + dx = C->xc-x[j]; + dy = C->yc-y[j]; + dz = C->zc-z[j]; + C->multipole[0] += m[j]; + C->multipole[1] += m[j] * dx; + C->multipole[2] += m[j] * dy; + C->multipole[3] += m[j] * dz; + C->multipole[4] += m[j] * dx * dx / 2; + C->multipole[5] += m[j] * dy * dy / 2; + C->multipole[6] += m[j] * dz * dz / 2; + C->multipole[7] += m[j] * dx * dy / 2; + C->multipole[8] += m[j] * dy * dz / 2; + C->multipole[9] += m[j] * dz * dx / 2; + } + twig[ntwig] = C; + ntwig++; + } +} + +void upwardSweep(cell *C, cell *P) { + float dx, dy, dz; + dx = P->xc - C->xc; + dy = P->yc - C->yc; + dz = P->zc - C->zc; + P->multipole[0] += C->multipole[0]; + P->multipole[1] += C->multipole[1] + dx*C->multipole[0]; + P->multipole[2] += C->multipole[2] + dy*C->multipole[0]; + P->multipole[3] += C->multipole[3] + dz*C->multipole[0]; + P->multipole[4] += C->multipole[4] + dx*C->multipole[1] + dx * dx * C->multipole[0] / 2; + P->multipole[5] += C->multipole[5] + dy*C->multipole[2] + dy * dy * C->multipole[0] / 2; + P->multipole[6] += C->multipole[6] + dz*C->multipole[3] + dz * dz * C->multipole[0] / 2; + P->multipole[7] += C->multipole[7] + (dx*C->multipole[2] + dy * C->multipole[1] + dx * dy * C->multipole[0]) / 2; + P->multipole[8] += C->multipole[8] + (dy*C->multipole[3] + dz * C->multipole[2] + dy * dz * C->multipole[0]) / 2; + P->multipole[9] += C->multipole[9] + (dz*C->multipole[1] + dx * C->multipole[3] + dz * dx * C->multipole[0]) / 2; +} + +void evaluate(cell *CI, cell *CJ, float *x, float *y, float *z, float *m, float *p) { + float dx, dy, dz, r, X, Y, Z, R, R3, R5; + if( CJ->nleaf >= NCRIT ) { + for( int c=0; c<8; c++ ) { + if( CJ->nchild & (1 << c) ) { + cell *CC = CJ->child[c]; + dx = CI->xc - CC->xc; + dy = CI->yc - CC->yc; + dz = CI->zc - CC->zc; + r = sqrtf(dx * dx + dy * dy + dz * dz); + if( CI->r+CC->r > THETA*r ) { + evaluate(CI,CC,x,y,z,m,p); + } else { + for( int l=0; lnleaf; l++ ) { + int i = CI->leaf[l]; + X = x[i] - CC->xc; + Y = y[i] - CC->yc; + Z = z[i] - CC->zc; + R = sqrtf(X * X + Y * Y + Z * Z); + R3 = R * R * R; + R5 = R3 * R * R; + p[i] += CC->multipole[0] / R; + p[i] += CC->multipole[1] * (-X / R3); + p[i] += CC->multipole[2] * (-Y / R3); + p[i] += CC->multipole[3] * (-Z / R3); + p[i] += CC->multipole[4] * (3 * X * X / R5 - 1 / R3); + p[i] += CC->multipole[5] * (3 * Y * Y / R5 - 1 / R3); + p[i] += CC->multipole[6] * (3 * Z * Z / R5 - 1 / R3); + p[i] += CC->multipole[7] * (3 * X * Y / R5); + p[i] += CC->multipole[8] * (3 * Y * Z / R5); + p[i] += CC->multipole[9] * (3 * Z * X / R5); + } + } + } + } + } else { + for( int li=0; linleaf; li++ ) { + int i = CI->leaf[li]; + for( int lj=0; ljnleaf; lj++ ) { + int j = CJ->leaf[lj]; + dx = x[i] - x[j]; + dy = y[i] - y[j]; + dz = z[i] - z[j]; + r = sqrtf(dx * dx + dy * dy + dz * dz + EPS2); + p[i] += m[j] / r; + } + } + } +} + +int main() { + int N = 50; + float x[N], y[N], z[N], m[N], p[N], pd[N]; +// Initialize + for( int i=0; ixc = C0->yc = C0->zc = C0->r = 0.5; +// Build tree + cell *CN = C0; + for( int i=0; inleaf >= NCRIT ) { + C->nleaf++; + int octant = (x[i] > C->xc) + ((y[i] > C->yc) << 1) + ((z[i] > C->zc) << 2); + if( !(C->nchild & (1 << octant)) ) add_child(octant,C,CN); + C = C->child[octant]; + } + C->leaf[C->nleaf++] = i; + if( C->nleaf >= NCRIT ) split_cell(x,y,z,C,CN); + } +// Multipole expansion + int ntwig=0; + cell *twig[N]; + getMultipole(C0,x,y,z,m,twig,ntwig); +// Upward translation + for( cell *C=CN; C!=C0; --C ) { + cell *P = C->parent; + upwardSweep(C,P); + } +// Evaluate expansion + float err=0,rel=0; + for( int i=0; inleaf; l++ ) { + int i = CI->leaf[l]; + err += (pd[i] - p[i]) * (pd[i] - p[i]); + rel += pd[i] * pd[i]; + printf("%d %f %f\n",i,pd[i],p[i]); + } + } + printf("error : %f\n",sqrtf(err/rel)); +} diff --git a/rio/tree.cu b/rio/tree.cu new file mode 100644 index 00000000..9d8cbc48 --- /dev/null +++ b/rio/tree.cu @@ -0,0 +1,309 @@ +#include +#include +#include + +int const THREADS = 8; +int const NCRIT = THREADS; +float const THETA = 0.5; +float const EPS2 = 0.0001; + +__device__ void multipole(int i, float4 &target, float *multipShrd) { + float R, R3, R5; + float3 d; + d.x = target.x - multipShrd[i*13+0]; + d.y = target.y - multipShrd[i*13+1]; + d.z = target.z - multipShrd[i*13+2]; + R = rsqrtf(d.x * d.x + d.y * d.y + d.z * d.z); + R3 = R * R * R; + R5 = R3 * R * R; + target.w += multipShrd[i*13+ 3] * R; + target.w += multipShrd[i*13+ 4] * (-d.x * R3); + target.w += multipShrd[i*13+ 5] * (-d.y * R3); + target.w += multipShrd[i*13+ 6] * (-d.z * R3); + target.w += multipShrd[i*13+ 7] * (3 * d.x * d.x * R5 - 1 * R3); + target.w += multipShrd[i*13+ 8] * (3 * d.y * d.y * R5 - 1 * R3); + target.w += multipShrd[i*13+ 9] * (3 * d.z * d.z * R5 - 1 * R3); + target.w += multipShrd[i*13+10] * (3 * d.x * d.y * R5); + target.w += multipShrd[i*13+11] * (3 * d.y * d.z * R5); + target.w += multipShrd[i*13+12] * (3 * d.z * d.x * R5); +} + +__global__ void kernel(int *offSrcGlob, float4 *sourceGlob, int *offMtpGlob, float *multipGlob, float4 *targetGlob) { + int N = offSrcGlob[blockIdx.x+1]-offSrcGlob[blockIdx.x]; + int offset = offSrcGlob[blockIdx.x]; + float3 d; + __shared__ float4 sourceShrd[THREADS]; + __shared__ float multipShrd[13*THREADS]; + float4 target = targetGlob[blockIdx.x * THREADS + threadIdx.x]; + target.w *= -rsqrtf(EPS2); + for( int iblok=0; iblok<(N-1)/THREADS; iblok++) { + __syncthreads(); + sourceShrd[threadIdx.x] = sourceGlob[offset + iblok * THREADS + threadIdx.x]; + __syncthreads(); + for( int i=0; inleaf = C->nchild = 0; + C->parent = NULL; + for( int i=0; i<8; i++ ) C->child[i] = NULL; + for( int i=0; i<10; i++ ) C->multipole[i] = 0; +} + +void add_child(int octant, cell *C, cell *&CN) { + ++CN; + initialize(CN); + CN->r = C->r/2; + CN->xc = C->xc + CN->r * ((octant & 1) * 2 - 1); + CN->yc = C->yc + CN->r * ((octant & 2) - 1 ); + CN->zc = C->zc + CN->r * ((octant & 4) / 2 - 1); + CN->parent = C; + C->child[octant] = CN; + C->nchild |= (1 << octant); +} + +void split_cell(float *x, float *y, float *z, cell *C, cell *&CN) { + for( int i=0; ileaf[i]; + int octant = (x[l] > C->xc) + ((y[l] > C->yc) << 1) + ((z[l] > C->zc) << 2); + if( !(C->nchild & (1 << octant)) ) add_child(octant,C,CN); + cell *CC = C->child[octant]; + CC->leaf[CC->nleaf++] = l; + if( CC->nleaf >= NCRIT ) split_cell(x,y,z,CC,CN); + } +} + +void getMultipole(cell *C, float *x, float *y, float *z, float *m, cell **twig, int &ntwig) { + float dx, dy, dz; + if( C->nleaf >= NCRIT ) { + for( int c=0; c<8; c++ ) + if( C->nchild & (1 << c) ) getMultipole(C->child[c],x,y,z,m,twig,ntwig); + } else { + for( int l=0; lnleaf; l++ ) { + int j = C->leaf[l]; + dx = C->xc - x[j]; + dy = C->yc - y[j]; + dz = C->zc - z[j]; + C->multipole[0] += m[j]; + C->multipole[1] += m[j] * dx; + C->multipole[2] += m[j] * dy; + C->multipole[3] += m[j] * dz; + C->multipole[4] += m[j] * dx * dx / 2; + C->multipole[5] += m[j] * dy * dy / 2; + C->multipole[6] += m[j] * dz * dz / 2; + C->multipole[7] += m[j] * dx * dy / 2; + C->multipole[8] += m[j] * dy * dz / 2; + C->multipole[9] += m[j] * dz * dx / 2; + } + twig[ntwig] = C; + ntwig++; + } +} + +void upwardSweep(cell *C, cell *P) { + float dx, dy, dz; + dx = P->xc - C->xc; + dy = P->yc - C->yc; + dz = P->zc - C->zc; + P->multipole[0] += C->multipole[0]; + P->multipole[1] += C->multipole[1] + dx*C->multipole[0]; + P->multipole[2] += C->multipole[2] + dy*C->multipole[0]; + P->multipole[3] += C->multipole[3] + dz*C->multipole[0]; + P->multipole[4] += C->multipole[4] + dx*C->multipole[1] + dx * dx * C->multipole[0] / 2; + P->multipole[5] += C->multipole[5] + dy*C->multipole[2] + dy * dy * C->multipole[0] / 2; + P->multipole[6] += C->multipole[6] + dz*C->multipole[3] + dz * dz * C->multipole[0] / 2; + P->multipole[7] += C->multipole[7] + (dx*C->multipole[2] + dy * C->multipole[1] + dx * dy * C->multipole[0]) / 2; + P->multipole[8] += C->multipole[8] + (dy*C->multipole[3] + dz * C->multipole[2] + dy * dz * C->multipole[0]) / 2; + P->multipole[9] += C->multipole[9] + (dz*C->multipole[1] + dx * C->multipole[3] + dz * dx * C->multipole[0]) / 2; +} + +void evaluate(cell *CI, cell *CJ, float *x, float *y, float *z, float *m, float *p, + int &offSrc, float4 *sourceHost, int &offMtp, float *multipHost) { + float dx, dy, dz, r; + if( CJ->nleaf >= NCRIT ) { + for( int c=0; c<8; c++ ) { + if( CJ->nchild & (1 << c) ) { + cell *CC = CJ->child[c]; + dx = CI->xc - CC->xc; + dy = CI->yc - CC->yc; + dz = CI->zc - CC->zc; + r = sqrtf(dx * dx + dy * dy + dz * dz); + if( CI->r + CC->r > THETA*r ) { + evaluate(CI,CC,x,y,z,m,p,offSrc,sourceHost,offMtp,multipHost); + } else { + multipHost[offMtp*13+ 0] = CC->xc; + multipHost[offMtp*13+ 1] = CC->yc; + multipHost[offMtp*13+ 2] = CC->zc; + for( int i=0; i<10; i++ ) + multipHost[offMtp*13+ i + 3] = CC->multipole[i]; + offMtp++; + } + } + } + } else { + for( int lj=0; ljnleaf; lj++ ) { + int j = CJ->leaf[lj]; + sourceHost[offSrc].x = x[j]; + sourceHost[offSrc].y = y[j]; + sourceHost[offSrc].z = z[j]; + sourceHost[offSrc].w = m[j]; + offSrc++; + } + } +} + +int main() { + int N = 50; + float x[N],y[N],z[N],m[N],p[N],pd[N]; +// Initialize + for( int i=0; ixc = C0->yc = C0->zc = C0->r = 0.5; +// Build tree + cell *CN = C0; + for( int i=0; inleaf >= NCRIT ) { + C->nleaf++; + int octant = (x[i] > C->xc) + ((y[i] > C->yc) << 1) + ((z[i] > C->zc) << 2); + if( !(C->nchild & (1 << octant)) ) add_child(octant,C,CN); + C = C->child[octant]; + } + C->leaf[C->nleaf++] = i; + if( C->nleaf >= NCRIT ) split_cell(x,y,z,C,CN); + } +// Multipole expansion + int ntwig=0; + cell *twig[N]; + getMultipole(C0,x,y,z,m,twig,ntwig); +// Upward translation + for( cell *C=CN; C!=C0; --C ) { + cell *P = C->parent; + upwardSweep(C,P); + } +// Evaluate expansion + int Nround = ntwig * THREADS; + int Nlist = ntwig * Nround; + int Mround = 13 * Nlist; + int *offSrcHost, *offSrcDevc; + int *offMtpHost, *offMtpDevc; + float4 *sourceHost, *sourceDevc; + float4 *targetHost, *targetDevc; + float *multipHost, *multipDevc; +// Allocate memory on host and device + offSrcHost = (int *) malloc( (ntwig+1)*sizeof(int) ); + offMtpHost = (int *) malloc( (ntwig+1)*sizeof(int) ); + sourceHost = (float4*) malloc( Nlist*sizeof(float4) ); + targetHost = (float4*) malloc( Nround*sizeof(float4) ); + multipHost = (float *) malloc( Mround*sizeof(float ) ); + cudaMalloc( (void**) &offSrcDevc, (ntwig+1)*sizeof(int) ); + cudaMalloc( (void**) &offMtpDevc, (ntwig+1)*sizeof(int) ); + cudaMalloc( (void**) &sourceDevc, Nlist*sizeof(float4) ); + cudaMalloc( (void**) &targetDevc, Nround*sizeof(float4) ); + cudaMalloc( (void**) &multipDevc, Mround*sizeof(float ) ); + for( int i=0; inleaf; l++ ) { + int i = CI->leaf[l]; + targetHost[t * THREADS + l].x = x[i]; + targetHost[t * THREADS + l].y = y[i]; + targetHost[t * THREADS + l].z = z[i]; + targetHost[t * THREADS + l].w = m[i]; + } + evaluate(CI,CJ,x,y,z,m,p,offSrc,sourceHost,offMtp,multipHost); + } + offSrcHost[ntwig] = offSrc; + offMtpHost[ntwig] = offMtp; +// Direct summation on device + cudaMemcpy(offSrcDevc,offSrcHost,(ntwig+1)*sizeof(int),cudaMemcpyHostToDevice); + cudaMemcpy(offMtpDevc,offMtpHost,(ntwig+1)*sizeof(int),cudaMemcpyHostToDevice); + cudaMemcpy(sourceDevc,sourceHost, Nlist*sizeof(float4),cudaMemcpyHostToDevice); + cudaMemcpy(multipDevc,multipHost, Nlist*sizeof(float ),cudaMemcpyHostToDevice); + cudaMemcpy(targetDevc,targetHost,Nround*sizeof(float4),cudaMemcpyHostToDevice); + kernel<<< Nround/THREADS, THREADS >>>(offSrcDevc,sourceDevc,offMtpDevc,multipDevc,targetDevc); + cudaMemcpy(targetHost,targetDevc,Nround*sizeof(float4),cudaMemcpyDeviceToHost); +// Compare results + float err=0, rel=0; + for( int t=0; tnleaf; l++ ) { + int i = CI->leaf[l]; + p[i] += targetHost[t * THREADS + l].w; + err += (pd[i] - p[i]) * (pd[i] - p[i]); + rel += pd[i] * pd[i]; + printf("%d %f %f\n",i,pd[i],p[i]); + } + } + printf("error : %f\n",sqrtf(err/rel)); +} -- GitLab