求助OPENMP效率问题
这个函数是molecular dynamics 中计算vdw和electrostatic能量的。有谁知道下面这个function,在使用icpc -openmp编译后,多线程运行达不到理想的效率?
4 core 提高两倍,16core 大概提高5倍。而按我的理解,multiprocess 编程基本上多少core就代表多少scale。因为它不存在像mpi那种数据传输的问题。
double CmdCore_NonBondTerms::CalcForce_nbonds(double *x, double *y, double *z, double *q, double *dx, double *dy, double *dz, double *eele, double *evdw, int *listR)
{
*eele=*evdw=0;
if(m_nsize_nonbondsdata<1) return 0;
register double ee=0,ev=0;
#ifdef ENABLE_FAST_ERFC
register double bt;
register double P1= 1.26551223;
register double P2= 1.00002368;
register double P3= 0.37409196;
register double P4= 0.09678418;
register double P5=-0.18628806;
register double P6= 0.27886807;
register double P7=-1.13520398;
register double P8= 1.48851587;
register double P9=-0.82215223;
register double P0= 0.17087277;
register double betar2;
#endif
register int i,j,k,ii,jj,t;
register double xx,yy,zz,s,r,r1,r2,r3,r6,df,ae,be,qq;
register double twel=12.0, six=6.0;
register double esf=m_14fac*m_eps*MCELEC; //e14
register double beta=m_beta,betar;
register double ferfc=0;
register double betatwo_sqrtpi=beta*2/sqrt(PI);
register double dnb=m_cutnb;
register double dsw=m_cutsw;
register double dnb2=dnb*dnb, dsw2=dsw*dsw;
register double dnb3=dnb2*dnb,dsw3=dsw2*dsw;
register double e3=1.0/dnb3,e6=e3*e3;
register double dnb6=dnb3*dnb3,dsw6=dsw3*dsw3;
register double dsa=dnb6/(dnb6-dsw6), dsb=dnb3/(dnb3-dsw3);
register double dss6=1.0/(dnb3*dsw3),dss12=1.0/(dnb6*dsw6);
register double rs3,rs6,a,b;
register int natoms=m_natoms;
register int *p;
register int *ptyp=m_atomsvdwt;
register tmdcore_nbondtable *ptab=m_nonbonds;
register tmdcore_type_vdwp *ptvdw=m_types_vdwp;
register double x1,y1,z1,dx1,dy1,dz1,q1;
#ifdef ENABLE_VDWFSWITH_MEMORY_OPTIMIZATION
for(i=0;i<m_ntypes_vdwp;i++){
m_types_vdwp[i].ad=m_types_vdwp[i].a*dsa;
m_types_vdwp[i].bd=m_types_vdwp[i].b*dsb;
}
#endif
#ifdef ENABLE_OPENMP
int inode=Computility::GetNodeID();
register int mm;
#pragma omp parallel default(shared) \
private(mm,i,x1,y1,z1,dx1,dy1,dz1,ii,p,q1,k,j,jj,t,xx,yy,zz,s,r,r1,r2,r3,r6,a,b,ae,be,df,betar,\
ferfc,qq)
{
#pragma omp for
for(mm=0;mm<natoms;mm++){
i=listR[mm];
#elif defined ENABLE_MPI_PARALLEL
int istart=0,iend=natoms;
if(CmpiUtility::IsParallel()){
int nsss=natoms/CmpiUtility::GetNumNodes()+1;
istart=CmpiUtility::GetNodeID()*nsss;
iend=istart+nsss;
if(iend>natoms) iend=natoms;
}
for(i=istart;i<iend;i++){ //here should be identical to BuildNonbondTable()
#else
for(i=0;i<natoms;i++){
#endif
x1=x[i];y1=y[i];z1=z[i]; dx1=dy1=dz1=0;
ii=ptyp[i];
#ifdef ENABLE_OPENMP
p=&m_nonbondsdata[inode][ptab[i].p]; //
#else
p=&m_nonbondsdata[ptab[i].p];
#endif
q1=q[i]*esf;
for(k=0;k<ptab[i].np;k++){
j=p[k];
jj=ptyp[j];
if(jj>ii) t= jj*(jj+1)/2+ii;
else t= ii*(ii+1)/2+jj;
xx=x[j]-x1; //if(fabs(xx)>dnb) continue;
yy=y[j]-y1; //if(fabs(yy)>dnb) continue;
zz=z[j]-z1; //if(fabs(zz)>dnb) continue;
s=xx*xx+yy*yy+zz*zz; //if(s>dnb2) continue;
//s+=zz*zz;
if(s<dnb2){
r=sqrt(s);
r1=1.0/r;
r2=r1*r1;
r3=r2*r1;
r6=r3*r3;
#ifndef ENABLE_VDWFSWITH_MEMORY_OPTIMIZATION
a=ptvdw[t].a;
b=ptvdw[t].b;
#endif
//vdw
if(s>dsw2){ // ** a*da*(r6-e6)^2-b*db*(r3-e3)^2 **
#ifdef ENABLE_VDWFSWITH_MEMORY_OPTIMIZATION
a=ptvdw[t].ad;
b=ptvdw[t].bd;
#else
a*=dsa;
b*=dsb;
#endif
rs3=r3-e3;
rs6=r6-e6;
ae= a*rs6;
be= b*rs3;
ev+=ae*rs6 - be*rs3;
df=r1*(-ae*twel*r6 + be*r3*six);
}
else{ // ** a/r12-b/r6+b*o6-a*o12 **
#ifdef ENABLE_VDWFSWITH_MEMORY_OPTIMIZATION
a=ptvdw[t].a;
b=ptvdw[t].b;
#endif
ae= a*r6*r6;
be= b*r6;
ev+=ae - be + dss6*b-dss12*a;
df= r1*(-ae*twel+be*six);
}
//elec
betar=beta*r;
#ifdef ENABLE_FAST_ERFC
bt=1/(1+0.5*betar);
betar2=betar*betar;
ferfc=(bt*exp(-betar2-P1+bt*(P2+bt*(P3+bt*(P4+
bt*(P5+bt*(P6+bt*(P7+bt*(P8+
bt*(P9+bt*P0))))))))));
ferfc=fast_erfc(betar);
#else
#ifdef __MSWIN
ferfc=fast_erfc(betar);
#else
ferfc=erfc(betar);
#endif
#endif
qq=q1*q[j]*r1;
#ifdef ENABLE_FAST_ERFC
df-=qq*(ferfc*r1+betatwo_sqrtpi*exp(-betar2)); //derfc=-two_sqrtpi*exp(-z*z)
#else
df-=qq*(ferfc*r1+betatwo_sqrtpi*exp(-betar*betar)); //derfc=-two_sqrtpi*exp(-z*z)
#endif
ee+=qq*ferfc;
//LOG_PR("%10d%6d%6d%10.5f%10.5f%10.5f%10.5f%10.5f\n",i*natoms+j,i,j,ptvdw[t].Eps,ptvdw[t].r2,r,eev,qq*ferfc);
df*=r1; //
xx*=df;
yy*=df;
zz*=df;
dx1-=xx;
dy1-=yy;
dz1-=zz;
dx[j]+=xx;
dy[j]+=yy;
dz[j]+=zz;
}// if(s<dnb2)
}// for(j=0;j<ptab[i].np;j++)
dx[i]+=dx1;
dy[i]+=dy1;
dz[i]+=dz1;
}
#ifdef ENABLE_OPENMP
}
#endif
if(eele!=NULL) *eele=ee;
if(evdw!=NULL) *evdw=ev;
return (ee+ev);
}