请高人指点:如何用 OpenMP 并行化来优化下面这个函数?
/*
 * Get_Ftr - extract direction features from the contour of a 64x64 image.
 *
 * pdata: source image, addressed as 4096 bytes in rows of 64. The contour
 *        test compares a sum of four neighbours against 4, so pixels are
 *        presumably 0/1 -- TODO confirm against the caller.
 * ftr:   feature buffer; Get_DEF_Dot_ftr() is invoked on ftr + block for
 *        block offsets in 0..48. What that helper writes is defined
 *        elsewhere.
 *
 * Step 1 builds a contour map: a pixel is a contour point when it is set,
 * not all four of its edge neighbours are set, and at least one of its eight
 * neighbours is set.
 * Step 2 visits every interior contour point (rows/columns 1..62), looks up
 * its direction in Dot_Orientation[] from the 8-neighbour bit pattern, and
 * reports that direction to up to four blocks selected from the point's
 * 8-pixel cell coordinates.
 *
 * The contour map lives in a fixed-size local array; the previous heap
 * buffer was never freed and leaked 4096 bytes per call.
 */
void Get_Ftr(BYTE* pdata,BYTE* ftr)
{
    BYTE contour[4096];
    int i, j;

    /* step 1: mark contour points */
    memset(contour, 0, sizeof contour);
    for (i = 0; i < 4096; i++)
    {
        const BYTE *src = pdata + i;

        /*
         * NOTE(review): for non-zero pixels on the image border the
         * neighbour reads below reach up to 65 bytes before pdata[0] or
         * after pdata[4095]. This is only valid if the caller's buffer is
         * padded -- confirm, or clear the border before calling.
         */
        if (src[0]
            && (src[-64] + src[-1] + src[64] + src[1]) < 4
            && (src[-64] + src[-63] + src[-65] + src[1] + src[-1]
                + src[64] + src[63] + src[65]) > 0)
        {
            contour[i] = 1;
        }
    }

    /* step 2: classify each interior contour point and report it */
    for (i = 1; i < 63; i++)
    {
        const BYTE i_3 = (BYTE)(i >> 3);
        const SHORT xcenter = (SHORT)(i / 8 * 8 - 1);

        for (j = 1; j < 63; j++)
        {
            const BYTE *c = contour + i * 64 + j;
            BYTE eight_neighbor;
            BYTE direction;
            BYTE j_3;
            BYTE tmpblock;
            SHORT ycenter;
            int x_lo_ok, x_hi_ok, y_lo_ok, y_hi_ok;

            if (!c[0])
                continue;

            /* pack the eight neighbours into one byte, one bit each */
            eight_neighbor = (BYTE)((c[-63] << 7) + (c[-64] << 6) + (c[-65] << 5)
                                    + (c[-1] << 4) + (c[63] << 3) + (c[64] << 2)
                                    + (c[65] << 1) + c[1]);
            direction = Dot_Orientation[eight_neighbor];
            if (!direction)
                continue;

            j_3 = (BYTE)(j >> 3);
            ycenter = (SHORT)(((j >> 3) << 3) - 1);
            tmpblock = (BYTE)(7 * j_3 + i_3);

            /*
             * xcenter/ycenter are -1 in the first 8-pixel cell and 8k-1
             * afterwards, so "> 0" rejects the first cell and "+ 8 < 56"
             * rejects the last one.
             */
            x_lo_ok = xcenter > 0;
            x_hi_ok = xcenter + 8 < 56;
            y_lo_ok = ycenter > 0;
            y_hi_ok = ycenter + 8 < 56;

            if (x_lo_ok && y_lo_ok)
                Get_DEF_Dot_ftr(direction, ftr + (tmpblock - 8));
            if (y_lo_ok && x_hi_ok)
                Get_DEF_Dot_ftr(direction, ftr + (tmpblock - 7));
            if (x_lo_ok && y_hi_ok)
                Get_DEF_Dot_ftr(direction, ftr + (tmpblock - 1));
            if (x_hi_ok && y_hi_ok)
                Get_DEF_Dot_ftr(direction, ftr + tmpblock);
        }
    }
}
这个函数里有两个主要耗时的循环:一个是 for( i=4095;i>=0;i--,px++,py++),另一个是 63*63 的嵌套循环(i、j 都从 1 取到 62,实际执行 62×62 次)。
对第一个循环,我用 OpenMP 这样并行:
#pragma omp parallel for
for(j=1;j>=0;j--)
{
for( i=2047;i>=0;i--)
{
px++;py++;
if( px[0]&&(px[-64]+px[-1]+px[64]+px[1])<4&&(px[-64]+px[-63]+px[-65]+px[1]+px[-1]+px[64]+px[63]+px[65])>0)
py[0]=1;
}
}
结果不仅没有变快,反而慢了很多!
对 63*63(实际为 62×62)的嵌套循环,我把它拆成两个 section 来做,结果不仅更慢,计算结果也不对了:
#pragma omp parallel sections firstprivate(ftr) lastprivate(ftr)
{
#pragma omp section
{
for(int i=1;i<32;i++)
{
px++;
i_3=i>>3;
tmpxCenter = i/8*8-1;
for( j=1;j<63;j++,px++)
{
if(px[0])
{
eight_neighbor=(px[-63]<<7)+(px[-64]<<6)+(px[-65]<<5)+(px[-1]<<4)+(px[63]<<3)+(px[64]<<2)+(px[65]<<1)+px[1];
direction=Dot_Orientation[eight_neighbor];
if(!direction)
continue;
block=0;
xcenter=tmpxCenter;
ycenter=((j>>3)<<3)-1;//j/8*8-1;
j_3=j>>3;
tmpblock = 7*j_3+i_3;
if(xcenter>0&&ycenter>0)
{
block =tmpblock-8;
Get_DEF_Dot_ftr(direction,ftr+block);
}
xcenter+=8;
if(ycenter>0&&xcenter<56&&xcenter>0)
{
block =tmpblock-7;
Get_DEF_Dot_ftr(direction,ftr+block);
}
xcenter-=8;
ycenter+=8;
if(xcenter>0&&ycenter<56&&ycenter>0)
{
block =tmpblock-1;
Get_DEF_Dot_ftr(direction,ftr+block);
}
xcenter+=8;
if(xcenter<56&&ycenter<56&&xcenter>0&&ycenter>0)
{
block =tmpblock;
Get_DEF_Dot_ftr(direction,ftr+block);
}
}
}
px++;
}
}
#pragma omp section
{
px = px + 2048;
for(int i=32;i<63;i++)
{
px++;
i_3=i>>3;
tmpxCenter = i/8*8-1;
for( j=1;j<63;j++,px++)
{
if(px[0])
{
eight_neighbor=(px[-63]<<7)+(px[-64]<<6)+(px[-65]<<5)+(px[-1]<<4)+(px[63]<<3)+(px[64]<<2)+(px[65]<<1)+px[1];
direction=Dot_Orientation[eight_neighbor];
if(!direction)
continue;
block=0;
xcenter=tmpxCenter;
ycenter=((j>>3)<<3)-1;//j/8*8-1;
j_3=j>>3;
tmpblock = 7*j_3+i_3;
if(xcenter>0&&ycenter>0)
{
block =tmpblock-8;
Get_DEF_Dot_ftr(direction,ftr+block);
}
xcenter+=8;
if(ycenter>0&&xcenter<56&&xcenter>0)
{
block =tmpblock-7;
Get_DEF_Dot_ftr(direction,ftr+block);
}
xcenter-=8;
ycenter+=8;
if(xcenter>0&&ycenter<56&&ycenter>0)
{
block =tmpblock-1;
Get_DEF_Dot_ftr(direction,ftr+block);
}
xcenter+=8;
if(xcenter<56&&ycenter<56&&xcenter>0&&ycenter>0)
{
block =tmpblock;
Get_DEF_Dot_ftr(direction,ftr+block);
}
}
}
px++;
}
}
}
这样也慢了很多!我实在不知道该怎么并行,对 OpenMP 还很迷糊,希望高人指点一下。