cudamemcpy运行速度很慢!
编写程序
内核函数为:
__global__ void decode_one_step(int* dev_pos,u8 *r)//8176,16个block,511个id;
{
int i= blockIdx.x*blockDim.x+threadIdx.x;
u8 tmp1,tmp2,tmp3,tmp4,tmp;
if(i<8176)
{
tmp1=0;
tmp2=0;
tmp3=0;
tmp4=0;
tmp=0;
for(int m=0;m<32;m++)
{
tmp1^=r[*(dev_pos+i*128+m)];
tmp2^=r[*(dev_pos+i*128+32+m)];
tmp3^=r[*(dev_pos+i*128+64+m)];
tmp4^=r[*(dev_pos+i*128+96+m)];
}
tmp=tmp1+tmp2+tmp3+tmp4;
if(tmp>=2)
{
r[i]^=1;
}
}
}
主函数部分为:
LARGE_INTEGER nFreq;
LARGE_INTEGER nBeginTime,nEndTime,mid1,mid2;
double time;
QueryPerformanceFrequency(&nFreq);
QueryPerformanceCounter(&nBeginTime);
#pragma omp parallel
{
cudaMemcpy( dev_a, a,cnt_len*8176*sizeof(u8),cudaMemcpyHostToDevice );
}
QueryPerformanceCounter(&mid1);
for(int tt=0;tt<cnt_len;tt++)
{
decode_one_step<<<16,511>>>(dev_b,(dev_a+tt*8176));
for(int ttt=0;ttt<8176;ttt++)
{
for(int jj=0;jj<32;jj++);
}
}
QueryPerformanceCounter(&mid2);
#pragma omp parallel
{
checkCudaErrors(cudaMemcpy(a,dev_a,sizeof(u8)*8176*cnt_len,cudaMemcpyDeviceToHost));
}
QueryPerformanceCounter(&nEndTime);
time=((double)(nEndTime.QuadPart-nBeginTime.QuadPart)/(double)nFreq.QuadPart);
double time1,time2,time3;
time1=((double)(mid1.QuadPart-nBeginTime.QuadPart)/(double)nFreq.QuadPart);
time2=((double)(mid2.QuadPart-mid1.QuadPart)/(double)nFreq.QuadPart);
time3=((double)(nEndTime.QuadPart-mid2.QuadPart)/(double)nFreq.QuadPart);
cout<<"time is "<<time*1000<<"ms "<<endl;
cout<<"the decode speed is "<<0.8176/time2<<"Mb/s"<<endl;
cout<<"拷贝数据时间:"<<time1*1000<<"ms\n"<<"程序时间"<<time2*1000<<"ms\n"<<"拷贝回来时间"<<time3*1000<<"ms"<<endl;
为什么拷贝回来的时间,也就是cudaMemcpy从设备拷贝回主机的时间特别长。