CUDA program: writing to global memory takes most of the run time?

xiao503 2013-05-15 11:19:51
I recently ran into a puzzling problem while writing CUDA code: writing data to global memory appears to be very slow. Here is the code:

The kernel, with the write to global memory:

__global__ void computeDistanceKernel(double * d_dist, double * d_ref, double * d_query, int width, int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height)
    {
        double query = d_query[y];
        double dist = 0;

        // Sum of absolute differences between query and column x of d_ref.
        for (int i = 0; i < height; i++)
        {
            double ref = d_ref[i*width+x];
            dist = dist + fabs(query - ref);   // fabs: plain abs can resolve to the integer overload
        }
        __syncthreads();   // no shared memory is used here, so this barrier is not needed

        d_dist[y*width+x] = dist;
    }
}


Run time: 373.97 ms.

With the write to global memory commented out:

__global__ void computeDistanceKernel(double * d_dist, double * d_ref, double * d_query, int width, int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height)
    {
        double query = d_query[y];
        double dist = 0;

        for (int i = 0; i < height; i++)
        {
            double ref = d_ref[i*width+x];
            dist = dist + fabs(query - ref);
        }
        __syncthreads();

        //d_dist[y*width+x] = dist;
    }
}


Run time: 0.76 ms.

The main function is the same in both cases:
int main()
{
    int width = 10000;
    int height = 400;
    double * h_ref = new double[width*height];
    double * h_query = new double[height];

    for (int i = 0; i < width*height; i++)
    {
        h_ref[i] = rand() % 10;
    }

    for (int i = 0; i < height; i++)
    {
        h_query[i] = rand() % 10;
    }

    double *d_ref, *d_query;
    double *d_dist, *d_dist2;

    cutilSafeCall( cudaMalloc( (void **) &d_ref, width*height*sizeof(double)) );
    cutilSafeCall( cudaMemcpy(d_ref, h_ref, width*height*sizeof(double), cudaMemcpyHostToDevice) );
    cutilSafeCall( cudaMalloc( (void **) &d_query, height*sizeof(double)) );
    cutilSafeCall( cudaMemcpy(d_query, h_query, height*sizeof(double), cudaMemcpyHostToDevice) );
    cutilSafeCall( cudaMalloc( (void **) &d_dist, width*height*sizeof(double)));
    cutilSafeCall( cudaMalloc( (void **) &d_dist2, width*sizeof(double)));

    // start timer
    unsigned int timer = 0;
    cutilCheckError( cutCreateTimer( &timer));
    cutilCheckError( cutStartTimer( timer));

    // round the grid up so partial blocks cover the edges
    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid(width/16, height/16, 1);
    if (width%16 != 0) dimGrid.x += 1;
    if (height%16 != 0) dimGrid.y += 1;
    computeDistanceKernel<<<dimGrid, dimBlock>>>(d_dist, d_ref, d_query, width, height);
    cutilSafeCall(cudaThreadSynchronize());

    // stop timer
    cutilCheckError( cutStopTimer( timer));
    printf("computeDifferenceKernel Processing time: %f (ms)\n", cutGetTimerValue( timer));
    cutilCheckError( cutDeleteTimer( timer));

    cutilSafeCall(cudaFree(d_dist));
    cutilSafeCall(cudaFree(d_ref));
    cutilSafeCall(cudaFree(d_query));
}


The write to global memory seems to account for most of the run time. Is my code poorly optimized, or is writing to global memory really this slow? Any advice from the forum would be appreciated.

3 replies

yjoe61 2013-07-05
fp64 is slow; if you don't need 64-bit floating point, try switching the kernel to float.
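
A minimal sketch of that suggestion, assuming the application can actually live with single precision (this is an illustrative rewrite, not code from the thread; the host buffers and cudaMemcpy sizes would have to switch to float as well):

__global__ void computeDistanceKernelF(float * d_dist, const float * d_ref, const float * d_query, int width, int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height)
    {
        float query = d_query[y];
        float dist = 0.0f;

        // Same column-wise sum of absolute differences, in single precision.
        for (int i = 0; i < height; i++)
            dist += fabsf(query - d_ref[i*width+x]);

        d_dist[y*width+x] = dist;
    }
}

On GeForce cards of that generation, double-precision throughput is typically only a small fraction of single precision, and the float version also halves the memory traffic, so this change alone can shift the timing substantially.
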
ukyolei 2013-06-27
If you don't write the result back, the compiler optimizes it away. I've run this kind of experiment myself.
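
One way to sanity-check this explanation (my own sketch, not code from the thread) is to keep the computation but guard the store with a condition that never holds for real input; the compiler then has to keep the loop, yet essentially no global-memory writes are issued:

__global__ void computeDistanceNoStore(double * d_dist, const double * d_ref, const double * d_query, int width, int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height)
    {
        double query = d_query[y];
        double dist = 0;

        for (int i = 0; i < height; i++)
            dist += fabs(query - d_ref[i*width+x]);

        // A sum of absolute values is never negative, so this store never fires,
        // but the compiler cannot discard a value the branch might still need.
        if (dist < 0.0)
            d_dist[y*width+x] = dist;
    }
}

If this version still takes hundreds of milliseconds, the time was going into the loop and the d_ref reads all along, not into the single write per thread.
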
linxxx3 2013-05-16
In the second experiment, commenting out the write-back means the kernel never uses the dist it just computed, so the compiler very likely optimizes the whole kernel into an empty one; give the compiler some credit for being smart. A comparison like that therefore tells you nothing. The original code probably does have room for optimization; for how, look at the material on coalesced memory access in the books or the programming guide.
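
For reference, the d_ref reads in the posted kernel already appear to be coalesced (consecutive threadIdx.x values touch consecutive addresses), but every row index y re-reads the same d_ref column from global memory. A sketch of one optimization in the spirit of this reply, caching 16x16 tiles of d_ref in shared memory so the 16 rows of a block share each load (it assumes the 16x16 block size from the poster's main and is illustrative only, not tested):

#define TILE 16   // matches the 16x16 thread block used in main

__global__ void computeDistanceTiled(double * d_dist, const double * d_ref, const double * d_query, int width, int height)
{
    __shared__ double tile[TILE][TILE];

    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    double query = (y < height) ? d_query[y] : 0.0;
    double dist = 0.0;

    // Walk down d_ref in 16-row tiles: the block loads each tile cooperatively
    // once, then all 16 of its rows reuse it from shared memory.
    for (int base = 0; base < height; base += TILE)
    {
        int i = base + threadIdx.y;
        tile[threadIdx.y][threadIdx.x] =
            (i < height && x < width) ? d_ref[i*width+x] : 0.0;
        __syncthreads();

        if (x < width && y < height)
        {
            int rows = min(TILE, height - base);
            for (int k = 0; k < rows; k++)
                dist += fabs(query - tile[k][threadIdx.x]);
        }
        __syncthreads();
    }

    if (x < width && y < height)
        d_dist[y*width+x] = dist;
}

Whether this pays off depends on how much of the redundant traffic the cache was already absorbing, so it is worth profiling against the original rather than assuming a win.
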
