CUDA program: writing to global memory takes most of the run time?

xiao503 2013-05-15 11:19:51
I recently ran into a puzzling problem while writing CUDA code: writing data to global memory appears to be very slow. Here is the code:

The kernel, with the write to global memory:

__global__ void computeDistanceKernel(double * d_dist, double * d_ref, double * d_query, int width, int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height)
    {
        double query = d_query[y];
        double dist = 0;

        // Sum of absolute differences between query and column x of d_ref.
        for (int i = 0; i < height; i++)
        {
            double ref = d_ref[i*width+x];
            dist = dist + fabs(query - ref);   // fabs: plain abs can resolve to the integer overload
        }
        __syncthreads();   // no shared memory is used here, so this barrier is not needed

        d_dist[y*width+x] = dist;
    }
}


Run time: 373.97 ms.

With the write to global memory commented out:

__global__ void computeDistanceKernel(double * d_dist, double * d_ref, double * d_query, int width, int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height)
    {
        double query = d_query[y];
        double dist = 0;

        for (int i = 0; i < height; i++)
        {
            double ref = d_ref[i*width+x];
            dist = dist + fabs(query - ref);
        }
        __syncthreads();

        //d_dist[y*width+x] = dist;
    }
}


Run time: 0.76 ms.

The main function is the same in both cases:
int main()
{
    int width = 10000;
    int height = 400;
    double * h_ref = new double[width*height];
    double * h_query = new double[height];

    for (int i = 0; i < width*height; i++)
    {
        h_ref[i] = rand() % 10;
    }

    for (int i = 0; i < height; i++)
    {
        h_query[i] = rand() % 10;
    }

    double *d_ref, *d_query;
    double *d_dist, *d_dist2;

    cutilSafeCall( cudaMalloc( (void **) &d_ref, width*height*sizeof(double)) );
    cutilSafeCall( cudaMemcpy(d_ref, h_ref, width*height*sizeof(double), cudaMemcpyHostToDevice) );
    cutilSafeCall( cudaMalloc( (void **) &d_query, height*sizeof(double)) );
    cutilSafeCall( cudaMemcpy(d_query, h_query, height*sizeof(double), cudaMemcpyHostToDevice) );
    cutilSafeCall( cudaMalloc( (void **) &d_dist, width*height*sizeof(double)));
    cutilSafeCall( cudaMalloc( (void **) &d_dist2, width*sizeof(double)));

    // start timer
    unsigned int timer = 0;
    cutilCheckError( cutCreateTimer( &timer));
    cutilCheckError( cutStartTimer( timer));

    // round the grid up so partial blocks cover the edges
    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid(width/16, height/16, 1);
    if (width%16 != 0) dimGrid.x += 1;
    if (height%16 != 0) dimGrid.y += 1;
    computeDistanceKernel<<<dimGrid, dimBlock>>>(d_dist, d_ref, d_query, width, height);
    cutilSafeCall(cudaThreadSynchronize());

    // stop timer
    cutilCheckError( cutStopTimer( timer));
    printf("computeDifferenceKernel Processing time: %f (ms)\n", cutGetTimerValue( timer));
    cutilCheckError( cutDeleteTimer( timer));

    cutilSafeCall(cudaFree(d_dist));
    cutilSafeCall(cudaFree(d_ref));
    cutilSafeCall(cudaFree(d_query));
}


The write to global memory seems to account for most of the run time. Is my code poorly optimized, or is writing to global memory really this slow? Any advice from the forum would be appreciated.

3 replies

yjoe61 2013-07-05
fp64 is slow; if you don't need 64-bit floating point, try switching the kernel to float.
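
A minimal sketch of that suggestion, assuming the application can actually live with single precision (this is an illustrative rewrite, not code from the thread; the host buffers and cudaMemcpy sizes would have to switch to float as well):

__global__ void computeDistanceKernelF(float * d_dist, const float * d_ref, const float * d_query, int width, int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height)
    {
        float query = d_query[y];
        float dist = 0.0f;

        // Same column-wise sum of absolute differences, in single precision.
        for (int i = 0; i < height; i++)
            dist += fabsf(query - d_ref[i*width+x]);

        d_dist[y*width+x] = dist;
    }
}

On GeForce cards of that generation, double-precision throughput is typically only a small fraction of single precision, and the float version also halves the memory traffic, so this change alone can shift the timing substantially.
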
ukyolei 2013-06-27
If you don't write the result back, the compiler optimizes it away. I've run this kind of experiment myself.
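
One way to sanity-check this explanation (my own sketch, not code from the thread) is to keep the computation but guard the store with a condition that never holds for real input; the compiler then has to keep the loop, yet essentially no global-memory writes are issued:

__global__ void computeDistanceNoStore(double * d_dist, const double * d_ref, const double * d_query, int width, int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height)
    {
        double query = d_query[y];
        double dist = 0;

        for (int i = 0; i < height; i++)
            dist += fabs(query - d_ref[i*width+x]);

        // A sum of absolute values is never negative, so this store never fires,
        // but the compiler cannot discard a value the branch might still need.
        if (dist < 0.0)
            d_dist[y*width+x] = dist;
    }
}

If this version still takes hundreds of milliseconds, the time was going into the loop and the d_ref reads all along, not into the single write per thread.
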
linxxx3 2013-05-16
In the second experiment, commenting out the write-back means the kernel never uses the dist it just computed, so the compiler very likely optimizes the whole kernel into an empty one; give the compiler some credit for being smart. A comparison like that therefore tells you nothing. The original code probably does have room for optimization; for how, look at the material on coalesced memory access in the books or the programming guide.
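
For reference, the d_ref reads in the posted kernel already appear to be coalesced (consecutive threadIdx.x values touch consecutive addresses), but every row index y re-reads the same d_ref column from global memory. A sketch of one optimization in the spirit of this reply, caching 16x16 tiles of d_ref in shared memory so the 16 rows of a block share each load (it assumes the 16x16 block size from the poster's main and is illustrative only, not tested):

#define TILE 16   // matches the 16x16 thread block used in main

__global__ void computeDistanceTiled(double * d_dist, const double * d_ref, const double * d_query, int width, int height)
{
    __shared__ double tile[TILE][TILE];

    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    double query = (y < height) ? d_query[y] : 0.0;
    double dist = 0.0;

    // Walk down d_ref in 16-row tiles: the block loads each tile cooperatively
    // once, then all 16 of its rows reuse it from shared memory.
    for (int base = 0; base < height; base += TILE)
    {
        int i = base + threadIdx.y;
        tile[threadIdx.y][threadIdx.x] =
            (i < height && x < width) ? d_ref[i*width+x] : 0.0;
        __syncthreads();

        if (x < width && y < height)
        {
            int rows = min(TILE, height - base);
            for (int k = 0; k < rows; k++)
                dist += fabs(query - tile[k][threadIdx.x]);
        }
        __syncthreads();
    }

    if (x < width && y < height)
        d_dist[y*width+x] = dist;
}

Whether this pays off depends on how much of the redundant traffic the cache was already absorbing, so it is worth profiling against the original rather than assuming a win.
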
