CUDA运行时间问题
写了个求解线性方程组的程序,相比串行程序,性能不敢恭维。于是把各个步骤拆分开,看各个步骤的加速比
其中一个步骤是针对Ax=r,A为对角矩阵的问题,实际上就是求解x[i]=r[i]/a[i]
代码如下:
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>   /* clock(), clock_t, CLOCKS_PER_SEC — used for CPU timing below */
#define DataType float
#define Thread_num_def 512
#define Block_num_def 20480
#define Matrix_Dim Thread_num_def*Block_num_def
// includes, project
#include <cutil_inline.h>
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void Init_data(DataType *a, DataType *b, DataType *c, DataType* r, DataType* x, int n);
void Wang_Algorithm_test(DataType *a, DataType *b, DataType *c, DataType* r, DataType* x, int n);
void test_step4(DataType *a,DataType *r,DataType *x,int n);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
    /* Host-side arrays of the tridiagonal system: a (diagonal), b/c
     * (off-diagonals), r (right-hand side), x (solution).
     * Bug fix: the original allocated Matrix_Dim*sizeof(DataType*) —
     * the size of a POINTER per element. The element type is DataType,
     * so sizeof(DataType) is the correct per-element size. */
    DataType *a = (DataType *)malloc(Matrix_Dim*sizeof(DataType));
    DataType *b = (DataType *)malloc(Matrix_Dim*sizeof(DataType));
    DataType *c = (DataType *)malloc(Matrix_Dim*sizeof(DataType));
    DataType *r = (DataType *)malloc(Matrix_Dim*sizeof(DataType));
    DataType *x = (DataType *)malloc(Matrix_Dim*sizeof(DataType));
    /* Matrix_Dim elements x 5 arrays is ~200 MB of host memory — check
     * the allocations before touching them. free(NULL) is a no-op, so a
     * partial failure is cleaned up safely. */
    if (!a || !b || !c || !r || !x)
    {
        fprintf(stderr, "Host allocation of %d elements per array failed\n", Matrix_Dim);
        free(a); free(b); free(c); free(r); free(x);
        return EXIT_FAILURE;
    }
    Init_data(a, b, c, r, x, Matrix_Dim);
    Wang_Algorithm_test(a, b, c, r, x, Matrix_Dim);
    free(a);
    free(b);
    free(c);
    free(r);
    free(x);
    cutilExit(argc, argv);   /* cutil helper: waits for a keypress / exits */
    return 0;
}
/*
 * Fill the size-n test system: a is the main diagonal (all 2), b and c are
 * the off-diagonals (all 1), r is the right-hand side and x is the solution
 * vector (zeroed). The boundary rows are then adjusted (one off-diagonal
 * entry zeroed, r reduced to 3) so that each row of the tridiagonal system
 * sums to its r entry — presumably chosen so the exact solution is x == 1;
 * confirm against the full solver.
 * Does nothing for n <= 0 (the original wrote r[0]/r[n-1] out of bounds).
 */
void Init_data(DataType *a, DataType *b, DataType *c, DataType* r, DataType* x, int n)
{
    if (n <= 0)
        return;
    for (int i = 0; i < n; i++)
    {
        /* Float-suffixed literals: DataType is float, so plain 2.0 etc.
         * would be double constants converted on every store. */
        a[i] = 2.0f;
        b[i] = 1.0f;
        c[i] = 1.0f;
        x[i] = 0.0f;
        r[i] = 4.0f;
    }
    /* First and last rows have only one neighbour. */
    r[0]   = 3.0f;
    r[n-1] = 3.0f;
    b[n-1] = 0.0f;
    c[0]   = 0.0f;
}
/* CPU reference for solver step 4: element-wise division of the right-hand
 * side by the matrix diagonal, x[k] = r[k] / a[k] for k in [0, n). */
void test_step4(DataType *a,DataType *r,DataType *x,int n)
{
    int k = 0;
    while (k < n)
    {
        x[k] = r[k] / a[k];
        ++k;
    }
}
/* Abort with a diagnostic when a CUDA runtime call fails. Kernel launches
 * return no status themselves; launch errors are collected afterwards via
 * cudaGetLastError(). The original ignored every cudaError_t, so any
 * failure (bad config, OOM) was silent and the printed timings meaningless. */
static void Check_cuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}
/*
 * Benchmark step 4 of the solver (x[i] = r[i] / a[i]) on GPU vs CPU.
 * Uploads a/b/c/r to the device, runs Wang_kernel_step4 five times between
 * two CUDA events for the GPU time, runs the CPU reference five times under
 * clock() for the CPU time, prints both, then copies the device result back
 * into x (overwriting the CPU result).
 * NOTE(review): b and c are uploaded but unused by this step — presumably
 * kept for the other solver steps; confirm before removing.
 */
void Wang_Algorithm_test(DataType *a, DataType *b, DataType *c, DataType* r, DataType* x, int n)
{
    DataType *Device_a, *Device_b, *Device_c, *Device_x, *Device_r;
    Check_cuda(cudaMalloc ((void**)&Device_a, n*sizeof(DataType)), "cudaMalloc Device_a");
    Check_cuda(cudaMalloc ((void**)&Device_b, n*sizeof(DataType)), "cudaMalloc Device_b");
    Check_cuda(cudaMalloc ((void**)&Device_c, n*sizeof(DataType)), "cudaMalloc Device_c");
    Check_cuda(cudaMalloc ((void**)&Device_r, n*sizeof(DataType)), "cudaMalloc Device_r");
    Check_cuda(cudaMalloc ((void**)&Device_x, n*sizeof(DataType)), "cudaMalloc Device_x");
    Check_cuda(cudaMemcpy(Device_a,a,n*sizeof(DataType),cudaMemcpyHostToDevice), "copy a to device");
    Check_cuda(cudaMemcpy(Device_b,b,n*sizeof(DataType),cudaMemcpyHostToDevice), "copy b to device");
    Check_cuda(cudaMemcpy(Device_c,c,n*sizeof(DataType),cudaMemcpyHostToDevice), "copy c to device");
    Check_cuda(cudaMemcpy(Device_r,r,n*sizeof(DataType),cudaMemcpyHostToDevice), "copy r to device");
    cudaEvent_t start, stop;
    float Gpu_time,Cpu_time;
    clock_t timestart,timeend;
    Check_cuda(cudaEventCreate(&start), "cudaEventCreate start");
    Check_cuda(cudaEventCreate(&stop), "cudaEventCreate stop");
    /* GPU timing: five identical launches between the two events; the
     * elapsed time therefore covers five kernel runs, matching the five
     * CPU iterations below. Transfers are deliberately excluded. */
    Check_cuda(cudaEventRecord(start,0), "cudaEventRecord start");
    Wang_kernel_step4<<<Block_num_def, Thread_num_def>>>(Device_a, Device_r, Device_x,n);
    Wang_kernel_step4<<<Block_num_def, Thread_num_def>>>(Device_a, Device_r, Device_x,n);
    Wang_kernel_step4<<<Block_num_def, Thread_num_def>>>(Device_a, Device_r, Device_x,n);
    Wang_kernel_step4<<<Block_num_def, Thread_num_def>>>(Device_a, Device_r, Device_x,n);
    Wang_kernel_step4<<<Block_num_def, Thread_num_def>>>(Device_a, Device_r, Device_x,n);
    Check_cuda(cudaGetLastError(), "Wang_kernel_step4 launch");
    Check_cuda(cudaEventRecord(stop,0), "cudaEventRecord stop");
    Check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize stop");
    Check_cuda(cudaEventElapsedTime(&Gpu_time,start,stop), "cudaEventElapsedTime");
    Check_cuda(cudaEventDestroy(start), "cudaEventDestroy start");
    Check_cuda(cudaEventDestroy(stop), "cudaEventDestroy stop");
    printf("\nGpu time is: %f ms", Gpu_time);
    /* CPU timing: same five iterations of the serial reference. */
    timestart = clock();
    test_step4(a,r,x,n);
    test_step4(a,r,x,n);
    test_step4(a,r,x,n);
    test_step4(a,r,x,n);
    test_step4(a,r,x,n);
    timeend = clock();
    Cpu_time = (float)(timeend - timestart)/CLOCKS_PER_SEC * 1000;
    printf("\nCpu time is: %f ms", Cpu_time);
    /* Blocking copy: also synchronizes, so x holds the GPU result here. */
    Check_cuda(cudaMemcpy(x,Device_x,n*sizeof(DataType),cudaMemcpyDeviceToHost), "copy x to host");
    Check_cuda(cudaFree(Device_a), "cudaFree Device_a");
    Check_cuda(cudaFree(Device_b), "cudaFree Device_b");
    Check_cuda(cudaFree(Device_c), "cudaFree Device_c");
    Check_cuda(cudaFree(Device_r), "cudaFree Device_r");
    Check_cuda(cudaFree(Device_x), "cudaFree Device_x");
}
结果为
Gpu time is: 104.327263 ms
Cpu time is: 156.000000 ms
按理说,各个线程间不存在数据相关,各个线程应该能较好地隐藏访存延迟,加速比应该挺好的。而在这不记内存拷贝时间的情况下,才接近1.5。是不是像这种计算密度不高的情况——访存读取两次(r[i]、a[i]),只进行一次除法操作,然后再存储一次——不适合CUDA计算?如果适合的话,这样的情况应该怎么优化?