CUDA运行时间问题

shyboy6104 2010-04-29 08:40:57

写了个求解线性方程组的程序，相比串行程序，性能不敢恭维。于是把各个步骤拆分开，分别查看各个步骤的加速比。
其中一个步骤是针对Ax=r（A为对角矩阵）的问题，实际上就是求解x[i]=r[i]/a[i]。
代码如下：
// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

// Element type used for every host and device array in this file.
#define DataType float
// Threads per block used for each kernel launch.
#define Thread_num_def 512
// Number of blocks used for each kernel launch.
#define Block_num_def 20480
// Total element count (threads per block * blocks). Parenthesized so the
// macro behaves as a single value inside larger expressions such as
// `x / Matrix_Dim` or `x % Matrix_Dim`.
#define Matrix_Dim (Thread_num_def*Block_num_def)

// includes, project
#include <cutil_inline.h>

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
// Fills the five host arrays of length n with the fixed test values.
void Init_data(DataType *a, DataType *b, DataType *c, DataType* r, DataType* x, int n);
// Uploads the arrays, times five launches of Wang_kernel_step4 and five CPU
// passes of test_step4, prints both timings, and copies the device result into x.
void Wang_Algorithm_test(DataType *a, DataType *b, DataType *c, DataType* r, DataType* x, int n);
// CPU reference: x[i] = r[i] / a[i] for i in [0, n).
void test_step4(DataType *a,DataType *r,DataType *x,int n);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// Entry point: allocates the five host arrays, fills them with the test
// data, runs the GPU/CPU timing comparison, and releases the host memory.
// Returns EXIT_FAILURE if a host allocation fails, 0 otherwise.
int
main( int argc, char** argv)
{
    const int n = Matrix_Dim;
    // Size each buffer by the element type. The previous code used
    // sizeof(DataType*), which allocates pointer-sized slots (twice the
    // needed memory on 64-bit hosts).
    const size_t bytes = (size_t)n * sizeof(DataType);

    DataType *a = (DataType *)malloc(bytes);
    DataType *b = (DataType *)malloc(bytes);
    DataType *c = (DataType *)malloc(bytes);
    DataType *r = (DataType *)malloc(bytes);
    DataType *x = (DataType *)malloc(bytes);

    if (a == NULL || b == NULL || c == NULL || r == NULL || x == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes per array failed\n",
                (unsigned long)bytes);
        // free(NULL) is a no-op, so releasing all five pointers is safe.
        free(a);
        free(b);
        free(c);
        free(r);
        free(x);
        return EXIT_FAILURE;
    }

    Init_data(a, b, c, r, x, n);

    Wang_Algorithm_test(a, b, c, r, x, n);

    free(a);
    free(b);
    free(c);
    free(r);
    free(x);

    // NOTE(review): cutilExit comes from cutil_inline.h; its behavior is not
    // visible in this file (it may not return) — confirm.
    cutilExit(argc, argv);
    return 0;
}
// Fills the host test arrays.
//
// For every i in [0, n): a[i] = 2, b[i] = 1, c[i] = 1, r[i] = 4, x[i] = 0.
// The boundary entries are then adjusted: r[0] = r[n-1] = 3, b[n-1] = 0 and
// c[0] = 0.
// NOTE(review): this looks like a tridiagonal system (diagonal a,
// off-diagonals b and c) whose exact solution is all ones — confirm.
//
// Does nothing when n <= 0, so the boundary writes never index out of range.
void Init_data(DataType *a, DataType *b, DataType *c, DataType* r, DataType* x, int n)
{
    if (n <= 0)
        return;

    for (int i = 0; i < n; i++)
    {
        a[i] = 2.0f;
        b[i] = 1.0f;
        c[i] = 1.0f;
        x[i] = 0.0f;
        r[i] = 4.0f;
    }

    // Boundary rows.
    r[0]     = 3.0f;
    r[n - 1] = 3.0f;
    b[n - 1] = 0.0f;
    c[0]     = 0.0f;
}

// CPU reference for the diagonal solve: writes x[k] = r[k] / a[k] for every
// k in [0, n). Does nothing when n <= 0.
void test_step4(DataType *a,DataType *r,DataType *x,int n)
{
    int k = 0;
    while (k < n)
    {
        const DataType rhs  = r[k];
        const DataType diag = a[k];
        x[k] = rhs / diag;
        ++k;
    }
}

// Reports a failed CUDA runtime call on stderr; returns true on cudaSuccess.
static bool Wang_cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "\nCUDA error in %s: %s", what, cudaGetErrorString(status));
    return false;
}

// Times the diagonal-solve step on the GPU and on the CPU.
//
// Copies a, b, c and r (n elements each) to the device, measures five
// back-to-back launches of Wang_kernel_step4 with CUDA events, measures five
// calls of test_step4 with clock(), prints both times in milliseconds, and
// finally copies the device result into x (overwriting the CPU result).
// Host<->device copy time is not included in the GPU measurement.
//
// Every CUDA runtime call is checked. On the first failure a diagnostic is
// printed, the remaining GPU steps are skipped (no GPU time is printed and x
// keeps the CPU result), and all device resources are still released.
// Does nothing when n <= 0.
//
// NOTE(review): Wang_kernel_step4 is not defined in the visible source; it
// must be declared before this function. The launch configuration is the
// fixed Block_num_def x Thread_num_def grid rather than being derived from n
// — confirm the kernel bounds-checks its index if n can differ from
// Matrix_Dim.
void Wang_Algorithm_test(DataType *a, DataType *b, DataType *c, DataType* r, DataType* x, int n)
{
    if (n <= 0)
        return;

    const size_t bytes = (size_t)n * sizeof(DataType);

    DataType *Device_a = NULL, *Device_b = NULL, *Device_c = NULL;
    DataType *Device_x = NULL, *Device_r = NULL;
    cudaEvent_t start, stop;
    bool have_start = false, have_stop = false;
    float Gpu_time = 0.0f, Cpu_time;
    clock_t timestart, timeend;

    // Device allocations and uploads; && short-circuits at the first failure.
    // Device_b and Device_c are uploaded but not passed to the kernel here.
    bool ok =
        Wang_cuda_ok(cudaMalloc((void**)&Device_a, bytes), "cudaMalloc(Device_a)") &&
        Wang_cuda_ok(cudaMalloc((void**)&Device_b, bytes), "cudaMalloc(Device_b)") &&
        Wang_cuda_ok(cudaMalloc((void**)&Device_c, bytes), "cudaMalloc(Device_c)") &&
        Wang_cuda_ok(cudaMalloc((void**)&Device_r, bytes), "cudaMalloc(Device_r)") &&
        Wang_cuda_ok(cudaMalloc((void**)&Device_x, bytes), "cudaMalloc(Device_x)") &&
        Wang_cuda_ok(cudaMemcpy(Device_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a)") &&
        Wang_cuda_ok(cudaMemcpy(Device_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b)") &&
        Wang_cuda_ok(cudaMemcpy(Device_c, c, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(c)") &&
        Wang_cuda_ok(cudaMemcpy(Device_r, r, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(r)");

    if (ok)
        have_start = ok = Wang_cuda_ok(cudaEventCreate(&start), "cudaEventCreate(start)");
    if (ok)
        have_stop = ok = Wang_cuda_ok(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    if (ok)
        ok = Wang_cuda_ok(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    if (ok)
    {
        // GPU program: five identical launches inside the timed region.
        Wang_kernel_step4<<<Block_num_def, Thread_num_def>>>(Device_a, Device_r, Device_x, n);
        Wang_kernel_step4<<<Block_num_def, Thread_num_def>>>(Device_a, Device_r, Device_x, n);
        Wang_kernel_step4<<<Block_num_def, Thread_num_def>>>(Device_a, Device_r, Device_x, n);
        Wang_kernel_step4<<<Block_num_def, Thread_num_def>>>(Device_a, Device_r, Device_x, n);
        Wang_kernel_step4<<<Block_num_def, Thread_num_def>>>(Device_a, Device_r, Device_x, n);

        // Launches do not return a status; a bad configuration only shows up here.
        ok = Wang_cuda_ok(cudaGetLastError(), "Wang_kernel_step4 launch");
    }
    if (ok)
        ok = Wang_cuda_ok(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    if (ok)
        ok = Wang_cuda_ok(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    if (ok)
        ok = Wang_cuda_ok(cudaEventElapsedTime(&Gpu_time, start, stop), "cudaEventElapsedTime"); // GPU timing
    if (ok)
        printf("\nGpu time is: %f ms", Gpu_time);

    // CPU program: runs regardless of the GPU outcome.
    timestart = clock();
    test_step4(a, r, x, n);
    test_step4(a, r, x, n);
    test_step4(a, r, x, n);
    test_step4(a, r, x, n);
    test_step4(a, r, x, n);
    timeend = clock();
    Cpu_time = (float)(timeend - timestart)/CLOCKS_PER_SEC * 1000; // CPU timing
    printf("\nCpu time is: %f ms", Cpu_time);

    // Fetch the device result; this replaces the CPU result held in x.
    if (ok)
        Wang_cuda_ok(cudaMemcpy(x, Device_x, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(x)");

    if (have_start)
        cudaEventDestroy(start);
    if (have_stop)
        cudaEventDestroy(stop);

    // cudaFree(NULL) is a no-op, so pointers that were never allocated are safe here.
    cudaFree(Device_a);
    cudaFree(Device_b);
    cudaFree(Device_c);
    cudaFree(Device_r);
    cudaFree(Device_x);
}
结果为
Gpu time is: 104.327263 ms
Cpu time is: 156.000000 ms

按理说，各个线程之间不存在数据相关，应该能较好地隐藏访存延迟，加速比应该挺好的。但在不计内存拷贝时间的情况下，加速比才接近1.5。是不是因为这种情况计算密度不高（读取两次：r[i]和a[i]，只进行一次除法操作，然后再存储一次），所以不适合用CUDA计算？如果适合的话，这种情况应该怎么优化？

...全文

280 11 打赏收藏转发到动态举报

写回复

用AI写文章

11 条回复

切换为时间正序

请发表友善的回复…

发表回复

shyboy6104 2012-11-22

打赏
举报

引用 10 楼 stray109 的回复:

我的 525M vs i5-2450M Gpu time is: 28.575329 ms Cpu time is: 181.000000 ms

使用了优化选项没？

stray109 2012-04-28

打赏
举报

我的 525M vs i5-2450M
Gpu time is: 28.575329 ms
Cpu time is: 181.000000 ms

shyboy6104 2010-05-09

打赏
举报

[Quote=引用 8 楼 l7331014 的回复:]
引用 6 楼 frog_skywalker 的回复:
8500支持coalescing吗？
不支持的话，对齐了也不快吧

要求更高些,要按id顺序访问.呵呵.
[/Quote]
实际上，程序中就是按ID顺序访问的。为什么性能还这么差呢？

frog_skywalker 2010-05-07

打赏
举报

8500支持coalescing吗？
不支持的话，对齐了也不快吧

无心人_过过小日子 2010-05-07

打赏
举报

[Quote=引用 6 楼 frog_skywalker 的回复:]
8500支持coalescing吗？
不支持的话，对齐了也不快吧
[/Quote]

要求更高些,要按id顺序访问.呵呵.

shyboy6104 2010-05-07

打赏
举报

[Quote=引用 6 楼 frog_skywalker 的回复:]
8500支持coalescing吗？
不支持的话，对齐了也不快吧
[/Quote]
8500是1.1的版本，支持coalescing

无心人_过过小日子 2010-05-06

打赏
举报

[Quote=引用 4 楼 shyboy6104 的回复:]
引用 2 楼 l7331014 的回复:
kernel程序(Wang_kernel_step4)?
问题一般在是访问是否对齐上.访问为主的程序的确没有计算为主的程序更容易加速.但GPU上的带宽一般也是比cpu高不少的...LZ可以计算一下是否充分利用了gpu的带宽.

这个访问应该算是对齐的吧，每个线程都访问一个float型的数，而且数组大小也是16的倍数....
[/Quote]

对齐与否要看kernel是如何访问数据的.和数据的组织方式有一定关系,但不是必然.

shyboy6104 2010-05-06

打赏
举报

[Quote=引用 2 楼 l7331014 的回复:]
kernel程序(Wang_kernel_step4)?
问题一般在是访问是否对齐上.访问为主的程序的确没有计算为主的程序更容易加速.但GPU上的带宽一般也是比cpu高不少的...LZ可以计算一下是否充分利用了gpu的带宽.
[/Quote]

这个访问应该算是对齐的吧，每个线程都访问一个float型的数，而且数组大小也是16的倍数....