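// Page text captured ahead of the code listing (not part of the program):
// "580" / "Community members" / "Post" / "Related to me" / "My tasks" / "Share"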
// Number of int elements in each of the vectors a, b and c (33 * 1024 = 33792).
#define N (33 * 1024)
// Host-side global: main() copies one int from device memory into it and
// prints it as "n = ...".
int count = 0;
// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, N).
//
// a, b   : input arrays of at least N ints, indexed from device code.
// c      : output array of at least N ints.
// count_ : received BY VALUE. Each thread increments only its own private
//          copy, and nothing reads that copy afterwards, so the increment is
//          not observable by the caller.
//
// Works for any 1D launch configuration: each thread starts at its global
// index and advances by the total number of launched threads (grid-stride
// loop), so N does not have to be a multiple of the grid size.
__global__ void add(int *a , int *b , int* c ,int count_)
{
    // Total number of threads in the grid; kept unsigned so the index update
    // uses the same unsigned arithmetic as blockDim.x * gridDim.x.
    const unsigned int stride = blockDim.x * gridDim.x;

    // Intended as an invocation counter; only changes this thread's local copy.
    ++count_;

    // printf("tid = %d\n", idx);  // uncomment inside the loop to trace indices

    // The idx < N test keeps every access inside the arrays.
    for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < N; idx += stride)
    {
        c[idx] = a[idx] + b[idx];
    }
}
// Returns true when `status` is cudaSuccess; otherwise prints `what` together
// with the CUDA error string to stderr and returns false.
static bool cudaStepOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver for the vector-add sample.
//
// Fills a[i] = -i and b[i] = i * i on the host, copies both to the device,
// launches add<<<128,128>>>, copies the result back and checks
// c[i] == a[i] + b[i] for every element.
//
// Returns 0 when every CUDA call succeeded, 1 otherwise. Device memory is
// released on every path.
int main()
{
    // Static storage keeps the three N-element host arrays (~396 KiB in
    // total) off the stack.
    static int a[N], b[N], c[N];

    // Device pointers start out null so the cudaFree calls at the end are
    // safe even if an allocation was never reached.
    int *dev_a = 0, *dev_b = 0, *dev_c = 0;
    int *dev_count = 0;
    const size_t bytes = N * sizeof(int);

    // Initialise the inputs on the CPU.
    for (int i = 0; i < N; i++)
    {
        a[i] = -i;
        b[i] = i * i;
    }

    // Allocate device memory, zero the device-side int, and upload a and b.
    // The && chain stops at the first failing step.
    bool ok =
        cudaStepOk(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)") &&
        cudaStepOk(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)") &&
        cudaStepOk(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)") &&
        cudaStepOk(cudaMalloc((void**)&dev_count, sizeof(int)), "cudaMalloc(dev_count)") &&
        // Without this the value read back into `count` would be uninitialised.
        cudaStepOk(cudaMemset(dev_count, 0, sizeof(int)), "cudaMemset(dev_count)") &&
        cudaStepOk(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> dev_a)") &&
        cudaStepOk(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> dev_b)");

    if (ok)
    {
        // The kernel's 4th parameter is an int passed by value, so a host
        // value is passed here. A device pointer must never be dereferenced
        // on the host (the previous `*n` did exactly that).
        add<<<128,128>>>(dev_a, dev_b, dev_c, count);

        // Launch-configuration errors are only visible via cudaGetLastError();
        // the blocking copies below also surface errors raised while the
        // kernel was running.
        ok = cudaStepOk(cudaGetLastError(), "add kernel launch") &&
             cudaStepOk(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dev_c -> c)") &&
             // Because the kernel receives the counter by value it cannot
             // modify dev_count; this reads back the zero written above.
             cudaStepOk(cudaMemcpy(&count, dev_count, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(dev_count -> count)");
    }

    if (ok)
    {
        // Verify the GPU result against the same sum computed on the CPU.
        bool success = true;
        for (int i = 0; i < N; i++)
        {
            if ((a[i] + b[i]) != c[i])
            {
                printf("error\n");
                success = false;
            }
            //printf(" %d + %d = %d \n", a[i] , b[i] , c[i]);
        }
        if (success)
            printf("we did it ! \n");

        printf("n = %d", count);
    }

    // Release device memory; cudaFree on a null pointer performs no operation.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    cudaFree(dev_count);

    // Wait for a key press so the console output stays visible.
    getchar();
    return ok ? 0 : 1;
}