一个CUDA二维数组相加的例子,请大牛们看看错在哪里
#include "book.h"
__global__ void ADD(int *A,int *B,int *C)
{
int i=threadIdx.x;
int j=threadIdx.y;
while ( i<4 && j<4)
{
C[i][j]=A[i][j]+B[i][j];
}
}
int main( void ) {
a = (int*)malloc( 16 * sizeof(int) );
b = (int*)malloc( 16 * sizeof(int) );
c = (int*)malloc( 16 * sizeof(int) );
int a[4][4]={{1,1,1,1},{1,1,1,1},{1,1,1,1},{1,1,1,1}};
int b[4][4]={{1,1,1,1},{1,1,1,1},{1,1,1,1},{1,1,1,1}};
int *A,*B,*C;
cudaMalloc((void**) &A,sizeof(int)*16);
cudaMalloc((void**) &B,sizeof(int)*16);
cudaMalloc((void**) &C,sizeof(int)*16);
cudaMemcpy(A,a,sizeof(int)*16,cudaMemcpyHostToDevice);
cudaMemcpy(B,b,sizeof(int)*16,cudaMemcpyHostToDevice);
dim3 threadsPerBlock(4,4);
ADD<<<1,threadsPerBlock>>>(A,B,C);
int c[4][4];
cudaMemcpy(c,C,sizeof(int)*16,cudaMemcpyHostToDevice);
printf( "c\n" );
cudaFree(A);
cudaFree(B);
cudaFree(C);
return 0;
}