579
社区成员
发帖
与我相关
我的任务
分享
#include<cuda_runtime_api.h>
#include<stdio.h>
/*
 * Element-wise sum of two 2x2 int matrices: c[r][k] = a[r][k] + b[r][k].
 *
 * Launch layout: each thread handles at most one element. The row comes from
 * the x component of the thread's global index and the column from the y
 * component, so the launch must cover indices 0..1 in BOTH x and y (for
 * example a single 2x2 block) for every element of c to be written.
 * Threads whose index falls outside the 2x2 range do nothing.
 *
 * a, b : input matrices (read only by this kernel).
 * c    : output matrix.
 * All three pointers are dereferenced in device code.
 */
__global__ void add(int a[2][2], int b[2][2], int c[2][2])
{
    const int row = threadIdx.x + blockDim.x * blockIdx.x;
    const int col = threadIdx.y + blockDim.y * blockIdx.y;

    // Surplus threads (outside the 2x2 matrix) have no element to compute.
    if (row >= 2 || col >= 2)
        return;

    c[row][col] = a[row][col] + b[row][col];
}
/*
 * Adds two 2x2 int matrices on the GPU with the `add` kernel and prints the
 * resulting matrix to stdout, one row per line.
 *
 * Returns 0 on success. If any CUDA runtime call or the kernel launch fails,
 * the CUDA error string is printed to stderr and 1 is returned.
 */
int main()
{
    int i, j;
    int a[2][2] = { 1, 2, 3, 4 };
    int b[2][2] = { 10, 20, 30, 40 };
    int c[2][2] = { 0 };
    const size_t bytes = sizeof(a);

    // Device buffers must be POINTERS that cudaMalloc fills in. Declaring them
    // as host arrays (int[2][2]) would make cudaMalloc overwrite the array's
    // own storage, and every later use would pass a host address to the GPU.
    // NULL-initialised so the unconditional cudaFree calls below are safe.
    int (*device_a)[2] = NULL;
    int (*device_b)[2] = NULL;
    int (*device_c)[2] = NULL;

    // Each step runs only if everything before it succeeded; the first
    // failure is kept in `error` and reported after cleanup.
    cudaError_t error = cudaMalloc((void **)&device_a, bytes);
    if (error == cudaSuccess)
        error = cudaMalloc((void **)&device_b, bytes);
    if (error == cudaSuccess)
        error = cudaMalloc((void **)&device_c, bytes);
    if (error == cudaSuccess)
        error = cudaMemcpy(device_a, a, bytes, cudaMemcpyHostToDevice);
    if (error == cudaSuccess)
        error = cudaMemcpy(device_b, b, bytes, cudaMemcpyHostToDevice);

    if (error == cudaSuccess)
    {
        // The kernel derives the row from x and the column from y, so the
        // block has to be 2x2. A 1-D launch of 4 threads would leave
        // threadIdx.y at 0 and compute only column 0.
        dim3 threadsPerBlock(2, 2);
        add<<<1, threadsPerBlock>>>(device_a, device_b, device_c);
        // A kernel launch returns nothing; launch errors are fetched here.
        error = cudaGetLastError();
    }

    // Blocking copy: waits for the kernel to finish and also reports any
    // error raised while the kernel was executing.
    if (error == cudaSuccess)
        error = cudaMemcpy(c, device_c, bytes, cudaMemcpyDeviceToHost);

    // Release device memory on every path (cudaFree accepts a null pointer).
    cudaFree(device_a);
    cudaFree(device_b);
    cudaFree(device_c);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(error));
        return 1;
    }

    for (i = 0; i < 2; i++)
    {
        printf("\n");
        for (j = 0; j < 2; j++)
            printf("%4d", c[i][j]);
    }
    return 0;
}