cudaMemcpy2D与cudaMemcpy的复制时间问题
1、
float *ac, *bc, *cc;
// Excerpt 1: allocate pitched device buffers for three matrices and upload
// the host matrices a and b with cudaMemcpy2D.
// Timing variables; only `start` is assigned within this excerpt.
clock_t start, endTime;
// Row pitches (in bytes) written by the cudaMallocPitch calls below.
size_t pitch_a, pitch_b, pitch_c;
// Round n up to the next multiple of BLOCK_SIZE.
int newn = ((n + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE;
// NOTE(review): the timer starts BEFORE the allocations. Any interval measured
// from `start` therefore covers the three cudaMallocPitch calls and both
// cudaMemset calls in addition to the copies, not the copies alone. That would
// be consistent with the elapsed time not changing when cudaMemcpy2D is
// swapped for cudaMemcpy. Presumably the first CUDA runtime call in the
// process also pays a one-time initialization cost -- confirm by timing only
// the copy calls (e.g. with cudaEvent timers).
start = clock();
// Pitched allocations: newn rows, each at least sizeof(float) * newn bytes wide.
cudaMallocPitch((void**) &ac, &pitch_a, sizeof(float) * newn, newn);
cudaMallocPitch((void**) &bc, &pitch_b, sizeof(float) * newn, newn);
cudaMallocPitch((void**) &cc, &pitch_c, sizeof(float) * newn, newn);
// Zero the full pitched extent (pitch * rows bytes) of ac and bc, so the
// padding beyond the n x n region copied below is zero. cc is not cleared here.
cudaMemset(ac, 0, pitch_a * newn);
cudaMemset(bc, 0, pitch_b * newn);
// Copy n rows of sizeof(float) * n bytes each. Source rows are
// sizeof(float) * lda (resp. ldb) bytes apart on the host; destination rows
// are pitch_a (resp. pitch_b) bytes apart on the device.
cudaMemcpy2D(ac, pitch_a, a, sizeof(float) * lda, sizeof(float) * n, n, cudaMemcpyHostToDevice);
cudaMemcpy2D(bc, pitch_b, b, sizeof(float) * ldb, sizeof(float) * n, n, cudaMemcpyHostToDevice);
// NOTE(review): none of the CUDA calls above have their return codes checked.
2、
// allocate host memory for matrices A and B
// Excerpt 2: allocate host and device buffers for matrices A and B and upload
// them with plain (linear) cudaMemcpy.
// Element count (WA * HA) and byte size of host matrix A.
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(mem_size_A);
// Element count (WB * HB) and byte size of host matrix B.
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(mem_size_B);
// NOTE(review): neither malloc result is checked for NULL before use.
// Initialize host memory. randomInit is defined outside this excerpt;
// presumably it fills the buffer with random values -- verify.
randomInit(h_A, size_A);
randomInit(h_B, size_B);
// Allocate device memory as contiguous buffers of the same byte sizes as the
// host buffers. cutilSafeCall is defined outside this excerpt; presumably it
// checks the returned cudaError_t -- verify.
float* d_A;
cutilSafeCall(cudaMalloc((void**) &d_A, mem_size_A));
float* d_B;
cutilSafeCall(cudaMalloc((void**) &d_B, mem_size_B));
// Copy each host matrix to the device in a single linear transfer.
cutilSafeCall(cudaMemcpy(d_A, h_A, mem_size_A,
cudaMemcpyHostToDevice) );
cutilSafeCall(cudaMemcpy(d_B, h_B, mem_size_B,
cudaMemcpyHostToDevice) );
cudaMemcpy2D 花的时间比 cudaMemcpy 多了将近一倍,不明白这是为什么?另外,我把第一个程序中的 cudaMemcpy2D 换成 cudaMemcpy 之后,所花的时间仍然没有变化,这又是为什么?