关于用 CUDA 实现点乘（向量内积）的问题
/**
 * Computes the dot product of two host arrays of 1500 floats using the GPU.
 *
 * The element-wise products are formed on the device by dot_cu, which
 * overwrites the device copy of x with x[i] * y[i]. The products are then
 * copied back and summed on the host in index order.
 *
 * @param x  host pointer to the first operand; 1500 floats are read
 * @param y  host pointer to the second operand; 1500 floats are read
 * @return   sum over i in [0, 1500) of x[i] * y[i], accumulated in float
 *
 * Errors from the CUDA runtime are routed through cutilSafeCall /
 * cutilCheckMsg (helpers defined outside this function).
 *
 * NOTE(review): every call allocates two device buffers, performs two
 * host-to-device copies, one kernel launch and one device-to-host copy for
 * only 1500 multiplications, and the final summation still runs on the CPU.
 * That fixed overhead is presumably why this is slower than a plain CPU
 * loop; reusing device buffers across calls and reducing on the device
 * would be the next steps.
 */
float dot(float *x, float *y)
{
    // Problem size and launch geometry. dot_cu has no bounds guard, so the
    // grid must launch exactly kNumElements threads (3 blocks * 500 threads).
    const unsigned int kNumElements = 1500;
    const unsigned int kThreadsPerBlock = 500;
    const unsigned int kNumBlocks = kNumElements / kThreadsPerBlock;
    const size_t mem_size = kNumElements * sizeof(float);

    // Host staging buffer for the element-wise products. A fixed ~6 KB stack
    // array replaces the previous unchecked malloc/free pair.
    float h_C[kNumElements];

    // allocate device memory
    float *d_A = 0;
    float *d_B = 0;
    cutilSafeCall(cudaMalloc((void **)&d_A, mem_size));
    cutilSafeCall(cudaMalloc((void **)&d_B, mem_size));

    // copy host memory to device
    cutilSafeCall(cudaMemcpy(d_A, x, mem_size, cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(d_B, y, mem_size, cudaMemcpyHostToDevice));

    // execute the kernel: d_A[i] <- d_A[i] * d_B[i]
    dim3 threadBlock(kThreadsPerBlock, 1);
    dim3 kernelGrid(kNumBlocks, 1);
    dot_cu<<<kernelGrid, threadBlock>>>(d_A, d_B);

    // check if kernel execution generated an error
    cutilCheckMsg("Kernel execution failed");

    // copy the products (stored in d_A by the kernel) back to the host
    cutilSafeCall(cudaMemcpy(h_C, d_A, mem_size, cudaMemcpyDeviceToHost));

    // final reduction on the host, in index order
    float sum = 0;
    for (unsigned int i = 0; i < kNumElements; i++)
        sum += h_C[i];

    // clean up device memory
    cutilSafeCall(cudaFree(d_A));
    cutilSafeCall(cudaFree(d_B));

    // Deliberately no cudaThreadExit() here: it is deprecated and destroys
    // the device context for the whole process, which would invalidate any
    // other device allocations held by the caller and force the next call
    // to re-initialise the context from scratch.
    return sum;
}
kernel 函数：
/**
 * Element-wise in-place multiply: A[i] = A[i] * B[i], one element per thread.
 *
 * Launch layout: 1D grid of 1D blocks; only the x components of threadIdx,
 * blockIdx and blockDim are used to form the element index.
 *
 * Precondition: there is no bounds check, so the total number of launched
 * threads (gridDim.x * blockDim.x) must not exceed the number of elements
 * in A and B.
 *
 * @param A  device pointer; read and overwritten with the products
 * @param B  device pointer; read only
 */
__global__ void dot_cu(float *A, float *B)
{
    // Flat global index of this thread across the whole grid.
    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Each thread touches only its own element and no shared memory is used,
    // so no block-level barrier is needed after the store.
    A[tid] *= B[tid];
}
想实现两个长度为 1500 的一维数组逐元素相乘再求和（即点积）的运算，
但这个程序的运行效率比 CPU 版本还低，不知道是什么原因，
求指教。