CUDA 如何提升运行速度?
有一个小程序,如何提升它的速度?求各位大神科普。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <vector>
using namespace std;
#define numx 1000
#define numy 1024
// Subtracts the per-column mean from a row-major numx-by-numy matrix.
//
// xx: device buffer of numx * numy floats; centered in place.
// yy: device buffer of numx * numy floats; receives a copy of the centered
//     values.
//
// Launch layout: 1D grid of 1D blocks. Each thread handles whole columns,
// starting at its global index and advancing by the total thread count, so
// any launch configuration covers every column exactly once. No shared
// memory is used.
__global__ void averge(float *xx, float *yy)
{
    const int stride = gridDim.x * blockDim.x;

    // Grid-stride loop over columns: the loop condition doubles as the bounds
    // guard, so surplus threads never touch memory outside the matrix, and a
    // launch with fewer than numy threads still processes all columns.
    for (int col = threadIdx.x + blockIdx.x * blockDim.x; col < numy; col += stride)
    {
        // Pass 1: sum the column. Neighbouring threads read neighbouring
        // addresses on every row because the column index is the fastest-
        // varying part of the element index.
        float sum = 0.0f;
        for (int row = 0; row < numx; ++row)
        {
            sum += xx[row * numy + col];
        }
        const float mean = sum / numx;

        // Pass 2: center the column. The centered value is kept in a register
        // and stored to both buffers instead of being re-read from xx.
        for (int row = 0; row < numx; ++row)
        {
            const int idx = row * numy + col;
            const float centered = xx[idx] - mean;
            xx[idx] = centered;
            yy[idx] = centered;
        }
    }
}
// Prints the CUDA error text for a failed runtime call and terminates.
// `what` names the operation so the message identifies the failing step.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Benchmark driver: uploads a numx-by-numy matrix, runs the averge kernel,
// downloads the result, and prints the clock() ticks spent in the kernel.
int main()
{
    const size_t count = (size_t)numx * numy;
    const size_t bytes = count * sizeof(float);

    // Host buffers live on the heap. Two numx * numy float arrays are about
    // 8 MB in total, which overflows the default stack on common platforms
    // when they are declared as local arrays.
    std::vector<float> a(count, 0.0f);
    std::vector<float> c(count, 0.0f);

    // Same contents as the original aggregate initializer: the first ten
    // elements are 1..10 and everything else is zero.
    for (int k = 0; k < 10; ++k)
    {
        a[k] = (float)(k + 1);
    }

    // Device buffers are allocated once, outside the timed loop; cudaMalloc
    // and cudaFree are slow and do not belong inside a benchmark iteration.
    float *dev_a = NULL;
    float *dev_b = NULL;
    checkCuda(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)");
    checkCuda(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)");

    clock_t start = 0;
    clock_t end = 0;
    for (int i = 0; i < 1; i++)
    {
        checkCuda(cudaMemcpy(dev_a, &a[0], bytes, cudaMemcpyHostToDevice), "copy a to device");

        start = clock();
        // 32 blocks of 32 threads = 1024 threads.
        averge<<<32, 32>>>(dev_a, dev_b);
        // A bad launch configuration is only reported through cudaGetLastError.
        checkCuda(cudaGetLastError(), "averge launch");
        // Kernel launches return immediately; wait for completion so the
        // interval below covers the kernel's execution, not just the launch.
        checkCuda(cudaDeviceSynchronize(), "averge execution");
        end = clock();

        checkCuda(cudaMemcpy(&c[0], dev_b, bytes, cudaMemcpyDeviceToHost), "copy result to host");
        /* Debug aid: print part of the second row of the result.
        for (int k = 100; k < 110; k++)
        {
            cout << c[1 * numy + k] << endl;
        }
        */
    }

    checkCuda(cudaFree(dev_a), "cudaFree(dev_a)");
    checkCuda(cudaFree(dev_b), "cudaFree(dev_b)");

    // Elapsed clock() ticks for the last kernel run (divide by CLOCKS_PER_SEC
    // for seconds).
    cout << end - start << endl;
    return 0;
}