CPU和GPU的选择问题

ZzConstantly 2015-01-28 03:44:25

为了练习 CUDA，我写了一个把图片从 RGB 转为灰度的程序。图像大约为 400*400，在 GPU 上用时 1.3 ms，而在 CPU 上用时很短（用 clock 测不出来）。想请教一下大家，这是什么原因？是不是只有在 kernel 运算很复杂时，GPU 才能发挥作用？
代码如下。系统是 Ubuntu Server 12.04，显卡为 Quadro K600，CPU 为 8 核 3.70GHz 的至强处理器（程序是在服务器上远程运行的）。谢谢指教！

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <iostream>
#include <string>

#include <opencv2/opencv.hpp>
#include <opencv/cv.h>
#include <opencv/highgui.h>

#include "cuda.h"

// Edge length used by main() for both dimensions of the dim3 built from DIM
// (DIM x DIM) and for rounding the image size up to whole tiles.
#define DIM 16



using namespace cv;



// File-scope 1D texture references with float elements. main() binds texR,
// texG and texB to the device buffers r, g and b, and kernel() reads them
// with tex1Dfetch.
texture<float> texR;

texture<float> texG;

texture<float> texB;

// NOTE(review): texOut is declared but never bound or fetched anywhere in the
// visible source.
texture<float> texOut;

/*
 * Combines three colour planes into one output plane, one element per thread.
 *
 * Inputs are read through the file-scope texture references texR, texG and
 * texB, which the caller must have bound to linear device memory before the
 * launch.
 *
 * Launch layout: 2D grid of 2D blocks. The flat element index is
 *     x + y * (blockDim.x * gridDim.x)
 * so the row pitch of every buffer is the full grid width in threads, not the
 * image width. There is no bounds guard: dev_out and the bound texture
 * buffers must each hold at least
 *     (gridDim.x * blockDim.x) * (gridDim.y * blockDim.y)
 * floats.
 *
 * dev_out: device pointer that receives 0.5f*r + 0.5f*g + 0.5f*b.
 *
 * NOTE(review): the three weights sum to 1.5, so the result can exceed the
 * range of the inputs (e.g. up to 382.5 for 0..255 data).
 */
__global__ void kernel(float *dev_out)
{
	const int col=threadIdx.x+blockIdx.x*blockDim.x;
	const int row=threadIdx.y+blockIdx.y*blockDim.y;

	// Row pitch is the launched grid width in threads (see header comment).
	const int offset=col+row*blockDim.x*gridDim.x;

	const float red=tex1Dfetch(texR,offset);
	const float green=tex1Dfetch(texG,offset);
	const float blue=tex1Dfetch(texB,offset);

	// Float literals keep the arithmetic in single precision; bare 0.5 would
	// promote every operand to double.
	dev_out[offset]=red*0.5f+green*0.5f+blue*0.5f;
}

/* Prints the failed step and the CUDA error string, then exits, if status is not cudaSuccess. */
static void checkCuda(cudaError_t status,const char *what)
{
	if (status!=cudaSuccess){
		fprintf(stderr,"CUDA error during %s: %s\n",what,cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/*
 * Loads a colour image, splits it into float R/G/B planes, runs the
 * conversion kernel on the GPU and the same formula on the CPU, prints both
 * timings, compares the two results and displays the GPU result.
 *
 * Returns 0 on success, 1 if the image cannot be loaded or host memory cannot
 * be allocated. Exits with EXIT_FAILURE on any CUDA runtime error.
 */
int main()
{
	//image path
	const char* path="//home//ubunserver//yoona.jpg";

	// Load as 3-channel BGR. Flag 0 would load a single-channel grayscale
	// image, leaving no separate B/G/R bytes to read.
	IplImage *image=cvLoadImage(path,CV_LOAD_IMAGE_COLOR);
	if (image==NULL){
		fprintf(stderr,"cannot load image: %s\n",path);
		return 1;
	}

	const int width=image->width;
	const int height=image->height;
	const size_t imagesize=(size_t)width*height;

	// DIM x DIM threads per block, enough blocks to cover the whole image.
	dim3 block(DIM,DIM);
	dim3 grid((width+DIM-1)/DIM,(height+DIM-1)/DIM);

	// The kernel has no bounds guard and uses the grid width in threads as
	// its row pitch, so the device buffers are padded to the full grid.
	const size_t padWidth=(size_t)grid.x*DIM;
	const size_t padHeight=(size_t)grid.y*DIM;
	const size_t padBytes=sizeof(float)*padWidth*padHeight;
	const size_t padRowBytes=sizeof(float)*padWidth;
	const size_t rowBytes=sizeof(float)*width;

	// r, g, b and out are device buffers.
	float *r,*g,*b;
	float *out;
	checkCuda(cudaMalloc((void**)&r,padBytes),"cudaMalloc r");
	checkCuda(cudaMalloc((void**)&g,padBytes),"cudaMalloc g");
	checkCuda(cudaMalloc((void**)&b,padBytes),"cudaMalloc b");
	checkCuda(cudaMalloc((void**)&out,padBytes),"cudaMalloc out");

	// Zero everything so the padding the kernel reads and writes is defined.
	checkCuda(cudaMemset(r,0,padBytes),"cudaMemset r");
	checkCuda(cudaMemset(g,0,padBytes),"cudaMemset g");
	checkCuda(cudaMemset(b,0,padBytes),"cudaMemset b");
	checkCuda(cudaMemset(out,0,padBytes),"cudaMemset out");

	// Bind the input planes to the texture references read by the kernel.
	checkCuda(cudaBindTexture(NULL,texR,r,padBytes),"cudaBindTexture texR");
	checkCuda(cudaBindTexture(NULL,texG,g,padBytes),"cudaBindTexture texG");
	checkCuda(cudaBindTexture(NULL,texB,b,padBytes),"cudaBindTexture texB");

	// GPU-side timer.
	cudaEvent_t sta,sto;
	checkCuda(cudaEventCreate(&sta),"cudaEventCreate sta");
	checkCuda(cudaEventCreate(&sto),"cudaEventCreate sto");

	// Host buffers: input planes, GPU result, CPU reference result.
	float *host_r=(float*)malloc(sizeof(float)*imagesize);
	float *host_g=(float*)malloc(sizeof(float)*imagesize);
	float *host_b=(float*)malloc(sizeof(float)*imagesize);
	float *host_out=(float*)malloc(sizeof(float)*imagesize);
	float *host_cpu=(float*)malloc(sizeof(float)*imagesize);
	if (!host_r||!host_g||!host_b||!host_out||!host_cpu){
		fprintf(stderr,"out of host memory\n");
		return 1;
	}

	// Split the interleaved BGR pixels into planes. Rows are widthStep bytes
	// apart and each pixel occupies 3 bytes.
	for (int row=0;row<height;row++){
		const uchar *src=(const uchar*)(image->imageData+(size_t)row*image->widthStep);
		for (int col=0;col<width;col++){
			const size_t idx=(size_t)row*width+col;
			host_b[idx]=(float)src[3*col+0];
			host_g[idx]=(float)src[3*col+1];
			host_r[idx]=(float)src[3*col+2];
		}
	}

	// The timed region covers the uploads, the kernel and the download.
	checkCuda(cudaEventRecord(sta,0),"cudaEventRecord sta");

	checkCuda(cudaMemcpy2D(r,padRowBytes,host_r,rowBytes,rowBytes,height,cudaMemcpyHostToDevice),"upload r");
	checkCuda(cudaMemcpy2D(g,padRowBytes,host_g,rowBytes,rowBytes,height,cudaMemcpyHostToDevice),"upload g");
	checkCuda(cudaMemcpy2D(b,padRowBytes,host_b,rowBytes,rowBytes,height,cudaMemcpyHostToDevice),"upload b");

	// Execution configuration order is <<<grid, block>>>.
	kernel<<<grid,block>>>(out);
	checkCuda(cudaGetLastError(),"kernel launch");

	checkCuda(cudaMemcpy2D(host_out,rowBytes,out,padRowBytes,rowBytes,height,cudaMemcpyDeviceToHost),"download out");

	checkCuda(cudaEventRecord(sto,0),"cudaEventRecord sto");
	checkCuda(cudaEventSynchronize(sto),"cudaEventSynchronize sto");
	float elapsedTime;
	checkCuda(cudaEventElapsedTime(&elapsedTime,sta,sto),"cudaEventElapsedTime");
	printf("cuda time: %3.1f ms\n",elapsedTime);

	// Release GPU resources.
	checkCuda(cudaUnbindTexture(texR),"cudaUnbindTexture texR");
	checkCuda(cudaUnbindTexture(texG),"cudaUnbindTexture texG");
	checkCuda(cudaUnbindTexture(texB),"cudaUnbindTexture texB");
	checkCuda(cudaEventDestroy(sta),"cudaEventDestroy sta");
	checkCuda(cudaEventDestroy(sto),"cudaEventDestroy sto");
	checkCuda(cudaFree(r),"cudaFree r");
	checkCuda(cudaFree(g),"cudaFree g");
	checkCuda(cudaFree(b),"cudaFree b");
	checkCuda(cudaFree(out),"cudaFree out");

	// CPU-side timing of the same formula, written to its own buffer so the
	// GPU result is kept for comparison and display.
	clock_t start,stop;
	start=clock();
	for (int i=0;i<height;i++)
		for(int j=0;j<width;j++){
			const size_t idx=(size_t)i*width+j;
			host_cpu[idx]=host_r[idx]*0.5f+host_g[idx]*0.5f+0.5f*host_b[idx];
		}
	stop=clock();
	printf("time: %3.1f ms\n",double(stop-start)/CLOCKS_PER_SEC*1000);

	// Check the GPU result against the CPU reference with a small tolerance.
	int mismatches=0;
	for (size_t idx=0;idx<imagesize;idx++){
		float diff=host_out[idx]-host_cpu[idx];
		if (diff<0.0f) diff=-diff;
		if (diff>1e-3f) mismatches++;
	}
	printf("pixels differing between gpu and cpu: %d\n",mismatches);

	// Display the GPU result.
	IplImage *grey_image=cvCreateImage(cvSize(width,height),IPL_DEPTH_8U,1);
	for (int i=0;i<height;i++){
		uchar* ptr=(uchar*)(grey_image->imageData+(size_t)i*grey_image->widthStep);
		for (int j=0;j<width;j++){
			// The weights sum to 1.5, so clamp before narrowing to 8 bits.
			const float value=host_out[(size_t)i*width+j];
			ptr[j]=value>=255.0f?(uchar)255:(value<=0.0f?(uchar)0:(uchar)value);
		}
	}

	cvNamedWindow("w1");
	cvShowImage("w1",grey_image);
	cvWaitKey(10000);
	cvDestroyAllWindows();

	cvReleaseImage(&grey_image);
	cvReleaseImage(&image);

	free(host_r);
	free(host_g);
	free(host_b);
	free(host_out);
	free(host_cpu);

	return 0;
}