小弟最近在学习CUDA,尝试加速添加高斯噪声这一过程,但是始终不知道错误原因,一运行就全黑了,所以想请教各位大佬,万分感谢。
// ConsoleApplication1.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include "Windows.h"
#include <math.h>
#include <iostream>
#include <cxcore.h>
#include <opencv.hpp>
#include <opencv2/opencv.hpp>
using namespace std;
using namespace cv;
double getRandGaussVal_device(double noisefactor);
#define BLOCKDIM_X 16
#define BLOCKDIM_Y 16
#define GRIDDIM_X 256
#define GRIDDIM_Y 256
int g_iGaussList[1000];//RGB化的高斯值
__global__ void kernel_Gaussianweiguang(/*int width,int height,*/uchar* sensorBuffer,double* data_to_device);
void GenerateGaussNosieList(const int iNum,const int val);
int _tmain(int argc, _TCHAR* argv[])
{
int iTemp_host=getRandGaussVal_device(1);
double *data_to_device,data_device[1]={iTemp_host};
uchar* sensorBuffer=new uchar[1024*768*3];
cudaMalloc((void**)&data_to_device,sizeof(double));
cudaMemcpy(data_to_device,data_device,sizeof(double),cudaMemcpyHostToDevice);///由主机复制到设备
//分配显存空间
int size1=1024*768*3*sizeof(unsigned char);
unsigned char *d_src_imgbuf;
cudaMalloc((void**)&d_src_imgbuf, size1);
//把数据从Host传到Device
cudaMemcpy(d_src_imgbuf, sensorBuffer, size1, cudaMemcpyHostToDevice);
Mat src=imread("Chrysanthemum.jpg");
Mat dst;
uchar* Temp=sensorBuffer;
uchar* Temp1=sensorBuffer;
uchar* tem=sensorBuffer;
int i=0;
for(int r=0;r<src.rows;r++)
{
uchar* data=src.ptr<uchar>(r);
for(int c=0;c<src.cols*3;c++)
{
sensorBuffer[r*src.cols*3+c]=data[c];
}
}
int bx = ceil((double)dst.rows/BLOCKDIM_X); //网格和块的分配
int by = ceil((double)dst.cols/BLOCKDIM_Y);
if(bx > GRIDDIM_X) bx = GRIDDIM_X;
if(by > GRIDDIM_Y) by = GRIDDIM_Y;
dim3 grid(bx,by);//网格的结构
dim3 block(BLOCKDIM_X,BLOCKDIM_Y);//块的结构
GenerateGaussNosieList(1000,20);
int iTemp=0;int i1=0;
/*for(int r=0;r<src.rows;r++)
for(int c=0;c<src.cols*3;c++)
{
Temp[r*src.cols*3+c]=Temp[r*src.cols*3+c]+getRandGaussVal_device(1);
}*/
kernel_Gaussianweiguang<<<grid,block>>>(d_src_imgbuf,data_to_device);
cudaMemcpy(tem, d_src_imgbuf,size1, cudaMemcpyDeviceToHost);//数据传回主机端
Mat src1(src.size(),src.type());
for(int c=0;c<src.rows;c++)
for(int r=0;r<src.cols*3;r++)
{
src1.at<uchar>(c,r)=Temp1[c*src.cols*3+r];
}
imshow("a",src1);waitKey(0);
cudaFree(d_src_imgbuf);
cudaFree(data_to_device);
}
void GenerateGaussNosieList(const int iNum,const int val)
{
for(int i = 0; i < iNum; i++)
{
static bool hasSpare = false;
static double rand1, rand2;
double result;
if(hasSpare)
{
hasSpare = false;
result = sqrt(rand1) * sin(rand2);
}
hasSpare = true;
rand1 = rand() / ((double) RAND_MAX);
if(rand1 < 1e-100) rand1 = 1e-100;
rand1 = -2 * log(rand1);
rand2 = (rand() / ((double) RAND_MAX)) * 2*CV_PI;
result = sqrt(rand1) * cos(rand2);
g_iGaussList[i] = (int)((float)result*(float)val);
cout<<g_iGaussList[i]<<endl;
}
int itest = 0;
}
double getRandGaussVal_device(double noisefactor)
{
double i = 0;
int iRand = 1000*rand()/RAND_MAX;
i = g_iGaussList[iRand]*noisefactor;
return i;
}
__global__ void kernel_Gaussianweiguang(/*int width,int height,*/uchar* d_src_imgbuf,double* data_to_device)
{
uchar* temp=d_src_imgbuf;
int ncols=1024*3;
const int tix=blockDim.x*blockIdx.x+threadIdx.x;
const int tiy=blockDim.y*blockIdx.y+threadIdx.y;
const int threadTotalX=blockDim.x*gridDim.x;
const int threadTotalY=blockDim.y*gridDim.y;
for (int i=tix;i<768;i+=threadTotalX)
{
for (int j=tiy;j<ncols;j+=threadTotalY)
{
temp[i*ncols+j]=temp[i*ncols+j]+*data_to_device;
}
}
}