CUDA kernel function does not execute

feixiangyundjf 2015-05-07 09:47:11
The code below runs correctly when n_rays = 5000000, but when I change it to n_rays = 50000000 the kernel function no longer executes: the program just skips over it to the next line. Has anyone run into a kernel that does not execute, and what could be the cause?
int main()
{
float dy = 0;
float dx = 0;
float dz = 0;
float p_p = 0.0074;
int n_p_y = 75;
int n_p_x = 75;
float p_l = 0.125;
float f_l = 0.5;
float n_rays = 5000000;

float fnum_l = f_l / p_l;
float fnum_m = fnum_l / (1-M);
p_m = f_m / fnum_m;

float lam = 532e-6;
float betasq = 3.67;
float M1 = s_i / s_o;
float ds_m = 2.44*(1+M1)* fnum_m* lam;
sigma_m = sqrt((ds_m * ds_m)/(8*betasq));
float diffract_m_percent = (ds_m / p_l)*100;
float M2 = f_l / s_i;
float ds_l = 2.44*(1+M2)* fnum_l* lam;
sigma_l = sqrt((ds_l * ds_l)/(8*betasq));
float diffract_l_percent = (ds_l / p_p)*100;

int n_pts = 1;

unsigned int *image;
unsigned int *Image_GPU;
///////////////////////
float T3[16] = {1,0,f_l,0, 0,1,0,f_l, 0,0,1,0, 0,0,0,1};
float L1[16] = {1,0,0,0, 0,1,0,0, -1/f_m,0,1,0, 0,-1/f_m,0,1};
float L2[16] = {1,0,0,0, 0,1,0,0, -1/f_l,0,1,0, 0,-1/f_l,0,1};
float T2[16] = {1,0,s_i,0, 0,1,0,s_i, 0,0,1,0, 0,0,0,1};
float T1[16] = {1,0,s_o + dz,0, 0,1,0,s_o + dz, 0,0,1,0, 0,0,0,1};
float A1[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
/////////////////////////

size_t size = sizeof(unsigned int) * n_p_x * n_p_y;
image = (unsigned int *) malloc (size);
for(int i = 0; i<n_p_y; i++)
    for(int j = 0; j<n_p_x; j++)
    {
        image[i* n_p_x + j] = 0;
    }
/////////////////////////////
cudaMalloc((void**)&Image_GPU,size);
cudaMemcpy(Image_GPU,image,size,cudaMemcpyHostToDevice);

///////////////////A1=T2*L1////////////////
for(int i=0;i<4;i++)
    for(int j=0;j<4;j++)
        for(int k=0;k<4;k++)
        {
            A1[i*4+j]=A1[i*4+j] + T2[i*4+k]*L1[k*4+j];
        }
/////////////////////////////////////
float *T3d;

float *L2d;

float *T1d;
float *A1d;
size_t size_t = sizeof(float) * 16 ;
cudaMalloc((void**)&T3d,size_t);
cudaMalloc((void**)&L2d,size_t);
cudaMalloc((void**)&T1d,size_t);
cudaMalloc((void**)&A1d,size_t);
cudaMemcpy(T3d,T3,size_t,cudaMemcpyHostToDevice);
cudaMemcpy(L2d,L2,size_t,cudaMemcpyHostToDevice);
cudaMemcpy(T1d,T1,size_t,cudaMemcpyHostToDevice);
cudaMemcpy(A1d,A1,size_t,cudaMemcpyHostToDevice);



/////////////////////////
float p_ccd_y = n_p_y * p_p;
float p_ccd_x = n_p_x * p_p;
float coll_angle = 2* atan2( p_m/2 , s_o);
//////////////////////////////////////////////////
int n_l_y = 0;
float y = 0;
for ( y=0; y < p_ccd_y; y=y+p_l)
{
    n_l_y = n_l_y+1;
}
n_ly = n_l_y;
float o_y = ((p_ccd_y - (y + p_l))/2);
////////////////////////////////////////
int n_l_x = 0;
float x = 0;
for ( x=0; x < p_ccd_x; x=x+p_l)
{
    n_l_x = n_l_x+1;
}
n_lx = n_l_x;
float o_x = ((p_ccd_x - (x + p_l))/2);


///////////////////////////////////////
phi_max = atan2( p_m/2 -dy, s_o +dz);
phi_min = atan2( -p_m/2 -dy , s_o +dz);
float phi_total = phi_max-phi_min;
/////////////
theta_max = atan2( p_m/2 -dx, s_o +dz);
theta_min = atan2( -p_m/2 -dx , s_o +dz);
float theta_total = theta_max-theta_min;


int nrays_phi = ceil((phi_total/coll_angle) * sqrt(n_rays));
int nrays_theta = ceil((theta_total/coll_angle) * sqrt(n_rays));
/////////////////ray angles/////////////////
float* phi;
const int size_pta = n_pts* nrays_phi *nrays_theta * sizeof(float);
phi = (float*) malloc(size_pta);
srand((unsigned)time(NULL));

for(int j = 0; j<nrays_phi *nrays_theta;j++)
{
    float n = rand();
    phi[j] = phi_min+ (n/32767)*(phi_max-phi_min);
    //cout<<phi[j]<<endl;
}
cudaError_t cudaStatus;
float* phid;
cudaStatus = cudaMalloc((void**)&phid,size_pta);
//===========****************
if(cudaStatus != cudaSuccess)
{
    cout<< "cudamalloc phid failed!";
    return 1;
}
getchar();
cudaMemcpy(phid,phi,size_pta,cudaMemcpyHostToDevice);
////////////////////
float* theta;
theta = (float*) malloc(size_pta);
//srand((unsigned)time(NULL));

for(int j = 0; j<nrays_phi *nrays_theta;j++)
{
    float m = rand();
    theta[j] = theta_min+ (m/32767)*(theta_max-theta_min);
}

float* thetad;
cudaMalloc((void**)&thetad,size_pta);
cudaMemcpy(thetad,theta,size_pta,cudaMemcpyHostToDevice);
///////////////////////////////////////////////////////////
int TILE_WIDTH = 256;
dim3 dimGrid(ceil((float)nrays_phi* nrays_theta/TILE_WIDTH));
dim3 dimBlock(TILE_WIDTH);
Simulator_1Dkernel<<<dimGrid,dimBlock>>>(sigma_m,sigma_l,p_m,n_ly,n_lx,phid,thetad,nrays_phi,nrays_theta,Image_GPU, A1d,L2d,T3d,T1d);// nsy,nsx, sy,sx, y_ccd,x_ccd,

cudaMemcpy(image,Image_GPU,size,cudaMemcpyDeviceToHost);

for(int i = 0; i<n_p_y; i++)
{
    for(int j = 0; j<n_p_x; j++)
    {
        cout<<image[i*n_p_x + j];
        cout<<" ";
    }
    cout<<endl;
}

}


__global__ void Simulator_1Dkernel(float sigma_m,float sigma_l,float p_m,int n_ly,int n_lx,float*phid,float*thetad,int nrays_phi,int nrays_theta, unsigned int*Image_GPU, float *A1d,float *L2d,float *T3d,float *T1d)
{
unsigned int idx = (blockIdx.x * blockDim.x)+threadIdx.x;
unsigned int sidx = threadIdx.x;
__shared__ float rays[256*4];
curandState s;
float randl;
float randm;

int y_ccd;
int x_ccd;

rays[sidx*4 + 0] = dx;
rays[sidx*4 + 1] = dy;
rays[sidx*4 + 2] = thetad[idx];
rays[sidx*4 + 3] = phid[idx];


///////////T1, ray , ray2////////////////////////
for(int i = 0; i < 4; i++)
{
    float temp = 0;
    for(int k = 0; k < 4; k++)
    {
        temp = temp + T1d[i*4 + k] * rays[k + sidx*4];
    }
    rays[i + sidx*4] = temp;
}

//////////////////////////////////////

if(sqrt( rays[0 + sidx*4] * rays[0 + sidx*4] + rays[1 + sidx*4] * rays[1 + sidx*4])< (p_m/2))
{
///////////Matmul(A1, ray2 , ray3);////////////////
for(int i = 0; i < 4; i++)
{
    float temp = 0;
    for(int k = 0; k < 4; k++)
    {
        temp = temp + A1d[i*4 + k] * rays[k + sidx*4];
    }
    rays[i + sidx*4] = temp;
}
//////////////////////////////////////////////
/*curand_init(idx,0,0,&s);
randl = curand_normal(&s);
rays[0*nrays_phi + idx] = rays[0*nrays_phi + idx] + randl * sigma_l;

curand_init(idx,0,50,&s);
randl = curand_normal(&s);
rays[1*nrays_phi + idx] = rays[1*nrays_phi + idx] + randl * sigma_l;*/

//int n_l = n_ly * n_lx;
nsy = ceil((n_ly*p_l/2 - rays[1 + sidx*4])/p_l);
sy= n_ly*p_l/2 - ((nsy-1) * p_l + p_l/2);

nsx = ceil((n_lx*p_l/2 + rays[0 + sidx*4])/p_l);
sx= ((nsx-1) * p_l + p_l/2)-n_lx*p_l/2 ;

///////////////Matmul(L2, ray3 , ray4);//////////
for(int i = 0; i < 4; i++)
{
    float temp = 0;
    for(int k = 0; k < 4; k++)
    {
        temp = temp + L2d[i*4 + k] * rays[k + sidx*4];
    }
    rays[i + sidx*4] = temp;
}
//////////////////////////////////////
rays[3 + sidx*4] = rays[3 + sidx*4] + sy/f_l;
rays[2 + sidx*4] = rays[2 + sidx*4] + sx/f_l;

//////////////////Matmul(T3, ray4 , ray5);/////////////////
for(int i = 0; i < 4; i++)
{
    float temp = 0;
    for(int k = 0; k < 4; k++)
    {
        temp = temp + T3d[i*4 + k] * rays[k + sidx*4];
    }
    rays[i + sidx*4] = temp;
}
//////////////////////////////////////
/*curand_init(idx,0,100,&s);
randm = curand_normal(&s);
rays[0*nrays_phi + idx] = rays[0*nrays_phi + idx] + randm*sigma_m;
curand_init(idx,0,150,&s);
randm = curand_normal(&s);
rays[1*nrays_phi + idx] = rays[1*nrays_phi + idx] + randm*sigma_m;*/

float npy = n_p_y*p_p/2;
float cey = npy - rays[1 + sidx*4];
y_ccd = ceil(cey/p_p)-1;

float npx = n_p_x*p_p/2;
float cex = npx + rays[0 + sidx*4];
x_ccd = ceil(cex/p_p)-1;


const unsigned int value_y = y_ccd;
const unsigned int value_x = x_ccd;

const unsigned int value = value_y * n_p_x + value_x;


if( value>=0 && value<n_p_y*n_p_x)
{
    atomicAdd(&(Image_GPU[value]), 1);
}
}
}
2 replies
Spidey212 2015-05-11
What model of graphics card is it, and what maximum grid size (gridDim.x) does it support? When n_rays = 50000000, does the value of dimGrid exceed the maximum number of blocks allowed in a single dimension (x)?
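As a minimal sketch of that check (not part of the original post; the device index 0 and the stand-in ray count are assumptions), you can query the device's maxGridSize and compare it with the number of blocks the launch needs. On devices of compute capability below 3.0, gridDim.x is limited to 65535, and ceil(50000000 / 256) is roughly 195313 blocks, which already exceeds that limit and would make the launch fail:
// Sketch: compare the required grid.x with the device limit (device 0 assumed).
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    const int TILE_WIDTH = 256;
    const long long total_rays = 50000000LL;   // stand-in for nrays_phi * nrays_theta
    const long long blocks_needed = (total_rays + TILE_WIDTH - 1) / TILE_WIDTH;

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("max gridDim.x = %d, blocks needed = %lld\n", prop.maxGridSize[0], blocks_needed);
    if (blocks_needed > prop.maxGridSize[0])
        printf("grid.x exceeds the device limit; the kernel launch will fail\n");
    return 0;
}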
YCMyTot 2015-05-10
There are many possible reasons for a kernel launch to go wrong. After calling the kernel on the host side, use the following statements to get more detailed information:
cudaError_t error_check ;
//....
kernel <<< grid , block >>> ( argument0 , argument1 , argument2 ,...... ) ;
error_check = cudaGetLastError();
if( error_check != cudaSuccess ){
    printf("%s\n" , cudaGetErrorString( error_check ) );
    system("pause") ;
    return 0 ;
}
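Applied to the launch in the question, a minimal sketch could look like the following (the kernel name and arguments are taken from the post; the cudaDeviceSynchronize call is an addition that also reports errors raised while the kernel runs, not just launch-configuration errors):
// Sketch only: assumes the variables from the poster's main() are in scope.
Simulator_1Dkernel<<<dimGrid, dimBlock>>>(sigma_m, sigma_l, p_m, n_ly, n_lx,
                                          phid, thetad, nrays_phi, nrays_theta,
                                          Image_GPU, A1d, L2d, T3d, T1d);

cudaError_t launch_err = cudaGetLastError();       // launch/configuration errors (e.g. grid too large)
if (launch_err != cudaSuccess)
    printf("launch failed: %s\n", cudaGetErrorString(launch_err));

cudaError_t sync_err = cudaDeviceSynchronize();    // errors that occur during kernel execution
if (sync_err != cudaSuccess)
    printf("kernel execution failed: %s\n", cudaGetErrorString(sync_err));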
