新手求助 CUDA诡异现象
核函数如下:
__global__ void testKernel( float *inData,float *oData,int num)
{
oData[blockIdx.x*8+threadIdx.x]=threadIdx.x;
}
程序如下
#define MEMSIZE 4090
运行结果是输出全为0,但是如果把MEMSIZE改成4089以下,结果就是正确的……求助各位大神……这究竟是为了神马……搞了一天,崩溃了
void output(float *inData,int size)
{
for (int i=0;i!=size;i++)
{
std::cout<<inData[i]<<" ";
}
std::cout<<std::endl;
}
void run( int argc, char** argv)
{
cutilDeviceInit(argc,argv);
int memSize=MEMSIZE*sizeof(float);
float *devInData,*devOutData;
float *hostA,*hostB;
hostA=(float *)malloc(memSize);
hostB=(float *)malloc(memSize);
for (int i=0;i<MEMSIZE;i++)
{
hostA[i]=1.1;
hostB[i]=2;
}
cudaError_t err;
cutilSafeCall(err=cudaMalloc((void **)&devInData,memSize));
cutilSafeCall(err=cudaMalloc((void **)&devOutData,memSize));
cutilSafeCall(err=cudaMemcpy(devInData,hostA, memSize,cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(hostB,devInData, memSize,cudaMemcpyDeviceToHost));
dim3 grid(512);
dim3 thread(8);
testKernel<<<grid,thread,memSize>>>(devInData,devOutData,MEMSIZE);
cutilSafeCall(cudaMemcpy(hostB,devOutData, memSize,cudaMemcpyDeviceToHost));
std::cout<<"out";
output(hostB,MEMSIZE);
}