cuda 内核执行失败 为什么呢?
lf864 2011-04-20 03:18:13 联系归约算法时,出现错误,很是不解,下面是程序代码(部分);望高手指点一二#include<stdio.h>
#include<math.h>
#include<direct.h>
#include <io.h>
#include <stdlib.h>
#include <windows.h>
#include <cutil_inline.h>
__global__ void kernel1(float *idata,float *odata)
{
int id=blockIdx.x*blockDim.x+threadIdx.x;
int index=threadIdx.x;
extern __shared__ float sp[];
sp[index]=idata[id];
__syncthreads();
for(int i=1;i<blockDim.x;i*=2)
{
if(index%(2*i)==0)
{
sp[index]+=sp[index+i];
}
__syncthreads();
}
if(threadIdx.x==0)
odata[blockIdx.x]=sp[0];
}
main()
{
float m[10000]={1.0};
float s=0.0;
float *d_p1;
int size=sizeof(m);
cudaMalloc((void **)&d_p1,size);
cudaMemcpy(d_p1,m,size,cudaMemcpyHostToDevice);
float *d_p2;
cudaMalloc((void **)&d_p2,size);
dim3 dimGrid(32,1);
dim3 dimBlock(512,1,1);
kernel1<<<dimGrid,dimBlock>>>(d_p1,d_p2);
cutilCheckMsg("Kernel execution failed\n");
float sum[32];
cudaMemcpy(sum,d_p2,32*sizeof(float),cudaMemcpyDeviceToHost);
for(int i=0;i<32;i++)
{
s+=sum[i];
}
printf("%f",sum);
}
执行结果如下:
cutilCheckMsg cudaThreadSynchronize error:Kernel execution failed : unspecified launch failure.