【求助】kernel 部分运行后无法执行“cudaDeviceSynchronize”是什么问题？

Endless_N 2014-08-30 04:28:58

初学，我自己写了个程序，功能大概是这样的：用一个5*5的卷积核去和一张imageW*imageH的大尺寸图像做滑动卷积，stride是每次卷积核滑动的步长。不用对图像边缘补零。然后运行完输出的是一张（imageW - 5+1）*（imageH - 5+1）的output feature map。PS：卷积核在输入时已经做了相应处理，所以计算部分是直接对应位置相乘然后相加的。

问题描述：编译成功后，执行程序时，运行到 main函数中调用 cudaDeviceSynchronize() 时报错，把 kernel 部分最后那句 d_Output 的语句注释掉后，这个问题就不见了，整个程序都能跑通了，也能计时。
错误提示：CUDA error at matrixConv.cu : 214 code=77 (<unknow>) "cudaDeviceSynchronize()"

头文件：
#include <assert.h>
#include <helper_cuda.h>
// CUDA runtime
#include <cuda_runtime.h>
// Utilities and system includes
#include <helper_functions.h>
#include <helper_cuda.h>

[color=rgb(51, 102, 153) !important]复制代码

kernel 部分程序：
#define KERNEL_SIZE 5
#define BLOCKDIM_X 25//16//25
#define BLOCKDIM_Y 5//10//5
#define stride 1 // the stride of convolution kernel of sliding
#define N 32 // the number of the each thread's element (plan_B)
__constant__ float c_Kernel[KERNEL_SIZE * KERNEL_SIZE];
__global__ void convolutionKernel(
float *d_Output,
float *d_Input,
float *c_Kernel,
int imageW,
int imageH,
int imageW_O
)
{
//The parameter of overlap
int Overlap = KERNEL_SIZE - 1;
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
int InBegin = by * (BLOCKDIM_Y - Overlap) * imageW + bx * (BLOCKDIM_X - Overlap);
//int OutBegin = by * BLOCKDIM_Y * imageW_O + bx * BLOCKDIM_X;////plan_A
int OutBegin = by * BLOCKDIM_Y * imageW_O + bx * N;////plan_B
for (int i = InBegin,o = OutBegin; i < N; i++, o++)
{
// Declaration of the shared memoryarray In_sub used to store the sub-matrix of d_Input
__shared__ float In_sub[N * BLOCKDIM_X][BLOCKDIM_Y];////plan_B
for (int j = 0; j < KERNEL_SIZE; j += stride)
{
In_sub[tx][ty] = d_Input[i + j * imageW + (tx - j * KERNEL_SIZE) + ty * stride * imageW];////plan_B
}
__syncthreads();
//Comput and store result
float sum = 0;
for(int j = 0; j < KERNEL_SIZE * KERNEL_SIZE; j++)
{
sum += c_Kernel[j] * In_sub[j][ty];
}
__syncthreads();
d_Output[o + ty * imageW_O] = sum；//（运行时，把这一步注释掉，然后在把main函数中对应的把d_Output复制回Host端的一句也注释掉后，整个程序就能跑通了，也能显示计时结果
}
}

main函数调用呼叫 kernel 的函数convolutionGPU()的部分
// warmup
convolutionGPU(
d_Output,
d_Input,
c_Kernel,
imageW,
imageH,
imageW_O
);
//Game start
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
for (int i = 0; i < iterations; i++)
{
convolutionGPU(
d_Output,
d_Input,
c_Kernel,
imageW,
imageH,
imageW_O
);
}
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&hTimer);
//Game stop
double gpuTime = 0.001 * sdkGetTimerValue(&hTimer) / (double)iterations;

[color=rgb(51, 102, 153) !important]复制代码

PS：
dim3 blocks(imageW/(N * stride + Overlap), (imageH - Overlap)/(BLOCKDIM_Y - Overlap));
dim3 threads(BLOCKDIM_X, BLOCKDIM_Y); //<font color="#0000ff">=（25,5）</font>

...全文