cuda流数据传输和核函数执行无法并行?

禅悟人生 2015-07-22 09:58:45
我的GPU型号为GTX 850M,计算能力5.0。按理来说,这块显卡肯定支持数据传输与核函数并行执行。可是调试中发现,无论怎么改写操作的发布顺序,数据传输与核函数就是无法并发执行,而核函数之间可以并发。请问是什么原因?
核心代码如下:
// Variant 1: work is enqueued one operation type at a time. Each loop below
// issues a single kind of operation (H2D copy, cudaMakevv, FFT, D2H copy) to
// every stream before the next kind of operation is issued.

// Create nStream CUDA streams; the handle array lives on the host heap.
// NOTE(review): no matching free()/cudaStreamDestroy is visible in this fragment.
cudaStream_t *stream1 = (cudaStream_t*)malloc(nStream * sizeof(cudaStream_t));
for (unsigned int i = 0; i<nStream; i++)
HANDLE_ERROR(cudaStreamCreate(&(stream1[i])));

// One cuFFT plan per stream: 1-D complex-to-complex transform of length
// nChannel with a batch count of dataSteam, bound to stream1[i] so that its
// execution is ordered within that stream.
// NOTE(review): plan1 is only a standard C++ array if nStream is a
// compile-time constant -- its declaration is not visible here.
cufftHandle plan1[nStream];
for (unsigned int i = 0; i<nStream; i++)
{
cufftSafeCall(cufftPlan1d(&plan1[i], nChannel, CUFFT_C2C, dataSteam));
cufftSafeCall(cufftSetStream(plan1[i], stream1[i]));
}


// Stage 1: host-to-device copies. Chunk i copies
// (dataSteam * nChannel + iniValue) elements starting at host offset
// dataSteam * nChannel * i, so consecutive chunks overlap by iniValue elements
// in the source, while the destination chunks in dXX are laid out back to back.
// NOTE(review): cudaMemcpyAsync can only overlap with kernel execution when the
// host buffer is page-locked (cudaMallocHost / cudaHostAlloc); the allocation
// of xx is not visible here -- confirm.
for (unsigned int i = 0; i < nStream; i++)
{
HANDLE_ERROR(cudaMemcpyAsync(dXX + (dataSteam * nChannel + iniValue)*i,
xx + dataSteam * nChannel * i, (dataSteam * nChannel + iniValue)*
sizeof(Complex), cudaMemcpyHostToDevice, stream1[i]));
}

// Stage 2: cudaMakevv on stream i reads input chunk i of dXX together with dHH
// and is given output chunk (nStream - 1 - i) of dVV, i.e. the output chunks
// are addressed in reverse order. cudaMakevv is defined elsewhere.
for (unsigned int i = 0; i < nStream; i++)
{
cudaMakevv(dXX + i *(dataSteam * nChannel + iniValue), dHH, dVV +
(nStream - 1 - i) * dataSteam * nChannel, stream1[i]);// author's note: this function is a kernel function
}

// Stage 3: in-place forward FFT (input and output pointers are identical) on
// the dVV chunk that the same stream's cudaMakevv call was given.
for (unsigned int i = 0; i < nStream; i++)
{
cufftSafeCall((cufftExecC2C(plan1[i], (cufftComplex *)(dVV +
(nStream - 1 - i) * dataSteam * nChannel), (cufftComplex *)(dVV + (nStream - 1 - i) *
dataSteam * nChannel), CUFFT_FORWARD)));
}

// Stage 4: device-to-host copy of each transformed chunk into yy2, shifted by
// a fixed offset of 64 * 9 elements.
// NOTE(review): the meaning of the 64 * 9 offset is not shown here, and yy2
// must be page-locked as well for this copy to be asynchronous -- confirm.
for (unsigned int i = 0; i < nStream; i++)
{
HANDLE_ERROR(cudaMemcpyAsync(yy2 + 64 * 9 + (nStream - 1 - i)*dataSteam * nChannel,
dVV + (nStream - 1 - i)*dataSteam * nChannel, dataSteam * nChannel * sizeof(Complex),
cudaMemcpyDeviceToHost, stream1[i]));
}


profiler结果如下:


改写成这样后也是无法并行:
// Variant 2: for each stream in turn, the whole chain (H2D copy -> cudaMakevv
// -> FFT -> D2H copy) is enqueued before moving on to the next stream.

// Create nStream CUDA streams; the handle array lives on the host heap.
// NOTE(review): no matching free()/cudaStreamDestroy is visible in this fragment.
cudaStream_t *stream1 = (cudaStream_t*)malloc(nStream * sizeof(cudaStream_t));
for (unsigned int i = 0; i<nStream; i++)
HANDLE_ERROR(cudaStreamCreate(&(stream1[i])));

// One cuFFT plan per stream: 1-D complex-to-complex transform of length
// nChannel with a batch count of dataSteam, bound to stream1[i].
// NOTE(review): plan1 is only a standard C++ array if nStream is a
// compile-time constant -- its declaration is not visible here.
cufftHandle plan1[nStream];
for (unsigned int i = 0; i<nStream; i++)
{
cufftSafeCall(cufftPlan1d(&plan1[i], nChannel, CUFFT_C2C, dataSteam));
cufftSafeCall(cufftSetStream(plan1[i], stream1[i]));
}


for (unsigned int i = 0; i < nStream; i++)
{
// Host-to-device copy of chunk i: (dataSteam * nChannel + iniValue) elements
// read from host offset dataSteam * nChannel * i, so consecutive source chunks
// overlap by iniValue elements.
// NOTE(review): overlap of this copy with kernels in other streams requires xx
// to be page-locked (cudaMallocHost / cudaHostAlloc); its allocation is not
// visible here -- confirm.
HANDLE_ERROR(cudaMemcpyAsync(dXX + (dataSteam * nChannel + iniValue)*i,
xx + dataSteam * nChannel * i, (dataSteam * nChannel + iniValue)*
sizeof(Complex), cudaMemcpyHostToDevice, stream1[i]));
// cudaMakevv (defined elsewhere) is given input chunk i of dXX, dHH, and
// output chunk (nStream - 1 - i) of dVV, on the same stream as the copy above.
cudaMakevv(dXX + i *(dataSteam * nChannel + iniValue), dHH, dVV +
(nStream - 1 - i) * dataSteam * nChannel, stream1[i]);
// In-place forward FFT on that dVV chunk, using the plan bound to stream1[i].
cufftSafeCall((cufftExecC2C(plan1[i], (cufftComplex *)(dVV +
(nStream - 1 - i) * dataSteam * nChannel), (cufftComplex *)(dVV + (nStream - 1 - i) *
dataSteam * nChannel), CUFFT_FORWARD)));// executed on the GPU
// Device-to-host copy of the transformed chunk into yy2 at a fixed extra
// offset of 64 * 9 elements.
// NOTE(review): the meaning of the 64 * 9 offset is not shown here, and yy2
// must be page-locked as well for this copy to be asynchronous -- confirm.
HANDLE_ERROR(cudaMemcpyAsync(yy2 + 64 * 9 + (nStream - 1 - i)*dataSteam * nChannel,
dVV + (nStream - 1 - i)*dataSteam * nChannel, dataSteam * nChannel * sizeof(Complex),
cudaMemcpyDeviceToHost, stream1[i]));
}


...全文
476 回复 打赏 收藏 转发到动态 举报
写回复
用AI写文章
回复
切换为时间正序
请发表友善的回复…
发表回复

581

社区成员

发帖
与我相关
我的任务
社区描述
CUDA™是一种由NVIDIA推出的通用并行计算架构,该架构使GPU能够解决复杂的计算问题。 它包含了CUDA指令集架构(ISA)以及GPU内部的并行计算引擎。
社区管理员
  • CUDA编程社区
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧