有没有人遇到这样的问题:0: copyin MemcpyAsync (dev=0x704b27e00, host=0x203707e00, size=32000

bqfuthu 2016-04-02 11:12:46
如题。调试该错误时发现,程序终止于语句:ERR = cudaMemcpyAsync(DEN,m_DEN,dm_NPRT)
DEN 和 m_DEN 都已声明并分配了内存,分别是 device 内存和 host 内存。cudaMemcpyAsync 这个语句在该行之前也出现过,前面的调用都能正常通过,不知为什么到这一行就出错。
另外,这个程序可以在我原来的机器(CUDA 5.5/K20)上正常运行;最近换了个地方干活,换了台机器(CUDA 7.5/GK110B),就出现这个问题了。
---------------------------------------------------------------------------------------------------------------------------------

然后我就利用cuda-memcheck 发现了以下错误提示:
-------
========= Invalid __shared__ write of size 16
========= at 0x00000388 in nbl_cal_neighborelist_devkernel2c_
========= by thread (0,0,0) in block (14,0,0)
========= Address 0x00000074 is misaligned
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/nvidia/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15865d]
========= Host Frame:/opt/pgi/linux86-64/2015/cuda/7.5/lib64/libcudart.so.7.5 [0x146ad]
========= Host Frame:/opt/pgi/linux86-64/2015/cuda/7.5/lib64/libcudart.so.7.5 (cudaLaunch + 0x143) [0x2ece3]
========= Host Frame:./fasMD.exe [0xe8200]
========= Host Frame:./fasMD.exe [0xe78e0]
========= Host Frame:./fasMD.exe [0xe3315]
========= Host Frame:./fasMD.exe [0x5002]
========= Host Frame:./fasMD.exe [0x77fe]
========= Host Frame:./fasMD.exe [0x3b34]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed5d]
========= Host Frame:./fasMD.exe [0x3a29]
-------
查到这里的时候,有个朋友说可能不是 cudaMemcpyAsync 的问题,好像有道理,于是我去查了一下 cal_neighborelist_devkernel2c 这个 kernel 子程序。把对这个 kernel 的调用注释掉之后,程序可以运行。
但是我仔细检查了这个 kernel,也没能找出问题的所在。
我尝试屏蔽部分代码,或者添加一些测试语句,发现可能是里面的 shared 变量有问题,但我还是找不到问题的所在。
下面我贴出这个kernel,希望高手作答,谢谢。
--------------------------------------------------------------------------------------------------------------
use constant
implicit none
!
!--- DUMMY VARIABLES
integer, value::NBPC,IP0, NC,NCX,NCY,NCZ, PDX, PDY, PDZ, NPART, CFROM, CTO, mxNAPDEV, mxKVOIS, IA1th0
real(KINDDF), value::cra011,cra021,cra031,cra012,cra022,cra032,cra013,cra023,cra033
real(KINDDF), device::XP(3,NPART)
integer, device::ITYP(NPART), NAC(NC), IA1th(NC)
integer,device::KVOIS(mxNAPDEV)
integer,device::INDI(mxKVOIS,mxNAPDEV)
integer(1),device::mvl(mxKVOIS,mxNAPDEV)

!--- Local variables
!nonshared by threads
real(KINDSF)::POS(3), SEP(3)
integer::IB, IB0, IT, IA, IA0,IA00, JA, NN, I, J, K, ITY

!variables share by all thread
integer,shared::NB, IC, IS0, STARTA, NCXY, NCXYZ,IX0, IY0, IZ0,IC0, NACC0
integer,shared::NS,NACC, IAC, IACE, FROM, TO
integer, shared, dimension(mp_NNC)::CID, IX,IY, IZ, OUT

integer(1), shared, dimension(mp_NNC)::I_mvl
real(KINDSF), shared, dimension(3,mp_NNC)::CXYZ
real(KINDSF), shared, dimension(3,mp_BLOCKSIZE)::SPOS
real(KINDSF), shared::RC2(mp_MXGROUP,mp_MXGROUP)
integer, shared, dimension(mp_BLOCKSIZE)::JTY

IB = (blockidx%y-1) * griddim%x + blockidx%x-1

NB = blockdim%x*blockdim%y
IB0 = IB/NBPC

IP0 = (IB-IB0*NBPC)*NB

IB0 = IB0 + CFROM-1

if(IB0 .GE. CTO) return

IT = (threadidx%y-1)*blockdim%x + threadidx%x
if(IT .EQ. 1) then
RC2(1:mp_MXGROUP,1:mp_MXGROUP) = dcm_RU2(1:mp_MXGROUP,1:mp_MXGROUP)
NCXY = NCX*NCY
NCXYZ = NCXY*NCZ
IS0 = IB0/NCXYZ
IC = IB0-IS0*NCXYZ
IZ0 = IC/NCXY
IY0 = (IC-IZ0*NCXY)/NCX
IX0 = IC-IZ0*NCXY-IY0*NCX
IZ0 = IZ0 + 1
IY0 = IY0 + 1
IX0 = IX0 + 1
IC = IB0 + 1
STARTA = IA1th0 ! IA1th(CFROM)
NACC0 = NAC(IC)
end if
call syncthreads()
if(NACC0 .LE. 0) return
if(IT .LE. mp_NNC) then
OUT(IT)= 0
IZ(IT) = IZ0+mp_NIZ(IT)
IY(IT) = IY0+mp_NIY(IT)
IX(IT) = IX0+mp_NIX(IT)
I_mvl(IT) = 0
CXYZ(1:3,IT) = 0.0d0
If(PDX .AND. IT.GT.1) Then
IF( IX(IT).GT.NCX )THEN
IX(IT) = 1
CXYZ(1,IT) = CXYZ(1,IT) + cra011
CXYZ(2,IT) = CXYZ(2,IT) + cra021
CXYZ(3,IT) = CXYZ(3,IT) + cra031
I_mvl(IT) = ior( I_mvl(IT) , 1)
ELSE IF (IX(IT).LT.1) THEN
IX(IT) = NCX
CXYZ(1,IT) = CXYZ(1,IT) - cra011
CXYZ(2,IT) = CXYZ(2,IT) - cra021
CXYZ(3,IT) = CXYZ(3,IT) - cra031
I_mvl(IT) = ior( I_mvl(IT) , 2)
ENDIF
End If
If(PDY .AND. IT.GT.1) Then
IF( IY(IT).GT.NCY )THEN
IY(IT) = 1
CXYZ(1,IT) = CXYZ(1,IT) + cra012
CXYZ(2,IT) = CXYZ(2,IT) + cra022
CXYZ(3,IT) = CXYZ(3,IT) + cra032
I_mvl(IT) = ior( I_mvl(IT) , 4)
ELSE IF (IY(IT).LT.1) THEN
IY(IT) = NCY
CXYZ(1,IT) = CXYZ(1,IT) - cra012
CXYZ(2,IT) = CXYZ(2,IT) - cra022
CXYZ(3,IT) = CXYZ(3,IT) - cra032
I_mvl(IT) = ior( I_mvl(IT) , 8)
ENDIF
End If
If(PDZ .AND. IT.GT.1) Then
IF( IZ(IT).GT.NCZ )THEN
IZ(IT) = 1
CXYZ(1,IT) = CXYZ(1,IT) + cra013
CXYZ(2,IT) = CXYZ(2,IT) + cra023
CXYZ(3,IT) = CXYZ(3,IT) + cra033
I_mvl(IT) = ior( I_mvl(IT) , 16)
ELSE IF (IZ(IT).LT.1) THEN
IZ(IT) = NCZ
CXYZ(1,IT) = CXYZ(1,IT) - cra013
CXYZ(2,IT) = CXYZ(2,IT) - cra023
CXYZ(3,IT) = CXYZ(3,IT) - cra033
I_mvl(IT) = ior( I_mvl(IT) , 32)
ENDIF
End If
IF( IX(IT) .GT. NCX .OR. IX(IT) .LT. 1) OUT(IT) = 1
IF( IY(IT) .GT. NCY .OR. IY(IT) .LT. 1) OUT(IT) = 1
IF( IZ(IT) .GT. NCZ .OR. IZ(IT) .LT. 1) OUT(IT) = 1
IF(OUT(IT) .EQ. 0) then
CID(IT) = NCXY*(IZ(IT)-1)+NCX*(IY(IT)-1)+IX(IT)+IS0*NCXYZ
ELSE
CID(IT) = 0
END IF
END IF
call syncthreads()
IA = (IT-1) + IA1th(IC) + IP0
IA0 = IA - STARTA + 1
IA00 = IA - IA1th(IC)+1
if(IA00.LE.NACC0) then
POS(1:3)=XP(1:3,IA)
ITY = ITYP(IA)
NN = 0
endif
K=1
NACC = NAC(CID(K))

NS = (NACC-1)/NB+1

IAC = IA1th(CID(K))
IACE = IAC + NACC -1

DO J=1, NS, 1
FROM = min((J-1)*NB+IAC,IACE)
TO = min(FROM+NB-1, IACE)
if(IT+FROM-1<=IACE)then
SPOS(1:3,IT) = XP(1:3, IT+FROM-1) + CXYZ(1:3,K)
JTY(IT) = ITYP(IT+FROM-1)
end if
call syncthreads()

IF(IA00.LE.NACC0) then
DO I=FROM, TO
JA = I-FROM+1
SEP(1:3) = POS(1:3) - SPOS(1:3,JA)

if( SEP(1)*SEP(1)+SEP(2)*SEP(2)+SEP(3)*SEP(3) .LE. RC2(ITY,JTY(JA)) ) then
if(I.NE.IA) then
if(NN<mxKVOIS) then

NN = NN + 1
INDI(NN,IA0) = I
mvl(NN,IA0) = I_mvl(K)
end if
end if
end if
END DO
END IF
call syncthreads()
END DO
DO K=2, mp_NNC
IF(OUT(K)) cycle

NACC = NAC(CID(K))

NS = min((NACC-1)/NB+1, NACC)

IAC = IA1th(CID(K))


IACE = IAC + NACC -1
call syncthreads()

DO J=1, NS
FROM = min((J-1)*NB+IAC,IACE)
TO = min(FROM+NB-1, IACE)
if(IT+FROM-1<=IACE)then
SPOS(1:3,IT) = XP(1:3, IT+FROM-1) + CXYZ(1:3,K)
JTY(IT) = ITYP(IT+FROM-1)
endif
call syncthreads()

IF(IA00.LE.NACC0) then
DO I=FROM, TO
JA = I-FROM+1
SEP(1:3) = POS(1:3) - SPOS(1:3,JA)

if( SEP(1)*SEP(1)+SEP(2)*SEP(2)+SEP(3)*SEP(3) .LE. RC2(ITY,JTY(JA)) ) then
if(NN<mxKVOIS) then

NN = NN + 1
INDI(NN, IA0) = I
mvl(NN, IA0) = I_mvl(K)
end if
end if
END DO
END IF
call syncthreads()
END DO
END DO


IF(IA00 .LE. NACC0) then
KVOIS(IA0) = NN
END IF

return
...全文
1084 回复 打赏 收藏 转发到动态 举报
写回复
用AI写文章
回复
切换为时间正序
请发表友善的回复…
发表回复

2,408

社区成员

发帖
与我相关
我的任务
社区描述
高性能计算
社区管理员
  • 高性能计算社区
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧