579
社区成员
发帖
与我相关
我的任务
分享
! Matrix storage: A, B and C carry the CUDA Fortran managed attribute;
! gold is an ordinary host allocatable array.
real, managed, allocatable :: A(:,:), B(:,:), C(:,:)
real, allocatable :: gold(:,:)
! Matrix data: A, B, C and gold are host arrays; dA, dB and dC carry the
! CUDA Fortran device attribute.
real, allocatable, dimension(:,:) :: A, B, C, gold
real, allocatable, device, dimension(:,:) :: dA, dB, dC
. . .
! Allocate the N x N host matrices and fill the two inputs with random values.
allocate(A(N,N))
allocate(B(N,N))
allocate(C(N,N))
allocate(gold(N,N))
call random_number(A)
call random_number(B)
! Allocate the device matrices. Assigning a host array to a device array
! copies the data from host to device.
allocate(dA(N,N))
allocate(dB(N,N))
allocate(dC(N,N))
dA = A
dB = B
dC = 0.0
alpha = 1
beta = 0
m = N
k = N
! Grid and thread-block shape used by the kernel launch below.
! NOTE(review): N/256 and N/16 are integer divisions, so N is presumably a
! multiple of 256 -- confirm against the kernel's tiling.
blocks = dim3(N/256, N/16, 1)
threads = dim3(16, 16, 1)
! CPU call on the host inputs; gold is presumably the reference result that
! C is compared against later -- confirm.
call sgemm_cpu(A, B, gold, m, N, k, alpha, beta)
! timing experiment
time = 0.0
istat = cudaEventRecord(start, 0)
do j = 1, NREPS
  ! Launch the kernel with the grid/block shape set above.
  call sgemmNN_16x16<<<blocks, threads>>>(dA, dB, dC, m, N, k, alpha, beta)
end do
istat = cudaEventRecord(stop, 0)
! Wait for the launched work to finish before reading the timer.
istat = cudaDeviceSynchronize()
istat = cudaEventElapsedTime(time, start, stop)
! cudaEventElapsedTime reports milliseconds; convert to seconds per launch.
time = time / (NREPS*1.0e3)
! Copy the result from the device array back to the host array.
C = dC
! Matrix data: A, B and C carry the CUDA Fortran managed attribute, so the
! same variables are used by both the CPU routine and the kernel below.
! gold is an ordinary host array.
real, managed, allocatable, dimension(:,:) :: A, B, C
real, allocatable, dimension(:,:) :: gold
. . .
! Allocate the N x N matrices, fill the two inputs with random values and
! zero the output.
allocate(A(N,N))
allocate(B(N,N))
allocate(C(N,N))
allocate(gold(N,N))
call random_number(A)
call random_number(B)
C = 0.0
alpha = 1
beta = 0
m = N
k = N
! Grid and thread-block shape used by the kernel launch below.
! NOTE(review): N/256 and N/16 are integer divisions, so N is presumably a
! multiple of 256 -- confirm against the kernel's tiling.
blocks = dim3(N/256, N/16, 1)
threads = dim3(16, 16, 1)
! CPU call on the same A and B; gold is presumably the reference result that
! C is compared against later -- confirm.
call sgemm_cpu(A, B, gold, m, N, k, alpha, beta)
! timing experiment
time = 0.0
istat = cudaEventRecord(start, 0)
do j = 1, NREPS
  ! Launch the kernel with the grid/block shape set above; the managed
  ! arrays are passed directly, with no separate device copies.
  call sgemmNN_16x16<<<blocks, threads>>>(A, B, C, m, N, k, alpha, beta)
end do
istat = cudaEventRecord(stop, 0)
! Wait for the launched work to finish before reading the timer.
istat = cudaDeviceSynchronize()
istat = cudaEventElapsedTime(time, start, stop)
! cudaEventElapsedTime reports milliseconds; convert to seconds per launch.
time = time / (NREPS*1.0e3)