353
社区成员
发帖
与我相关
我的任务
分享
#include "cutil_inline.h"
#include "cublas.h"
#define N 1024
/*
 * CPU reference product C = A * B for N x N single-precision matrices.
 *
 * All three buffers are indexed column-major: element (row, col) lives at
 * offset col * N + row.  Each output element is accumulated in a float, with
 * the inner index running upward from 0 to N - 1.
 *
 * A, B : input matrices, N * N floats each (read only)
 * C    : output matrix, N * N floats (every element is overwritten)
 */
void simple_sgemm(const float *A, const float *B, float *C) {
    for (int row = 0; row < N; ++row) {
        for (int col = 0; col < N; ++col) {
            /* Column `col` of B is contiguous in memory. */
            const float *b_col = B + col * N;
            float acc = 0;
            for (int p = 0; p < N; ++p) {
                acc += A[p * N + row] * b_col[p];
            }
            C[col * N + row] = acc;
        }
    }
}
/*
 * Benchmarks cublasSgemm against the CPU reference simple_sgemm on two random
 * N x N matrices, then prints timings, throughput and a PASSED/FAILED verdict
 * based on the relative L2 error between the two results.
 *
 * Returns 0 when the benchmark ran to completion (whatever the verdict) and 1
 * when a host allocation or a CUBLAS call failed.  Host buffers, device
 * buffers, the CUBLAS context and the timer are released on every path.
 */
int main() {
    /* Everything is declared before the first `goto cleanup` so that no jump
       crosses an initialisation. */
    float *h_A = (float*)malloc(N*N*sizeof(float));
    float *h_B = (float*)malloc(N*N*sizeof(float));
    float *h_C = (float*)malloc(N*N*sizeof(float));
    float *h_C_ref = (float*)malloc(N*N*sizeof(float));
    float *d_A = 0, *d_B = 0, *d_C = 0;
    unsigned int timer1 = 0;
    float t0, gpu_t, cpu_t, error_norm = 0, ref_norm = 0;
    int cublas_ready = 0;   /* set once cublasInit succeeded, so shutdown is paired */
    int rc = 1;             /* pessimistic until the whole run has completed */

    cutCreateTimer(&timer1);
    cutStartTimer(timer1);
    printf("simpleCUBLAS test running..\n");

    if (!h_A || !h_B || !h_C || !h_C_ref) {
        fprintf(stderr, "!!!! host memory allocation error\n");
        goto cleanup;
    }
    if (cublasInit() != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");
        goto cleanup;
    }
    cublas_ready = 1;

    /* Fill both inputs with uniform random values in [0, 1]. */
    for (int i = 0; i < N*N; i++) {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    if (cublasAlloc(N*N, sizeof(float), (void**)&d_A) != CUBLAS_STATUS_SUCCESS ||
        cublasAlloc(N*N, sizeof(float), (void**)&d_B) != CUBLAS_STATUS_SUCCESS ||
        cublasAlloc(N*N, sizeof(float), (void**)&d_C) != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! device memory allocation error\n");
        goto cleanup;
    }
    if (cublasSetVector(N*N, sizeof(float), h_A, 1, d_A, 1) != CUBLAS_STATUS_SUCCESS ||
        cublasSetVector(N*N, sizeof(float), h_B, 1, d_B, 1) != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! device access error (write A/B)\n");
        goto cleanup;
    }

    /* Synchronise on both sides of the call so that only the GEMM itself is
       timed.  The timer delta is divided by 1000 and reported as seconds. */
    cudaThreadSynchronize();
    t0 = cutGetTimerValue(timer1);
    /* Host data is column-major (see simple_sgemm), so no transposition. */
    cublasSgemm('n', 'n', N, N, N, 1.0f, d_A, N, d_B, N, 0.0f, d_C, N);
    cudaThreadSynchronize();
    gpu_t = (cutGetTimerValue(timer1)-t0)/1000.0f;
    /* cublasSgemm returns void in this API; its status is fetched separately. */
    if (cublasGetError() != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! kernel execution error\n");
        goto cleanup;
    }

    if (cublasGetVector(N*N, sizeof(float), d_C, 1, h_C, 1) != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! device access error (read C)\n");
        goto cleanup;
    }

    t0 = cutGetTimerValue(timer1);
    simple_sgemm(h_A, h_B, h_C_ref);
    cpu_t = (cutGetTimerValue(timer1)-t0)/1000.0f;

    /* 2*N^3 floating-point operations per N x N matrix product. */
    printf("N=%4d, GPU=%.6fs(%.3fGflops), CPU=%.6fs(%.3fGflops)\n",
           N, gpu_t, 1e-9*N*N*N*2/gpu_t, cpu_t, 1e-9*N*N*N*2/cpu_t);

    /* Relative L2 error of the GPU result against the CPU reference. */
    for (int i = 0; i < N*N; i++) {
        float diff = h_C_ref[i]-h_C[i];
        error_norm += diff*diff;
        ref_norm += h_C_ref[i]*h_C_ref[i];
    }
    printf("Test %s\n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED");
    rc = 0;

cleanup:
    if (d_A) cublasFree(d_A);
    if (d_B) cublasFree(d_B);
    if (d_C) cublasFree(d_C);
    if (cublas_ready) cublasShutdown();
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);
    cutDeleteTimer(timer1);
    return rc;
}
cublasSetMatrix (N, N, sizeof(float), h_A, N, d_A, N);
#include "cutil_inline.h"
#include "cublas.h"
#define N 512
/*
 * CPU reference for the product of two N x N single-precision matrices,
 * written out TRANSPOSED: C[j][i] receives sum over k of A[i][k] * B[k][j].
 *
 * The outer loop walks j and the inner loop walks i; each sum is accumulated
 * in a float with k running upward from 0 to N - 1.
 *
 * A, B : input matrices (read only)
 * C    : output matrix (every element is overwritten)
 */
void simple_sgemm(const float A[N][N], const float B[N][N], float C[N][N]) {
    for (int col = 0; col < N; ++col) {
        for (int row = 0; row < N; ++row) {
            const float *a_row = A[row];
            float acc = 0;
            int p = 0;
            while (p < N) {
                acc += a_row[p] * B[p][col];
                ++p;
            }
            /* Note the swapped indices: the result is stored transposed. */
            C[col][row] = acc;
        }
    }
}
/* File-scope host matrices, N x N floats each.  main() fills h_A and h_B with
   random inputs, copies the GPU result into h_C, and has simple_sgemm write
   the CPU reference result into h_C_ref. */
float h_A[N][N], h_B[N][N], h_C[N][N], h_C_ref[N][N];
/*
 * Benchmarks cublasSgemm against the CPU reference simple_sgemm on the random
 * file-scope matrices h_A and h_B, then prints timings, throughput and a
 * PASSED/FAILED verdict based on the relative L2 error.
 *
 * Returns 0 when the benchmark ran to completion (whatever the verdict) and 1
 * when a CUBLAS call failed.  Device buffers, the CUBLAS context and the
 * timer are released on every path.
 */
int main() {
    /* Everything is declared before the first `goto cleanup` so that no jump
       crosses an initialisation. */
    float *d_A = 0, *d_B = 0, *d_C = 0;
    unsigned int timer1 = 0;
    float t0, gpu_t, cpu_t, error_norm = 0, ref_norm = 0;
    int cublas_ready = 0;   /* set once cublasInit succeeded, so shutdown is paired */
    int rc = 1;             /* pessimistic until the whole run has completed */

    cutCreateTimer(&timer1);
    cutStartTimer(timer1);
    printf("simpleCUBLAS test running..\n");

    if (cublasInit() != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");
        goto cleanup;
    }
    cublas_ready = 1;

    /* Fill both inputs with uniform random values in [0, 1]. */
    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++) {
            h_A[j][i] = rand()/(float)RAND_MAX;
            h_B[j][i] = rand()/(float)RAND_MAX;
        }

    if (cublasAlloc(N*N, sizeof(float), (void**)&d_A) != CUBLAS_STATUS_SUCCESS ||
        cublasAlloc(N*N, sizeof(float), (void**)&d_B) != CUBLAS_STATUS_SUCCESS ||
        cublasAlloc(N*N, sizeof(float), (void**)&d_C) != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! device memory allocation error\n");
        goto cleanup;
    }
    /* The row-major host arrays are copied to the device unchanged. */
    if (cublasSetVector(N*N, sizeof(float), h_A, 1, d_A, 1) != CUBLAS_STATUS_SUCCESS ||
        cublasSetVector(N*N, sizeof(float), h_B, 1, d_B, 1) != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! device access error (write A/B)\n");
        goto cleanup;
    }

    /* Synchronise on both sides of the call so that only the GEMM itself is
       timed.  The timer delta is divided by 1000 and reported as seconds. */
    cudaThreadSynchronize();
    t0 = cutGetTimerValue(timer1);
    /* CUBLAS reads the row-major buffers as column-major, i.e. as the
       transposes of h_A and h_B; 'T','T' undoes that.  The product is written
       column-major, so h_C[j][i] holds element (i, j) -- the same transposed
       layout that simple_sgemm produces in h_C_ref. */
    cublasSgemm('T', 'T', N, N, N, 1.0f, d_A, N, d_B, N, 0.0f, d_C, N);
    cudaThreadSynchronize();
    gpu_t = (cutGetTimerValue(timer1)-t0)/1000.0f;
    /* cublasSgemm returns void in this API; its status is fetched separately. */
    if (cublasGetError() != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! kernel execution error\n");
        goto cleanup;
    }

    if (cublasGetVector(N*N, sizeof(float), d_C, 1, h_C, 1) != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! device access error (read C)\n");
        goto cleanup;
    }

    t0 = cutGetTimerValue(timer1);
    simple_sgemm(h_A, h_B, h_C_ref);
    cpu_t = (cutGetTimerValue(timer1)-t0)/1000.0f;

    /* 2*N^3 floating-point operations per N x N matrix product. */
    printf("N=%4d, GPU=%.6fs(%.3fGflops), CPU=%.6fs(%.3fGflops)\n",
           N, gpu_t, 1e-9*N*N*N*2/gpu_t, cpu_t, 1e-9*N*N*N*2/cpu_t);

    /* Relative L2 error of the GPU result against the CPU reference. */
    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++) {
            float diff = h_C_ref[j][i]-h_C[j][i];
            error_norm += diff*diff;
            ref_norm += h_C_ref[j][i]*h_C_ref[j][i];
        }
    printf("Test %s\n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED");
    rc = 0;

cleanup:
    if (d_A) cublasFree(d_A);
    if (d_B) cublasFree(d_B);
    if (d_C) cublasFree(d_C);
    if (cublas_ready) cublasShutdown();
    cutDeleteTimer(timer1);
    return rc;
}
#include "cutil_inline.h"
#include "cublas.h"
#define N 512
/*
 * CPU reference product C = A * B for N x N single-precision matrices held in
 * ordinary C 2-D arrays: C[i][j] receives sum over k of A[i][k] * B[k][j].
 *
 * Each sum is accumulated in a float with k running upward from 0 to N - 1.
 *
 * A, B : input matrices (read only)
 * C    : output matrix (every element is overwritten)
 */
void simple_sgemm(const float A[N][N], const float B[N][N], float C[N][N]) {
    for (int row = 0; row < N; ++row) {
        const float *a_row = A[row];
        float *c_row = C[row];
        for (int col = 0; col < N; ++col) {
            float acc = 0;
            for (int p = 0; p < N; ++p) {
                acc += a_row[p] * B[p][col];
            }
            c_row[col] = acc;
        }
    }
}
/* File-scope host matrices, N x N floats each.  main() fills h_A and h_B with
   random inputs, copies the GPU result into h_C, and has simple_sgemm write
   the CPU reference result into h_C_ref. */
float h_A[N][N], h_B[N][N], h_C[N][N], h_C_ref[N][N];
/*
 * Benchmarks cublasSgemm against the CPU reference simple_sgemm on the random
 * file-scope matrices h_A and h_B, then prints timings, throughput and a
 * PASSED/FAILED verdict based on the relative L2 error.
 *
 * Returns 0 when the benchmark ran to completion (whatever the verdict) and 1
 * when a CUBLAS call failed.  Device buffers, the CUBLAS context and the
 * timer are released on every path.
 */
int main() {
    /* Everything is declared before the first `goto cleanup` so that no jump
       crosses an initialisation. */
    float *d_A = 0, *d_B = 0, *d_C = 0;
    unsigned int timer1 = 0;
    float t0, gpu_t, cpu_t, error_norm = 0, ref_norm = 0;
    int cublas_ready = 0;   /* set once cublasInit succeeded, so shutdown is paired */
    int rc = 1;             /* pessimistic until the whole run has completed */

    cutCreateTimer(&timer1);
    cutStartTimer(timer1);
    printf("simpleCUBLAS test running..\n");

    if (cublasInit() != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");
        goto cleanup;
    }
    cublas_ready = 1;

    /* Fill both inputs with uniform random values in [0, 1]. */
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++) {
            h_A[i][j] = rand()/(float)RAND_MAX;
            h_B[i][j] = rand()/(float)RAND_MAX;
        }

    if (cublasAlloc(N*N, sizeof(float), (void**)&d_A) != CUBLAS_STATUS_SUCCESS ||
        cublasAlloc(N*N, sizeof(float), (void**)&d_B) != CUBLAS_STATUS_SUCCESS ||
        cublasAlloc(N*N, sizeof(float), (void**)&d_C) != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! device memory allocation error\n");
        goto cleanup;
    }
    /* The row-major host arrays are copied to the device unchanged. */
    if (cublasSetVector(N*N, sizeof(float), h_A, 1, d_A, 1) != CUBLAS_STATUS_SUCCESS ||
        cublasSetVector(N*N, sizeof(float), h_B, 1, d_B, 1) != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! device access error (write A/B)\n");
        goto cleanup;
    }

    /* Synchronise on both sides of the call so that only the GEMM itself is
       timed.  The timer delta is divided by 1000 and reported as seconds. */
    cudaThreadSynchronize();
    t0 = cutGetTimerValue(timer1);
    /* CUBLAS reads the row-major buffers as column-major, i.e. as the
       transposes of h_A and h_B.  Passing the operands in swapped order
       computes transpose(B) * transpose(A) = transpose(A * B) in column-major
       terms, which is exactly A * B when read back as a row-major array. */
    cublasSgemm('n', 'n', N, N, N, 1.0f, d_B, N, d_A, N, 0.0f, d_C, N);
    cudaThreadSynchronize();
    gpu_t = (cutGetTimerValue(timer1)-t0)/1000.0f;
    /* cublasSgemm returns void in this API; its status is fetched separately. */
    if (cublasGetError() != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! kernel execution error\n");
        goto cleanup;
    }

    if (cublasGetVector(N*N, sizeof(float), d_C, 1, h_C, 1) != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! device access error (read C)\n");
        goto cleanup;
    }

    t0 = cutGetTimerValue(timer1);
    simple_sgemm(h_A, h_B, h_C_ref);
    cpu_t = (cutGetTimerValue(timer1)-t0)/1000.0f;

    /* 2*N^3 floating-point operations per N x N matrix product. */
    printf("N=%4d, GPU=%.6fs(%.3fGflops), CPU=%.6fs(%.3fGflops)\n",
           N, gpu_t, 1e-9*N*N*N*2/gpu_t, cpu_t, 1e-9*N*N*N*2/cpu_t);

    /* Relative L2 error of the GPU result against the CPU reference. */
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++) {
            float diff = h_C_ref[i][j]-h_C[i][j];
            error_norm += diff*diff;
            ref_norm += h_C_ref[i][j]*h_C_ref[i][j];
        }
    printf("Test %s\n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED");
    rc = 0;

cleanup:
    if (d_A) cublasFree(d_A);
    if (d_B) cublasFree(d_B);
    if (d_C) cublasFree(d_C);
    if (cublas_ready) cublasShutdown();
    cutDeleteTimer(timer1);
    return rc;
}