353
社区成员
发帖
与我相关
我的任务
分享
// One Gauss-Jordan elimination step for an m x m row-major matrix (leading
// dimension m). Launched as <<<1, m>>> — one thread per row. Row i is the
// pivot row; every other thread subtracts a multiple of row i from its own
// row so that column i becomes zero outside the pivot.
// A is the matrix being reduced; B accumulates the inverse in lockstep.
// No inter-thread sync is needed: each thread writes only its own row and
// reads only the pivot row, which no thread modifies.
// NOTE(review): no check for a zero/tiny pivot A[i*m+i] — assumes the matrix
// needs no pivoting; confirm inputs are well conditioned.
__global__ void matrixinver(float *A,float *B,int i)
{
const int tid=threadIdx.x;
// The pivot row itself is left untouched in this step.
if (tid==i) return;
// Multiplier is read before the loop clears A[tid*m+i] at k==i.
const float factor=A[tid*m+i]/A[i*m+i];
for (int k=0;k<m;k++)
{
A[tid*m+k]-=A[i*m+k]*factor;
B[tid*m+k]-=B[i*m+k]*factor;
}
}
// Host-side driver (timing experiment) for the matrixinver kernel.
// tt/tt_gpu: m x m matrix being inverted; B/B_gpu: matrix accumulating the inverse.
// NOTE(review): return codes of all CUDA calls are ignored, and the <<<1, m>>>
// launch caps m at the max threads per block — confirm m <= 1024.
cudaMalloc((void**) &tt_gpu,sizeof(float)*m*m);
cudaMalloc((void**) &B_gpu,sizeof(float)*m*m);
// Rows are contiguous on both sides, so source and destination pitch both
// equal the row width, m floats.
cudaMemcpy2D(tt_gpu,sizeof(float)*m,tt,sizeof(float)*m,sizeof(float)*m,m,cudaMemcpyHostToDevice);
cudaMemcpy2D(B_gpu,sizeof(float)*m,B,sizeof(float)*m,sizeof(float)*m,m,cudaMemcpyHostToDevice);
now=clock();
// One elimination step per pivot row. Launches on the default stream are
// ordered, so no explicit sync is needed between iterations.
for (int i=0;i<m;i++)
{
matrixinver<<<1,m,0>>>(tt_gpu,B_gpu,i);
}
// Blocking copies back to the host; these also wait for the queued kernels,
// so `then` measures kernel completion, not just launch.
cudaMemcpy2D(tt,sizeof(float)*m,tt_gpu,sizeof(float)*m,sizeof(float)*m,m,cudaMemcpyDeviceToHost);
cudaMemcpy2D(B,sizeof(float)*m,B_gpu,sizeof(float)*m,sizeof(float)*m,m,cudaMemcpyDeviceToHost);
then=clock();
//Only the kernel invocation and the kernel itself
#define N 1000
#define THREAD_NUM 100
#define BLOCK_NUM 10
#define BLOCK_SIZE 32
#define BLOCK_DIM 32
// Times one launch of the matrixinv tile kernel over an N x N problem.
// a/b: host row-major N x N matrices with leading dimensions lda/ldb (in
// elements); both are copied to pitched device buffers, processed, and
// copied back in place. Returns elapsed host clock ticks (allocation,
// transfers, and kernel included).
// NOTE(review): the elimination loop is commented out — only pivot 0 is run.
clock_t matinvCUDA(float* a, int lda, float* b, int ldb)
{
clock_t start,finish;
// NULL-init so a failed cudaMallocPitch leaves well-defined pointers for cudaFree.
float *ac = NULL, *bc = NULL;
start=clock();
size_t pitch_a,pitch_b;
// Pitched allocation keeps every device row aligned for coalesced access.
cudaMallocPitch((void**) &ac, &pitch_a, sizeof(float) * N, N);
cudaMallocPitch((void**) &bc, &pitch_b, sizeof(float) * N, N);
cudaMemcpy2D(ac, pitch_a, a, sizeof(float) * lda,sizeof(float) * N, N, cudaMemcpyHostToDevice);
cudaMemcpy2D(bc, pitch_b, b, sizeof(float) * ldb,sizeof(float) * N, N, cudaMemcpyHostToDevice);
int bx = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;   // ceil-div grid over N columns/rows
dim3 blocks(bx, bx);
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
//for (int i=0;i<N;i++)
//{
// Leading dimensions are passed in elements, hence pitch/sizeof(float).
matrixinv<<<blocks,threads>>>(ac, pitch_a/sizeof(float), bc, pitch_b/sizeof(float), N, 0);
//}
// Surface launch-config errors and wait for completion before copying back.
// NOTE(review): the interface returns only elapsed ticks, so there is no
// reporting channel; on failure the copies below inherit the sticky error
// and the host buffers keep their previous contents.
cudaError_t err = cudaGetLastError();
if (err == cudaSuccess)
err = cudaDeviceSynchronize();
cudaMemcpy2D(a, sizeof(float) * N, ac, pitch_a,sizeof(float) * N, N, cudaMemcpyDeviceToHost);
cudaMemcpy2D(b, sizeof(float) * N, bc, pitch_b,sizeof(float) * N, N, cudaMemcpyDeviceToHost);
cudaFree(ac);
cudaFree(bc);
finish=clock();
return finish-start;
}
// Experimental tile kernel: stages a BLOCK_SIZE x BLOCK_SIZE tile of a and b
// in shared memory, adds 100 to every in-bounds element (test marker), and
// writes the tile back. Expects a 2D launch with blockDim == (BLOCK_SIZE,
// BLOCK_SIZE); lda/ldb are row pitches in elements; n is the matrix order.
// Parameter k is currently unused (reserved for the pivot index).
__global__ static void matrixinv(float* a, size_t lda, float* b, size_t ldb, int n, int k)
{
__shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];
const int tidc = threadIdx.x;
const int tidr = threadIdx.y;
unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
// Bounds test uses the n parameter (the original compared against the N
// macro, leaving n dead; the caller passes N, so behavior is unchanged).
const bool inBounds = (xIndex < (unsigned int)n) && (yIndex < (unsigned int)n);
if (inBounds)
{
matA[tidr][tidc] = a[yIndex * lda + xIndex];
matB[tidr][tidc] = b[yIndex * ldb + xIndex];
// test marker: guarded so tail threads never touch uninitialized shared memory
matA[tidr][tidc] += 100;
matB[tidr][tidc] += 100;
}
// Whole-block barrier: reached by every thread, outside the divergent branch.
__syncthreads();
if (inBounds)
{
a[yIndex * lda + xIndex] = matA[tidr][tidc];
b[yIndex * ldb + xIndex] = matB[tidr][tidc];
}
}
//Kernel invocation: the outer loop walks the pivot rows starting from the first row, eliminating with row i as the pivot (i is the pivot row index). This implementation is still slow — only about 4s faster than the serial version — because it reads global memory too much.
// Gauss-Jordan driver: one kernel launch per pivot row i. The dynamic shared
// memory allocation (sizeof(float)*N*2) holds the pivot row of a (N floats)
// followed by the pivot row of b (N floats).
// NOTE(review): cudaThreadSynchronize is deprecated (use cudaDeviceSynchronize),
// and it is redundant here for correctness — same-stream launches are already
// ordered — while serializing the launch queue each iteration.
for (int i=0;i<N;i++)
{
matrixinv<<<BLOCK_NUM,THREAD_NUM,sizeof(float)*N*2>>>
(ac,pitch_a/sizeof(float), bc, pitch_b/sizeof(float), i);
cudaThreadSynchronize();
}
// One Gauss-Jordan elimination step with row i as the pivot row, over N rows
// split across BLOCK_NUM blocks of THREAD_NUM threads (one thread per row).
// a is the matrix being reduced, b accumulates the inverse; lda/ldb are row
// pitches in elements. Dynamic shared memory layout: shared[0..N-1] = pivot
// row of a, shared[N..2N-1] = pivot row of b.
// Fixes vs. the original: the pivot row is loaded cooperatively (each thread
// copies a disjoint strided slice) instead of every thread racing to write
// all 2N elements, and the pivot element is read from the staged row rather
// than through an unsynchronized `__shared__ float ii` written by all threads.
__global__ static void matrixinv(float* a, size_t lda, float* b, size_t ldb, int i)
{
const int bid=blockIdx.x;
const int tid=threadIdx.x;
const int row=bid*THREAD_NUM+tid;
extern __shared__ float shared[];
// Cooperative, conflict-free staging of the pivot row of both matrices.
for (int k=tid;k<N;k+=THREAD_NUM)
{
shared[k]=a[i*lda+k];
shared[N+k]=b[i*ldb+k];
}
// All threads reach this barrier (the loop above is uniformly guarded).
__syncthreads();
// Pivot element a[i][i] is already staged; row i is never modified, so this
// matches the global value. NOTE(review): no zero-pivot check / pivoting.
const float pivot=shared[i];
// Guard row < N for robustness; today's grid is exactly N threads.
if (row!=i && row<N)
{
const float factor=a[row*lda+i]/pivot;
for (int k=0;k<N;k++)
{
a[row*lda+k]-=shared[k]*factor;
b[row*ldb+k]-=shared[N+k]*factor;
}
}
}