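// NOTE: navigation text left over from the forum page this code was copied
// from, kept here as a comment so that the file compiles:
//   579 community members | Post | Related to me | My tasks | Share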
#include "device_launch_parameters.h"
#include<iostream>
#include<stdio.h>
#include "cuda_runtime.h"
#include "device_functions.h"
#define TILE_WIDTH 2
// Dimensions of the 4x4 demo matrices
#define WIDTH 4
#define HEIGHT 4
/*
 * Naive dense matrix product C = A * B for row-major double matrices.
 *
 * Each thread computes one element C(row, col), where row/col are derived
 * from the 2D block and thread indices. The launch grid must therefore
 * cover at least (_wb columns) x (rows-of-A rows); surplus threads exit
 * through the bounds guard.
 *
 *   _A   device pointer, rows x _wa
 *   _B   device pointer, _wa x _wb
 *   _C   device pointer, rows x _wb (written)
 *   _wa  number of columns of A (== number of rows of B)
 *   _wb  number of columns of B (== number of columns of C)
 *   _ha  number of rows of A; when omitted (negative) A is taken to be
 *        square (_ha == _wa), which is the only shape the five-argument
 *        form of this kernel could ever address.
 *
 * No shared memory is used.
 */
__global__ void MatrixMulKernel(double * _A, double *_B, double * _C, int _wa, int _wb, int _ha = -1)
{
    const int rowsA = (_ha < 0) ? _wa : _ha;

    // Locate the output element owned by this thread.
    const int row = blockIdx.y * blockDim.y + threadIdx.y; // row index in C
    const int col = blockIdx.x * blockDim.x + threadIdx.x; // column index in C

    // C is rowsA x _wb: guard rows against rowsA and columns against _wb.
    if (row >= rowsA || col >= _wb)
    {
        return;
    }

    // _A[row*_wa + i] walks one row of A, _B[i*_wb + col] walks one column of B.
    double sum = 0;
    for (int i = 0; i < _wa; ++i)
    {
        sum += _A[(size_t)row * _wa + i] * _B[(size_t)i * _wb + col];
    }

    // C has _wb columns, so its row stride is _wb (not _wa).
    _C[(size_t)row * _wb + col] = sum;
}

/* Prints a diagnostic and returns false when a CUDA runtime call failed. */
static bool MatrixMulCudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Host wrapper: computes P = M * N on GPU 0.
 *
 *   M    host pointer, _ha x _wa, row-major (read)
 *   N    host pointer, _wa x _wb, row-major (read)
 *   P    host pointer, _ha x _wb, row-major (written); the caller must
 *        provide room for _ha * _wb doubles
 *   _wa  columns of M / rows of N
 *   _ha  rows of M
 *   _wb  columns of N
 *
 * On invalid arguments or any CUDA failure a message is written to stderr,
 * the device buffers are released, and P is left unmodified.
 */
void MatrixMultiplication_CUDA1(const double* M, const double* N, double* P, int _wa, int _ha, int _wb)
{
    if (M == NULL || N == NULL || P == NULL || _wa <= 0 || _ha <= 0 || _wb <= 0)
    {
        fprintf(stderr, "MatrixMultiplication_CUDA1: invalid argument\n");
        return;
    }

    double *Md = NULL, *Nd = NULL, *Pd = NULL;

    // Byte sizes, computed in size_t so large matrices do not overflow int.
    const size_t size_a = (size_t)_wa * _ha * sizeof(double);
    const size_t size_b = (size_t)_wb * _wa * sizeof(double);
    const size_t size_c = (size_t)_ha * _wb * sizeof(double);

    // Short-circuit chain: the first failing step stops everything after it.
    bool ok = MatrixMulCudaOk(cudaSetDevice(0), "cudaSetDevice")
        && MatrixMulCudaOk(cudaMalloc((void**)&Md, size_a), "cudaMalloc(M)")
        && MatrixMulCudaOk(cudaMalloc((void**)&Nd, size_b), "cudaMalloc(N)")
        && MatrixMulCudaOk(cudaMalloc((void**)&Pd, size_c), "cudaMalloc(P)")
        && MatrixMulCudaOk(cudaMemcpy(Md, M, size_a, cudaMemcpyHostToDevice), "copy of M to device")
        && MatrixMulCudaOk(cudaMemcpy(Nd, N, size_b, cudaMemcpyHostToDevice), "copy of N to device");

    if (ok)
    {
        // One thread per element of P; ceil-div so that dimensions which are
        // not multiples of TILE_WIDTH are still fully covered.
        dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
        dim3 dimGrid((_wb + TILE_WIDTH - 1) / TILE_WIDTH,
                     (_ha + TILE_WIDTH - 1) / TILE_WIDTH);
        MatrixMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, _wa, _wb, _ha);

        // cudaGetLastError reports a bad launch configuration; the blocking
        // cudaMemcpy that follows surfaces errors raised while the kernel ran.
        ok = MatrixMulCudaOk(cudaGetLastError(), "MatrixMulKernel launch")
            && MatrixMulCudaOk(cudaMemcpy(P, Pd, size_c, cudaMemcpyDeviceToHost), "copy of P to host");
    }

    // Release the device matrices; cudaFree on a null pointer is a no-op, so
    // this is safe on every failure path as well.
    cudaFree(Md);
    cudaFree(Nd);
    cudaFree(Pd);
}
/*
 * Demo driver: multiplies two fixed 4x4 matrices with
 * MatrixMultiplication_CUDA1 and prints A, B and the result C,
 * four values per row.
 */
int main()
{
    const int kDim = 4;             // matrices are kDim x kDim
    const int kCount = kDim * kDim; // number of elements per matrix

    double A[kCount] = { 1,2,3,4 ,1,2,3,4,1,2,3,4,1,2,3,4};
    double B[kCount] = { 5,6,7,8,5,6,7,8,5,6,7,8,5,6,7,8};
    // The result buffer must hold all kCount elements: the wrapper copies the
    // full product into it. (It was previously a one-element array, which the
    // copy and the print loop below overran.)
    double C[kCount] = {0};

    printf("A矩阵\n");
    for (int i = 0; i < kCount; i++)
    {
        // Start a new output line at the beginning of every matrix row.
        if (i % kDim == 0)
        {
            printf("\n");
        }
        printf(" %5f", A[i]);
    }
    printf("\n");

    printf("B矩阵\n");
    for (int i = 0; i < kCount; i++)
    {
        if (i % kDim == 0)
        {
            printf("\n");
        }
        printf(" %5f", B[i]);
    }
    printf("\n");

    printf("结果C矩阵\n");
    MatrixMultiplication_CUDA1(A, B, C, kDim, kDim, kDim);
    for (int i = 0; i < kCount; i++)
    {
        // Lay the result out as a matrix.
        if (i % kDim == 0)
        {
            printf("\n");
        }
        printf(" %f", C[i]);
    }

    // Keeps the console window open (Windows "pause" command).
    system("pause");
    return 0;
}