602
社区成员
发帖
与我相关
我的任务
分享
/*
 * Dense matrix product C = A * B on row-major float matrices.
 *
 *   A is Mdim x Ndim, B is Ndim x Pdim, C is Mdim x Pdim.
 *
 * One work-item computes one element of C: global id 0 selects the row (i)
 * and global id 1 selects the column (j).
 */
kernel void mconv(
    int Mdim,
    int Ndim,
    int Pdim,
    global float *A,
    global float *B,
    global float *C
)
{
    int i = get_global_id(0);
    int j = get_global_id(1);

    /* Valid indices are 0..Mdim-1 and 0..Pdim-1, so the upper bounds are
     * exclusive: work-items at i == Mdim or j == Pdim (e.g. from a padded
     * NDRange) must exit without touching memory. */
    if (i < 0 || j < 0 || i >= Mdim || j >= Pdim)
        return;

    float tmp = 0.0f;
    for (int k = 0; k < Ndim; k++)
    {
        tmp += A[i * Ndim + k] * B[k * Pdim + j];
    }
    C[i * Pdim + j] = tmp;
}
/*
 * Dense matrix product C = A * B on row-major float matrices.
 *
 *   A[N][P]  B[P][M]  C[N][M]
 *
 * One work-item computes one element of C: global id 0 selects the row
 * (i < Ndim) and global id 1 selects the column (j < Mdim).
 *
 * Debugging note: the original file carried a misspelled
 * "#pragma OPENCl_EXTENSION cl_amd_printf:enable" line, which is not a valid
 * extension directive. If printf debugging is needed on a platform that
 * requires the extension, use
 *   #pragma OPENCL EXTENSION cl_amd_printf : enable
 */
__kernel void mmul(const int Mdim,const int Ndim,const int Pdim,__global float *A,__global float *B,__global float *C)
{
    int i = get_global_id(0);
    int j = get_global_id(1);

    if ((i < Ndim) && (j < Mdim))
    {
        float tmp = 0.0f;
        for (int k = 0; k < Pdim; k++)
        {
            /* Row i of A has Pdim elements; row k of B has Mdim elements. */
            tmp += A[i * Pdim + k] * B[k * Mdim + j];
        }
        /* Each row of C has Mdim elements. */
        C[i * Mdim + j] = tmp;
    }
}
/*
 * Dense matrix product outputC = inputA * inputB on row-major float matrices.
 *
 *   inputA  is heightA x widthA
 *   inputB  is heightB x widthB (the inner loop runs over widthA rows of it)
 *   outputC is heightA x widthB
 *
 * One work-item computes one element of outputC: global id 1 selects the row
 * and global id 0 selects the column.
 */
__kernel void simpleMultiply(
    __global float *outputC,
    int widthA,
    int heightA,
    int widthB,
    int heightB,
    __global float *inputA,
    __global float *inputB)
{
    /* Global position in Y direction */
    int row = get_global_id(1);
    /* Global position in X direction */
    int col = get_global_id(0);

    /* Work-items outside the result matrix (e.g. when the NDRange is rounded
     * up to a multiple of the work-group size) must not touch memory. */
    if (row >= heightA || col >= widthB)
        return;

    /* Accumulate the dot product of row `row` of A and column `col` of B. */
    float sum = 0.0f;
    for (int i = 0; i < widthA; i++)
    {
        sum += inputA[row * widthA + i] * inputB[i * widthB + col];
    }
    outputC[row * widthB + col] = sum;
}
主机端程序(C++;运行时从 kernel.cl 读取上面的内核源码):
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <CL\cl.h>
using namespace std;
#define ORDER 1000
#define DIM 2
// Store 1 in the first `count` elements of `dst`; does nothing when count <= 0.
static void initmat_fill_ones(float *dst, int count)
{
    int pos = 0;
    while (pos < count)
    {
        dst[pos] = 1;
        ++pos;
    }
}

// Set the input matrices to a known value.
//   A: the first Mdim*Pdim elements are set to 1 (A[M][P]).
//   B: the first Pdim*Ndim elements are set to 1 (B[P][N]).
//   C: accepted but neither read nor written.
void initmat(int Mdim,int Pdim,int Ndim,float *A,float *B,float *C)
{
    (void)C;
    initmat_fill_ones(A, Mdim * Pdim);
    initmat_fill_ones(B, Pdim * Ndim);
}
// Print a timing line followed by the first two elements of C.
//   run_time is multiplied by 1.0e-6 before printing and labelled "msecs."
//   (presumably the caller passes nanoseconds -- confirm at the call site).
//   C must have at least two readable elements.
//   Mdim, Ndim and Pdim are accepted but not used.
void results(int Mdim,int Ndim,int Pdim,float *C,double run_time)
{
    (void)Mdim;
    (void)Ndim;
    (void)Pdim;

    const double scaled_time = run_time * 1.0e-6;
    std::cout << "Consume time is: " << scaled_time << " msecs." << std::endl;

    // Print two values as a quick sanity check of the result.
    const float first = C[0];
    const float second = C[1];
    std::cout << first << " " << second << std::endl;
}
int main(int argc,char **argv)
{
float *A; //A matrix
float *B; //B matrix
float *C; //C matrix(C=A*B)
int Mdim,Ndim,Pdim; //A[N][P] B[P][M] C[N][M]
int err; //err code from OpenCL
int szA,szB,szC; //number of matrix elements
size_t global[DIM]; //global domain size
size_t local[DIM]; //local domain size
cl_device_id device_id; //Computer device id
cl_context context; //Computer context
cl_command_queue commands; //copmputer command queue
cl_program program; //computer program
cl_kernel kernel; //computer kernel
cl_uint nd; //number of dims in NDRange
cl_mem a_in; //Memory object for A matrix
cl_mem b_in; //Memory object for B matrix
cl_mem c_out; //Memory object for C matrix
Ndim = ORDER; Pdim=ORDER; Mdim = ORDER;
//------------------------------------------------------------------------------------
//Set up the OpenCL platform using whichever platform is "first"
//------------------------------------------------------------------------------------
cl_uint numPlatforms;
cl_platform_id firstPlatformId;
err = clGetPlatformIDs(1,&firstPlatformId,&numPlatforms);
err = clGetDeviceIDs(firstPlatformId,CL_DEVICE_TYPE_GPU,1,&device_id,NULL);
cl_context_properties properties[]={
CL_CONTEXT_PLATFORM,
(cl_context_properties)firstPlatformId,
0
};
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);
commands = clCreateCommandQueue(context,device_id,CL_QUEUE_PROFILING_ENABLE,&err);
//Set up matrices
szA = Ndim*Pdim; szB = Pdim*Mdim; szC = Ndim*Mdim;
A = (float *)malloc(szA *sizeof(float));
B = (float *)malloc(szB*sizeof(float));
C = (float *)malloc(szC *sizeof(float));
//function to set matrices to known values
initmat(Mdim,Ndim,Pdim,A,B,C);
//-------------------------------------------------------------------------------------
//Set up the buffers,initialize matrices ,and write them
//into global memory
//-------------------------------------------------------------------------------------
a_in = clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(float)*szA,NULL,NULL);
b_in = clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(float)*szB,NULL,NULL);
c_out = clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(float)*szC,NULL,NULL);
//Create the computer program from the source buffer
std::ifstream srcFile("kernel.cl");
std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));
const char *src = srcProg.c_str();
size_t length = srcProg.length();
program = clCreateProgramWithSource(context,1,(const char **)&src,&length,&err);
//Build the program
err = clBuildProgram(program,0,NULL,NULL,NULL,NULL);
if(err != CL_SUCCESS)
{
char buildLog[16384];
clGetProgramBuildInfo(program,device_id,CL_PROGRAM_BUILD_LOG,
sizeof(buildLog),buildLog,NULL);
cerr<<"Error in kernel:"<<buildLog<<endl;
clReleaseProgram(program);
system("pause");
exit(EXIT_FAILURE);
}
//Create the computer kernel from the program
kernel = clCreateKernel(program,"simpleMultiply",&err);
//Write the A and B matrices into compute device memory
err = clEnqueueWriteBuffer(commands,a_in,CL_TRUE,0,sizeof(float)*szA,A,0,NULL,NULL);
err = clEnqueueWriteBuffer(commands,b_in,CL_TRUE,0,sizeof(float)*szB,B,0,NULL,NULL);
//Set the arguments to our compute kernel
err = 0;
err = clSetKernelArg(kernel,0,sizeof(cl_mem),&c_out);
err |= clSetKernelArg(kernel,1,sizeof(cl_int),&Pdim);
err |= clSetKernelArg(kernel,2,sizeof(cl_int),&Ndim);
err |= clSetKernelArg(kernel,3,sizeof(cl_int),&Mdim);
err |= clSetKernelArg(kernel,4,sizeof(cl_int),&Pdim);
err |= clSetKernelArg(kernel,5,sizeof(cl_mem),&a_in);
err |= clSetKernelArg(kernel,6,sizeof(cl_mem),&b_in);
cl_event prof_event;
//Execute the kernel over the entire range of C matrix elements
global[0] = (size_t)Mdim; global[1] = (size_t)Ndim; nd = 2;
err = clEnqueueNDRangeKernel(commands,kernel,nd,NULL,global,NULL,0,NULL,&prof_event);
//size_t globalws[2] = {Mdim,Ndim};
//size_t localws[2] ={4,4};
//err = clEnqueueNDRangeKernel(commands,kernel,2,NULL,globalws,localws,0,NULL,&prof_event);
//wait for the commands to complete before reading back results
clFinish(commands);
cl_ulong ev_start_time = (cl_ulong)0;
cl_ulong ev_end_time = (cl_ulong)0;
err = clGetEventProfilingInfo(prof_event,
CL_PROFILING_COMMAND_START,
sizeof(cl_ulong),
&ev_start_time,
NULL);
err = clGetEventProfilingInfo(prof_event,
CL_PROFILING_COMMAND_END,
sizeof(cl_ulong),
&ev_end_time,
NULL);
//Read back the results from the computer device
err = clEnqueueReadBuffer(commands,c_out,CL_TRUE,0,sizeof(float)*szC,C,0,NULL,NULL);
double run_time = (double)ev_end_time - ev_start_time;
results(Mdim,Ndim,Pdim,C,run_time);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseMemObject(a_in);
clReleaseMemObject(b_in);
clReleaseMemObject(c_out);
clReleaseCommandQueue(commands);
clReleaseContext(context);
system("pause");
return 0;
}