MALI T860 GPU运行耗时是CPU的2倍(GPU比CPU慢)
本人小白,初次接触GPU,对其中的好些问题以及运行原理不懂
我在ARM平台上通过OpenCL调用GPU进行计算,GPU为MALI T860,代码如下:
kernel代码
/*
 * Calculate: each work-item scales one source pixel by its weight and
 * accumulates the packed result into one destination pixel.
 *
 *   srcPicData  packed pixels, read at index picNum[gid]
 *   purNum      destination index for this work-item
 *   picNum      source index for this work-item
 *   ratio       per-work-item weight applied to each 8-bit channel
 *   purPicData  destination image, accumulated into at index purNum[gid]
 *
 * The accumulation uses atomic_add: a plain "+=" is a non-atomic
 * read-modify-write, so contributions are lost whenever two work-items
 * target the same destination index.
 *
 * NOTE(review): the constant 0xff alpha byte is part of the addend, so it
 * wraps when more than one contribution lands on the same pixel; channel
 * sums also carry into the neighbouring channel if they exceed 255. Confirm
 * the weights for one destination sum to at most 1 and that the host ignores
 * the top byte.
 */
kernel void Calculate(global const unsigned int * srcPicData,
                      global const unsigned int * purNum,
                      global const unsigned int * picNum,
                      global const float * ratio,
                      global unsigned int *purPicData)
{
    const int gid = get_global_id(0);

    /* Load the weight once instead of once per channel. */
    const float weight = ratio[gid];
    const unsigned int rgb = srcPicData[picNum[gid]];

    /* uint * float is evaluated in float and truncated back to uint. */
    const unsigned int r = ((rgb >> 16) & 0xff) * weight;
    const unsigned int g = ((rgb >> 8) & 0xff) * weight;
    const unsigned int b = ((rgb) & 0xff) * weight;

    const unsigned int purRgb = (0xffu << 24) | ((r & 0xff) << 16) | ((g & 0xff) << 8) | (b & 0xff);

    /* Atomic accumulate so concurrent writers to the same pixel do not race. */
    atomic_add(&purPicData[purNum[gid]], purRgb);
}
CPU代码,
OpenClInit();为GPU初始化代码,paintEvent函数定时调用处理数据
// Creates an OpenCL context for the GPU device(s) of the first platform
// reported by clGetPlatformIDs.
// Returns NULL when no platform can be queried; otherwise returns whatever
// clCreateContextFromType produced (a failure there is only logged).
cl_context myDialog::CreateContext()
{
    cl_platform_id platform;
    cl_uint platformCount;

    cl_int status = clGetPlatformIDs(1, &platform, &platformCount);
    if (status != CL_SUCCESS || platformCount == 0)
    {
        qDebug() << "Failed to find any OpenCL platforms." << status;
        return NULL;
    }
    qDebug() << "Plat forms Count : " << platformCount;

    // Zero-terminated property list binding the context to that platform.
    cl_context_properties props[] =
    {
        CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
        0
    };

    cl_context created = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU,
                                                 NULL, NULL, &status);
    const bool succeeded = (status == CL_SUCCESS) && (created != 0);
    if (!succeeded)
    {
        qDebug() << "Create Context From Type Failed!";
    }
    return created;
}
// Creates a command queue on the first device of c_context.
//
// c_context  context to query for devices
// device     out: receives the first device of the context on success
//
// Returns the new queue (also stored in the commandQueue member), or NULL on
// any failure. The temporary device list is freed on every path; the original
// leaked it on the error returns.
cl_command_queue myDialog::CreateCommandQueue(cl_context c_context, cl_device_id *device)
{
    cl_int errNum;
    size_t deviceBufferSize = 0;

    // First call only asks how many bytes the device list needs.
    errNum = clGetContextInfo(c_context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
    if(errNum != CL_SUCCESS)
    {
        qDebug() << "clGetContextInfo Failed!";
        return NULL;
    }
    // Require room for at least one device id so devices[0] below is valid.
    if (deviceBufferSize < sizeof(cl_device_id))
    {
        qDebug() << "No devices available.";
        return NULL;
    }
    qDebug() << "Device Buffer size : " << deviceBufferSize << " " << sizeof(cl_device_id);

    cl_device_id *devices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];
    errNum = clGetContextInfo(c_context, CL_CONTEXT_DEVICES, deviceBufferSize, &devices[0], NULL);
    if(errNum != CL_SUCCESS)
    {
        qDebug() << "clGetContextInfo Failed!";
        delete[] devices;
        return NULL;
    }

    // Only the first device is used; copy it out and drop the list right away.
    const cl_device_id firstDevice = devices[0];
    delete[] devices;

    // NOTE(review): an out-of-order queue gives no implicit ordering between
    // enqueued commands; confirm the enqueue code (not visible here) uses
    // events or blocking calls where ordering matters.
    commandQueue = clCreateCommandQueue(c_context, firstDevice, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE , &errNum);
    if(errNum != CL_SUCCESS)
    {
        qDebug() << "clCreateCommandQueue Failed!";
        return NULL;
    }

    *device = firstDevice;
    return commandQueue;
}
// Step 3: create and build the program object.
//
// context   context the program is created in
// device    device whose build log is queried when the build fails
// fileName  path of the OpenCL C source file to load
//
// Returns the built program (also stored in the program member), or NULL if
// the file cannot be opened, the program cannot be created, or the build
// fails. On a build failure the build log is printed and the program object
// is released instead of being leaked.
cl_program myDialog::CreateProgram(cl_context context, cl_device_id device, const char* fileName)
{
    cl_int errNum;

    std::ifstream kernelFile(fileName, std::ios::in);
    if (!kernelFile.is_open())
    {
        qDebug() << "Failed to open file for reading: " << fileName << endl;
        return NULL;
    }

    // Slurp the whole source file into one string.
    std::ostringstream oss;
    oss << kernelFile.rdbuf();
    std::string srcStdStr = oss.str();
    const char *srcStr = srcStdStr.c_str();

    program = clCreateProgramWithSource(context, 1,
                                        (const char**)&srcStr,
                                        NULL, &errNum);
    if(errNum != CL_SUCCESS)
    {
        qDebug() << "Create Program Failed!";
        return NULL;
    }
    qDebug() << srcStr;

    errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if(errNum != CL_SUCCESS)
    {
        qDebug() << "Build Program Failed!";

        // Print the compiler output so the failure can be diagnosed.
        size_t logSize = 0;
        if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                                  0, NULL, &logSize) == CL_SUCCESS && logSize > 0)
        {
            std::string buildLog(logSize, '\0');
            if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                                      logSize, &buildLog[0], NULL) == CL_SUCCESS)
            {
                qDebug() << buildLog.c_str();
            }
        }

        // Do not keep (or leak) a program that failed to build.
        clReleaseProgram(program);
        program = NULL;
        return NULL;
    }
    return program;
}
// Creates the five device buffers, each initialised from host memory via
// CL_MEM_COPY_HOST_PTR.
//
// memObjects[0..3]  read-only inputs copied from a, b, c (uint) and d (float)
// memObjects[4]     read-write output, initialised from result
//
// Returns true when all five buffers were created. On the first failure the
// error is logged, buffers created earlier in this call are released, their
// slots are zeroed and false is returned. The original only inspected the
// error code of the last clCreateBuffer call, so earlier failures were lost.
//
// NOTE(review): handles already stored in memObjects on entry are overwritten
// without being released here; confirm they are released elsewhere, otherwise
// every call leaks five buffers.
bool myDialog::CreateMemObjects(cl_context context, cl_mem memObjects[5],
uint *a, uint *b, uint *c, float *d, uint * result)
{
    struct BufferSpec
    {
        cl_mem_flags flags;
        size_t bytes;
        void *host;
    };

    const BufferSpec specs[5] =
    {
        { CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(uint) * SOUWIDTH * SOUHIGH, a },
        { CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(uint) * SOUWIDTH * SOUHIGH, b },
        { CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(uint) * SOUWIDTH * SOUHIGH, c },
        // Was SOUHIGH * SOUHIGH; sized like the sibling input buffers above.
        // NOTE(review): confirm the host array d holds SOUWIDTH * SOUHIGH floats.
        { CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * SOUWIDTH * SOUHIGH, d },
        { CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(uint) * PICHIGH * PICWIDTH, result }
    };

    for (int i = 0; i < 5; ++i)
    {
        cl_int errNum = CL_SUCCESS;
        memObjects[i] = clCreateBuffer(context, specs[i].flags, specs[i].bytes,
                                       specs[i].host, &errNum);
        if (errNum != CL_SUCCESS || memObjects[i] == 0)
        {
            qDebug() << "Creat Buffer Failed!" << i << errNum;
            // Undo the buffers created so far in this call.
            for (int j = 0; j < i; ++j)
            {
                clReleaseMemObject(memObjects[j]);
                memObjects[j] = 0;
            }
            memObjects[i] = 0;
            return false;
        }
    }
    return true;
}
// Creates the five device buffers used for picture composition.
//
// memObjects[0]  read-only source image, copied from srcPicData
// memObjects[1]  read-only index table, copied from purNum
// memObjects[2]  read-only index table, copied from picNum
// memObjects[3]  read-only weights, copied from ratio
// memObjects[4]  read-write output image (no host initialisation)
//
// purPicData is accepted for interface compatibility but is not used.
//
// Returns true on success. On the first failure the error is logged,
// Cleanup() is invoked and false is returned.
//
// Fixes relative to the original:
//  * the source-image buffer size was missing sizeof(int), so the buffer was
//    four times too small for SOUWIDTH * SOUHIGH 32-bit pixels;
//  * the error messages named the wrong memObjects indices.
bool myDialog::CreatePicMemObjects(cl_context context, cl_mem memObjects[5],
uint *purPicData, int * srcPicData, int *purNum,int *picNum,float * ratio)
{
    (void)purPicData; // unused: the output buffer is created without host data

    struct BufferSpec
    {
        cl_mem_flags flags;
        size_t bytes;
        void *host;
        const char *failureMessage;
    };

    const BufferSpec specs[5] =
    {
        { CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SOUWIDTH*SOUHIGH, srcPicData,
          "CreatePicMemObjects memObjects[0] Failed!" },
        { CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SOUWIDTH*SOUHIGH*3, purNum,
          "CreatePicMemObjects memObjects[1] Failed!" },
        { CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SOUWIDTH*SOUHIGH*3, picNum,
          "CreatePicMemObjects memObjects[2] Failed!" },
        { CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * SOUWIDTH*SOUHIGH*3, ratio,
          "CreatePicMemObjects memObjects[3] Failed!" },
        { CL_MEM_READ_WRITE, sizeof(uint) * PICWIDTH *PICHIGH, NULL,
          "CreatePicMemObjects memObjects[4] Failed!" }
    };

    for (int i = 0; i < 5; ++i)
    {
        cl_int errNum = CL_SUCCESS;
        memObjects[i] = clCreateBuffer(context, specs[i].flags, specs[i].bytes,
                                       specs[i].host, &errNum);
        if(errNum != CL_SUCCESS)
        {
            qDebug() << specs[i].failureMessage << errNum;
            Cleanup(context,commandQueue,program,kernel,memObjects);
            return false;
        }
    }
    return true;
}
// One-time OpenCL setup: resets all handles, then creates the context, the
// command queue, the program (from "DataCalculate.cl") and the "Calculate"
// kernel, in that order.
//
// Setup stops at the first step that fails, so later OpenCL calls are never
// made with a null context, queue or program. The failing helper has already
// logged the reason; the handles for the remaining steps stay 0.
void myDialog::OpenClInit()
{
    cl_int errNum;

    // Start from a known state so partially-initialised handles are detectable.
    device = 0;
    context = 0;
    commandQueue = 0;
    program = 0;
    kernel = 0;
    for(int i = 0; i < 5; i++)
    {
        memObjects[i] = 0;
    }

    // NOTE(review): a work-group size of 1 is likely to leave most of the GPU
    // idle. Consider a larger size, or passing NULL as the local size at
    // enqueue time so the driver chooses one. The enqueue code is not visible
    // here, so this value is left unchanged.
    localWorkSize[0] = 1;

    // Step 1: select an OpenCL platform and create a context.
    context = CreateContext();
    if (context == 0)
    {
        return;
    }

    // Step 2: create a command queue on the first device of the context.
    commandQueue = CreateCommandQueue(context, &device);
    if (commandQueue == 0)
    {
        return;
    }

    // Step 3: load and build the kernel source.
    program = CreateProgram(context, device, "DataCalculate.cl");
    if (program == 0)
    {
        return;
    }

    // Step 4: look up the kernel entry point.
    kernel = clCreateKernel(program, "Calculate", &errNum);
    if(errNum != CL_SUCCESS)
    {
        qDebug() << "Create Kernel Failed!" << errNum;
    }
    return ;
}
// Called periodically to process the two input frames.
//
// For each frame: pixels listed in the "A" tables are copied straight into the
// output image on the CPU, then the "B" tables are uploaded and processed by
// PictureComposeGPU(), and the results in picBuffer are copied into the
// output image.
//
// Fixes relative to the original:
//  * the output QImage was heap-allocated and never deleted (leaked on every
//    call); it is now a local object;
//  * the malloc'd staging buffer was never freed; it is now freed on every
//    exit path, sized for the larger of the two frames (the second memcpy
//    could overflow it), and checked for allocation failure;
//  * the blocking clEnqueueWriteBuffer of the whole output buffer was removed:
//    CreateMemObjects has just created memObjects[4] with
//    CL_MEM_COPY_HOST_PTR from the same picBuffer, so the data is already on
//    the device;
//  * the result of CreateMemObjects is now checked;
//  * the call counter is unsigned int instead of char, so it no longer wraps
//    after 127 calls.
//
// NOTE(review): CreateMemObjects creates five new device buffers twice per
// call. If PictureComposeGPU() (not visible here) does not release them, this
// leaks ten buffers per frame. Creating the buffers once and only updating
// their contents per frame would also remove a large per-frame cost.
// NOTE(review): the composed image is not drawn or stored anywhere in this
// function as shown; confirm whether display code is missing.
void myDialog::paintEvent(QPaintEvent *)
{
    // timer->stop();
    static unsigned int callCount = 0;
    ++callCount;
    if(callCount % 5 == 0)
        qDebug() << "Current Time : " << QDateTime::currentDateTime().toString("hh:mm:ss.zzz");

    QImage iGray(PICWIDTH, PICHIGH, QImage::Format_RGB888);

    // One staging buffer large enough for either frame.
    const size_t frameBytes = frame->byteCount();
    const size_t frame1Bytes = frame1->byteCount();
    const size_t stagingBytes = (frameBytes > frame1Bytes) ? frameBytes : frame1Bytes;
    uint *picBuff = (uint *)malloc(stagingBytes);
    if (picBuff == NULL)
    {
        return;
    }

    // picture ONE
    memcpy(picBuff, frame->bits(), frameBytes);
    for(int count = 0; count < fileXACount[0]; count++)
    {
        if((surNum1A[count] > 0) && (purNum1A[count] > 0))
        {
            iGray.setPixel((purNum1A[count])%PICWIDTH, (purNum1A[count])/PICWIDTH, picBuff[surNum1A[count]]);
        }
    }

    // The output accumulator starts from zero; CreateMemObjects copies it to
    // the device together with the inputs.
    memset(picBuffer,0,PICHIGH*PICWIDTH*sizeof(uint));
    if (!CreateMemObjects(context, memObjects, picBuff,purNum1B,surNum1B,ratio1B, picBuffer))
    {
        free(picBuff);
        return;
    }
    qDebug() << "Enqueue Write End: " << QDateTime::currentDateTime().toString("hh:mm:ss.zzz");

    if(PictureComposeGPU( ) == false)
    {
        free(picBuff);
        return;
    }
    for(int count=0; count<fileXBCount[0]; count++)
    {
        if( purNum1B[count] > 0 )
        {
            iGray.setPixel((purNum1B[count])%PICWIDTH,(purNum1B[count])/PICWIDTH,picBuffer[purNum1B[count]]);
        }
    }

    // picture TWO
    memcpy(picBuff, frame1->bits(), frame1Bytes);
    for(int count = 0; count < fileXACount[1]; count++)
    {
        if((surNum2A[count] > 0) && (purNum2A[count] > 0))
        {
            iGray.setPixel((purNum2A[count])%PICWIDTH, (purNum2A[count])/PICWIDTH, picBuff[surNum2A[count]]);
        }
    }

    // NOTE(review): picBuffer is not cleared before this second pass, as in
    // the original; confirm that carrying over the first result is intended.
    if (!CreateMemObjects(context, memObjects, picBuff,purNum2B,surNum2B,ratio2B, picBuffer))
    {
        free(picBuff);
        return;
    }
    if(PictureComposeGPU( ) == false)
    {
        free(picBuff);
        return;
    }
    for(int count=0; count<fileXBCount[1]; count++)
    {
        if( purNum2B[count] > 0 )
        {
            iGray.setPixel((purNum2B[count])%PICWIDTH,(purNum2B[count])/PICWIDTH,picBuffer[purNum2B[count]]);
        }
    }

    free(picBuff);
}
经过测试发现,单纯的CPU多线程处理用时是GPU处理的一半,很蛋疼,好久了不知道该如何下手解决。
如果说我代码需要优化的话,应该怎么优化啊,最好能给个代码