关于二维时clEnqueueNDRangeKernel的参数问题

Mysterium 2014-08-11 10:19:58
我尝试做矩阵乘法,这是opencl kernel文件

kernel void mconv(
int Mdim,
int Ndim,
int Pdim,
global float *A,
global float *B,
global float *C
)
{

int i = get_global_id(0);
int j = get_global_id(1);
if(i < 0 || j < 0 || i > Mdim || j > Pdim) return;
float tmp = 0;
for(int k = 0; k < Ndim; k++)
tmp += A[i*Ndim+k] * B[k*Pdim+j];

C[i*Pdim + j] = tmp;
}


但是在host程序中,我遇到了问题。

在一维的时候,如果向量长度是N,那么work group size应该是ceil(N/local-group-size)*local-group-size,这样在一维时一切正常,
但是二维情形下,
假定矩阵为m*n,n*p,最后结果为m*p的,如果我设置
global work group size[0] =(ceil((float)m/16))*16,
global work group size[1] =(ceil((float)p/8))8 ,
local-group-size = {16,8}, 其中16*8刚好等于我的设备max work group size,最后计算结果就会有很多项不正确,这是怎么回事啊?
...全文
1213 2 打赏 收藏 转发到动态 举报
写回复
用AI写文章
2 条回复
切换为时间正序
请发表友善的回复…
发表回复
清风水岸 2014-12-15
  • 打赏
  • 举报
回复
没时间细看你的代码,但是可以给你我写的,你可以参考: 内核:

#pragma OPENCl_EXTENSION cl_amd_printf:enable
//A[N][P] B[P][M]  C[N][M]
__kernel void mmul(const int Mdim,const int Ndim,const int Pdim,__global float *A,__global float *B,__global float *C)
{
	int k;
	int i = get_global_id(0);				
	int j = get_global_id(1);				
	float tmp;
	//printf("global id(0):%d,global id(1):%d\n",i,j);
	if((i<Ndim) && (j<Mdim))
	{
		tmp = 0.0;
		for(k = 0 ; k<Pdim; k++)
		{
			tmp += A[j*Ndim + k] * B[k*Pdim + j];
		}
		C[i*Ndim+j] = tmp;
		
	}
}
__kernel void simpleMultiply(
				__global float *outputC,
				int widthA,
				int heightA,
				int widthB,
				int heightB,
				__global float *inputA,
				__global float *inputB)
{
	//Get global position in Y direction
	int row = get_global_id(1);
	//Get global position in X direction
	int col = get_global_id(0);
	
	float sum = 0.0f;
	//Calculate result of one element of Matrix 
	for(int i=0; i<widthA; i++)
	{
		sum+=inputA[row*widthA+i] * inputB[i*widthB+col];
	}
	outputC[row*widthB + col] = sum;
}
主机端程序:

#include <iostream>
#include <fstream>
#include <sstream>
#include <CL\cl.h>
using namespace std;
#define ORDER 1000
#define DIM  2
void initmat(int Mdim,int Pdim,int Ndim,float *A,float *B,float *C)
{
	//初始化A[M][P]
	int i,j;
	for(i=0; i<Mdim*Pdim; i++)
	{
		A[i] = 1;
	}
	//初始化B[P][N]
	for(i=0; i<Pdim*Ndim; i++)
	{
		B[i] = 1;
	}
}
void results(int Mdim,int Ndim,int Pdim,float *C,double run_time)
{
	cout<<"Consume time is: "<<run_time*1.0e-6<<" msecs."<<endl;
	int i,j;
	int count = 0;
	//输出两个数验证下
	cout<<C[0]<<"  "<<C[1]<<endl;
}
int main(int argc,char **argv)
{
	float *A;					//A matrix
	float *B;					//B matrix
	float *C;					//C matrix(C=A*B)
	int Mdim,Ndim,Pdim;			//A[N][P] B[P][M]  C[N][M]
	int err;					//err code from OpenCL
	int szA,szB,szC;			//number of matrix elements
	size_t global[DIM];			//global domain size
	size_t local[DIM];			//local domain size
	cl_device_id device_id;		//Computer device id
	cl_context context;			//Computer context
	cl_command_queue commands;	//copmputer command queue
	cl_program program;			//computer program
	cl_kernel kernel;			//computer kernel
	cl_uint nd;					//number of dims in NDRange
	cl_mem a_in;				//Memory object for A matrix
	cl_mem b_in;				//Memory object for B matrix
	cl_mem c_out;				//Memory object for C matrix

	Ndim = ORDER; Pdim=ORDER; Mdim = ORDER;
	//------------------------------------------------------------------------------------
	//Set up the OpenCL platform using whichever platform is "first"
	//------------------------------------------------------------------------------------
	cl_uint numPlatforms;
	cl_platform_id firstPlatformId;

	err = clGetPlatformIDs(1,&firstPlatformId,&numPlatforms);

	err = clGetDeviceIDs(firstPlatformId,CL_DEVICE_TYPE_GPU,1,&device_id,NULL);

	cl_context_properties properties[]={
	 CL_CONTEXT_PLATFORM,
	 (cl_context_properties)firstPlatformId,
	 0
	};
	context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);

	commands = clCreateCommandQueue(context,device_id,CL_QUEUE_PROFILING_ENABLE,&err);

	//Set up matrices
	szA = Ndim*Pdim; szB = Pdim*Mdim; szC = Ndim*Mdim;
	A = (float *)malloc(szA *sizeof(float));
	B = (float *)malloc(szB*sizeof(float));
	C = (float *)malloc(szC *sizeof(float));
	//function to set matrices to known values
	initmat(Mdim,Ndim,Pdim,A,B,C);

	//-------------------------------------------------------------------------------------
	//Set up the buffers,initialize matrices ,and write them 
	//into global memory
	//-------------------------------------------------------------------------------------
	a_in = clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(float)*szA,NULL,NULL);
	b_in = clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(float)*szB,NULL,NULL);
	c_out = clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(float)*szC,NULL,NULL);

	//Create the computer program from the source buffer
	std::ifstream  srcFile("kernel.cl");

	std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));

	const char *src = srcProg.c_str();
	size_t length = srcProg.length();
	program = clCreateProgramWithSource(context,1,(const char **)&src,&length,&err);
	
	//Build the program
	err = clBuildProgram(program,0,NULL,NULL,NULL,NULL);
	if(err != CL_SUCCESS)
	{
		char buildLog[16384];
		clGetProgramBuildInfo(program,device_id,CL_PROGRAM_BUILD_LOG,
			sizeof(buildLog),buildLog,NULL);
		cerr<<"Error in kernel:"<<buildLog<<endl;
		clReleaseProgram(program);
		system("pause");
		exit(EXIT_FAILURE);
	}

	//Create the computer kernel from the program
	kernel = clCreateKernel(program,"simpleMultiply",&err);

	//Write the A and B matrices into compute device memory
	err = clEnqueueWriteBuffer(commands,a_in,CL_TRUE,0,sizeof(float)*szA,A,0,NULL,NULL);
	err = clEnqueueWriteBuffer(commands,b_in,CL_TRUE,0,sizeof(float)*szB,B,0,NULL,NULL);
	//Set the arguments to our compute kernel
	err = 0;
	err = clSetKernelArg(kernel,0,sizeof(cl_mem),&c_out);
	err |= clSetKernelArg(kernel,1,sizeof(cl_int),&Pdim);
	err |= clSetKernelArg(kernel,2,sizeof(cl_int),&Ndim);
	err |= clSetKernelArg(kernel,3,sizeof(cl_int),&Mdim);
	err |= clSetKernelArg(kernel,4,sizeof(cl_int),&Pdim);
	err |= clSetKernelArg(kernel,5,sizeof(cl_mem),&a_in);
	err |= clSetKernelArg(kernel,6,sizeof(cl_mem),&b_in);
	
	cl_event prof_event;

	//Execute the kernel over the entire range of C matrix elements
	global[0] = (size_t)Mdim; global[1] = (size_t)Ndim;  nd = 2;
	err = clEnqueueNDRangeKernel(commands,kernel,nd,NULL,global,NULL,0,NULL,&prof_event);
	//size_t globalws[2] = {Mdim,Ndim};
	//size_t localws[2] ={4,4};
	//err = clEnqueueNDRangeKernel(commands,kernel,2,NULL,globalws,localws,0,NULL,&prof_event);

	//wait for the commands to complete before reading back results
	clFinish(commands);


	cl_ulong ev_start_time = (cl_ulong)0;
	cl_ulong ev_end_time = (cl_ulong)0;
	err = clGetEventProfilingInfo(prof_event,
		CL_PROFILING_COMMAND_START,
		sizeof(cl_ulong),
		&ev_start_time,
		NULL);
	err = clGetEventProfilingInfo(prof_event,
		CL_PROFILING_COMMAND_END,
		sizeof(cl_ulong),
		&ev_end_time,
		NULL);

	//Read back the results from the computer device
	err = clEnqueueReadBuffer(commands,c_out,CL_TRUE,0,sizeof(float)*szC,C,0,NULL,NULL);
	double run_time = (double)ev_end_time - ev_start_time;	

	results(Mdim,Ndim,Pdim,C,run_time);

	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseMemObject(a_in);
	clReleaseMemObject(b_in);
	clReleaseMemObject(c_out);
	clReleaseCommandQueue(commands);
	clReleaseContext(context);
	system("pause");
	return 0;
}
fronteer 2014-08-13
  • 打赏
  • 举报
回复
矩阵乘这种数据操作层的东西 别花时间实现了, 用 clAmdBlas 库, 可直接从 AMD Developer 网站下载, 这个是已经用 OpenCL+GPU 加速实现的库。 多把时间放在应用上。

602

社区成员

发帖
与我相关
我的任务
社区描述
异构开发技术
社区管理员
  • OpenCL和异构编程社区
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧