使用OpenMP比不使用OpenMP效率更低的例子(求double数组最小值)
一、环境
Intel(R) Core(TM)2 CPU 6320,双核,超频到2.45G.
4G内存
Windows 2003 x64 操作系统
开发平台:Visual C++ 2008
二、代码
void TestMinOutput()
{
const long nSize = 1000000;
static vector <double> longVector(nSize);
FILE* fid = NULL;
fopen_s(&fid, "I:\\longVector.dat", "rb");
fread_s(&longVector[0], sizeof(double)*nSize, sizeof(double), nSize, fid);
fclose(fid);
double minVal[100], minValOmp[100];
DWORD tStart = GetTickCount();
for (long i=0; i<100; i++)
{
minVal[i] = MinOutput(longVector, 220000 + i*5000);
}
DWORD tMin = GetTickCount() - tStart;
tStart = GetTickCount();
for (long i=0; i<100; i++)
{
minValOmp[i] = MinOutputOMP(longVector, 220000 + i*5000);
}
DWORD tMinOmp = GetTickCount() - tStart;
char xxx[200];
sprintf_s(xxx, 200, " minVal = %le, tMin = %d\n minValOmp = %le, tMinOmp = %d",
minVal[0], tMin, minValOmp[0], tMinOmp);
AfxMessageBox(xxx);
}
double MinOutputOMP(const vector <double> &outputs, const long n)
{
static vector <double> ompDblValues(2);
ompDblValues.assign(2, FLT_MAX);
//const long n = (long)outputs.size(); //n=220000
#pragma omp parallel for
for (long i=0; i <n; i++)
{
long nThread = omp_get_thread_num();
ompDblValues[nThread] = (ompDblValues[nThread] < outputs[i]) ? ompDblValues[nThread] : outputs[i];
}
double minPositive= FLT_MAX;
for (long i=0; i <2; i++)
{
minPositive = (minPositive < ompDblValues[i]) ? minPositive : ompDblValues[i];
}
return minPositive;
}
double MinOutput(const vector <double> &outputs, const long n)
{
double minPositive= FLT_MAX;
//const long n = (long)outputs.size(); //n=220000
for (int i=0; i <n; i++)
{
minPositive = (minPositive < outputs[i]) ? minPositive : outputs[i];
}
return minPositive;
}
有人说,把omp_get_thread_num从循环中拿出来可以提高效率,因此MinOutputOMP可改为
double MinOutputOMP(const vector <double> &outputs, const long n)
{
static vector <double> ompDblValues(2);
ompDblValues.assign(2, FLT_MAX);
//const long n = (long)outputs.size(); //n=220000
#pragma omp parallel
{
long nThread = omp_get_thread_num();
#pragma omp for
for (long i=0; i <n; i++)
{
ompDblValues[nThread] = (ompDblValues[nThread] < outputs[i]) ? ompDblValues[nThread] : outputs[i];
}
}
double minPositive= FLT_MAX;
for (long i=0; i <2; i++)
{
minPositive = (minPositive < ompDblValues[i]) ? minPositive : ompDblValues[i];
}
return minPositive;
}
三、运行结果
1. 使用Intel10.1.029编译器
/GL /c /O3 /Og /Ob2 /Oi /Ot /GT /GA /D "WIN32" /D "_WINDOWS" /D "NDEBUG" /D "_AFXDLL" /D "_MBCS" /FD /EHsc /MD /GS /Gy /GR /Yu"StdAfx.h" /Fp"x64|x64\Release/Test.pch" /Fo"x64|x64\Release/" /W3 /nologo /Zi /Qopenmp /QxT /Qparallel
不提出omp_get_thread_num
minVal = 1.085423e-005, tMin = 78
minValOmp = 1.085423e-005, tMin = 1156
提出omp_get_thread_num
minVal = 1.085423e-005, tMin = 78
minValOmp = 1.085423e-005, tMin = 1235
2. 使用VC++2008编译器
/O2 /Ob2 /Oi /Ot /Oy /GT /GL /D "WIN32" /D "_WINDOWS" /D "NDEBUG" /D "_AFXDLL" /D "_MBCS" /FD /EHsc /MD /Gy /openmp /Yu"stdafx.h" /Fp"x64\Release\Test.pch" /Fo"x64\Release\\" /Fd"x64\Release\vc90.pdb" /W3 /nologo /c /Zi /TP /errorReport:prompt
不提出omp_get_thread_num
minVal = 1.085423e-005, tMin = 78
minValOmp = 1.085423e-005, tMin = 703
提出omp_get_thread_num
minVal = 1.085423e-005, tMin=78
minValOmp = 1.085423e-005, tMin=1141
四、结论
在这个例子中
(1) 使用OMP比不使用OMP效率要大大下降;
(2) 把omp_get_thread_num从循环中拿出来并不能提高效率;
(3) VC++2008编译的代码效率比Intel v10.1编译的代码效率高一些。