“英特尔杯”全国计算机多核程序设计大赛源码第一题成绩0.109秒供大家讨论

huanyun 2007-08-03 06:46:40

文件名: answer1_C_Map_0.109.cpp

说明: 使用文件内存映射,在后台用线程读取文件
利用变量 float fDiv; 减少乘法的次数

源码:

/*////////////////////////////////////////
// 版权所有(C) 2000-2008 邓辉 //
// Email： denghui0815@hotmail.com //
// 文件名： answer1.cpp //
// 说明： Intel优化大赛参赛作品 //
////////////////////////////////////////*/
#include <mathimf.h>
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
#include <time.h>
#include <tmmintrin.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <process.h>

#define X_MIN_FLAT -3.402823466e+38F
#ifdef TEST_TIME
__int64 g_xFrequency;

void InitFrequency(void)
{
QueryPerformanceFrequency( (LARGE_INTEGER *)&g_xFrequency );
}

/*
 * Return the current performance-counter reading converted to microseconds.
 * g_xFrequency must have been filled in by InitFrequency() beforehand.
 */
__int64 XGetTickCount(void)
{
    LARGE_INTEGER liNow;
    double dTicks;
    double dFrequency;

    liNow.QuadPart = -1;                 /* value used if the query leaves it untouched */
    QueryPerformanceCounter(&liNow);

    dTicks = (double)(liNow.QuadPart);
    dFrequency = (double)(g_xFrequency);

    /* ticks * 1e6 / (ticks per second) = microseconds */
    return (__int64)(1000000 * dTicks / dFrequency);
}
#endif

int main(int argc, char *argv[])
{
#ifdef TEST_TIME
InitFrequency();

__int64 start = XGetTickCount();
#endif
struct stat buf ={0};

char szInput[MAX_PATH] = "r:\\test.dat";

if(argc > 1) strcpy(szInput, argv[1]);

HANDLE hProcess = GetCurrentProcess();

SetPriorityClass(hProcess, REALTIME_PRIORITY_CLASS);

if(stat(szInput, &buf))
{
printf("The file does not exist\n");
exit(0);
}

int nNum = buf.st_size/sizeof(float);

if(nNum < 4)
{
printf("乘积=%f 首数的序号=%d\n", 0, 0);
}
else
{
HANDLE hFile,hFileMappingIn;

hFile = CreateFile(szInput,
GENERIC_READ,
FILE_SHARE_READ,
NULL,
OPEN_EXISTING,
FILE_FLAG_SEQUENTIAL_SCAN,
NULL);

if(hFile == INVALID_HANDLE_VALUE)
{
printf("The file does not exist\n");
exit(0);
}

hFileMappingIn = CreateFileMapping(hFile,NULL,PAGE_READONLY, 0, 0, NULL);

CloseHandle(hFile);

int i=0,nIndex=0;
float fProduct = X_MIN_FLAT,fMin = X_MIN_FLAT,fMul,fTmp[3];
float* pArray = (float*)MapViewOfFile(hFileMappingIn, FILE_MAP_READ, 0, 0, NULL);
float fDiv = 1.0/10000.0/10000.0/10000.0;
i = 0;nNum-=3;

while(i < nNum)
{
if(pArray[i+3] < fMin)
{
i += 4;
}
else
{
fMul = pArray[i] * pArray[i+1] * pArray[i+2] * pArray[i+3];
if(fProduct < fMul)
{
fProduct = fMul;
fMin = fProduct*fDiv;
nIndex = i;
}
++i;
}
}

UnmapViewOfFile(pArray);

#ifdef TEST_TIME
printf("乘积=%f 首数的序号=%d 时间=%10.9f\n", fProduct, nIndex, (double)(XGetTickCount() - start) / 1000000);
#else
printf("乘积=%f 首数的序号=%d\n", fProduct, nIndex);
#endif
}
}

...全文

577 9 打赏收藏转发到动态举报

写回复

用AI写文章

9 条回复

切换为时间正序

请发表友善的回复…

发表回复

ai_3621 2007-08-05

打赏
举报

俺什么技术也没有用，曾经正确提交一次，
运行的时间好像是0.2吧，
等俺后来想优化的时候，时间已经过了。
也就没有再仔细考虑下去,拿出来献丑了。
/*
估计要是优化的话，在0.16左右不成问题。
主要想法就是减少浮点乘积计算次数，
理论上：
最坏情况可以减少到1/2次,
最好情况只要计算3次浮点乘积.
*/

/*
 * Search pArry[0 .. nSize-1] for a window of four consecutive floats with a
 * large product, storing the product in *pValue and the window's start index
 * in *pindex. The function tries to evaluate fewer products than one per
 * position by first comparing each element with the one four places later.
 *
 * *pValue is compared against before this function ever writes it, so the
 * caller must initialise *pValue (and *pindex) before the call.
 *
 * MAX_EQ_FLOAT_VALUE and MAX_FLOAT_VALUE are not defined in this snippet.
 * NOTE(review): presumably MAX_EQ_FLOAT_VALUE(a, b) yields 1 when *a >= *b and
 * 0 otherwise, and MAX_FLOAT_VALUE(a, b) is non-zero when *a > *b -- confirm
 * against their definitions. T3 is used as an index into the two-element
 * inLoop table, so MAX_EQ_FLOAT_VALUE must only ever produce 0 or 1.
 */
void GetMaxValue(float* pArry,int nSize,int* pindex,float* pValue)
{
unsigned char T0,T1,T2,T3,T4,T5;
// Window offset (relative to pArry) to evaluate for T5 = 1, 2, 3.
unsigned char checkTable[3]={3,2,2};
// Step size for the main loop, selected by T3: 4 or 5 elements.
unsigned char inLoop[2]={4,5};

float MaxFloat_t=0;
int nLoop=0;

// The main loop reads pArry[0..7], so it stops once fewer than 8 elements remain.
int LoopLimit=nSize-8;

for(nLoop=0;nLoop<=LoopLimit;)
{
// Tk compares element k with element k+4, i.e. the element leaving the
// window against the one entering it when the window slides from k to k+1.
T0 = MAX_EQ_FLOAT_VALUE(pArry,pArry+4);
T1 = MAX_EQ_FLOAT_VALUE(pArry+1,pArry+5);
T2 = MAX_EQ_FLOAT_VALUE(pArry+2,pArry+6);
T3 = MAX_EQ_FLOAT_VALUE(pArry+3,pArry+7);

// Pack the four comparison results into two 2-bit selectors.
T4 = (T0<<1) + T1;
T5 = (T2<<1) + T3;

switch(T4)
{
case 2:
{
// T0 set, T1 clear: evaluate the window starting at offset 0.
MaxFloat_t=(*pArry)*(*(pArry+1))*(*(pArry+2))*(*(pArry+3));
if(MAX_FLOAT_VALUE(&MaxFloat_t,pValue))
{
*pValue=MaxFloat_t;
* pindex = nLoop;
}

if(T5)
{
// Also evaluate the window at the offset chosen by checkTable for this T5.
MaxFloat_t=(*(pArry+checkTable[T5-1]))*(*(pArry+checkTable[T5-1]+1))*(*(pArry+checkTable[T5-1]+2))*(*(pArry+checkTable[T5-1]+3));

if(MAX_FLOAT_VALUE(&MaxFloat_t,pValue))
{
*pValue=MaxFloat_t;
* pindex = nLoop+checkTable[T5-1];
}
}
nLoop += inLoop[T3];
pArry += inLoop[T3];
break;
}
case 3:
{
// T0 and T1 both set: evaluate the window starting at offset 0.
MaxFloat_t=(*pArry)*(*(pArry+1))*(*(pArry+2))*(*(pArry+3));

if(MAX_FLOAT_VALUE(&MaxFloat_t,pValue))
{
*pValue=MaxFloat_t;
* pindex = nLoop;
}

if(T5==1)
{// "conflict" case (original author's label): also evaluate the window at offset 3
MaxFloat_t=(*(pArry+3))*(*(pArry+4))*(*(pArry+5))*(*(pArry+6));

if(MAX_FLOAT_VALUE(&MaxFloat_t,pValue))
{
*pValue=MaxFloat_t;
* pindex = nLoop+3;
}
}

nLoop += inLoop[T3];
pArry += inLoop[T3];
break;

}
case 1:
{
// T0 clear, T1 set: evaluate the window starting at offset 1.
MaxFloat_t=(*(pArry+1))*(*(pArry+2))*(*(pArry+3))*(*(pArry+4));

if(MAX_FLOAT_VALUE(&MaxFloat_t,pValue))
{
*pValue=MaxFloat_t;
* pindex = nLoop+1;
}

if(T5==1)
{// "conflict" case (original author's label): also evaluate the window at offset 3
MaxFloat_t=(*(pArry+3))*(*(pArry+4))*(*(pArry+5))*(*(pArry+6));

if(MAX_FLOAT_VALUE(&MaxFloat_t,pValue))
{
*pValue=MaxFloat_t;
* pindex = nLoop+3;
}
}

nLoop += inLoop[T3];
pArry += inLoop[T3];
break;
}
default:
{// T4==0: neither T0 nor T1 set; only the T5-selected window is evaluated
if(T5)
{
MaxFloat_t=(*(pArry+checkTable[T5-1]))*(*(pArry+checkTable[T5-1]+1))*(*(pArry+checkTable[T5-1]+2))*(*(pArry+checkTable[T5-1]+3));
if(MAX_FLOAT_VALUE(&MaxFloat_t,pValue))
{
*pValue=MaxFloat_t;
*pindex = nLoop+checkTable[T5-1];
}
}
nLoop += inLoop[T3];
pArry += inLoop[T3];
break;
}
}
}

// Tail: fewer than 8 elements remain, so evaluate every remaining window
// one position at a time.
LoopLimit=nSize-4;
for(;nLoop<=LoopLimit;nLoop++,pArry++)
{
MaxFloat_t=(*pArry)*(*(pArry+1))*(*(pArry+2))*(*(pArry+3));

if(MAX_FLOAT_VALUE(&MaxFloat_t,pValue))
{
*pValue=MaxFloat_t;
*pindex = nLoop;
}
}

}

aliceZOOZ 2007-08-05

打赏
举报

也就是说，在linux下，直接连乘+双线程就能到0.105
而精打细算的 SSE 可以到单线程 0.088

这些方法速度都很快啊。虽然现在不能再提交了，但仍然可以互相交流一下嘛。
我的经验也和你类似，大部分时间是读文件，而不是乘法。

teal 2007-08-05

打赏
举报

这个题目的关键，并不是减少乘法次数。
Intel CPU 有专门的浮点乘法运算器，浮点乘法占用的时钟周期并不多，
如果加上很多if判断以减少乘法，结果只会消耗更多的时间。

此题目本人提交经验：

1 XP系统，文件映射 + 直接连乘 + 单线程 = 时间: 0.141
文件映射 + 直接连乘 + 双线程 = 时间: 0.156 至 0.178
2 linux系统，文件映射 + 直接连乘 + 单线程 = 时间: 0.125
文件映射 + 直接连乘 + 双线程 = 时间: 0.105 至 0.109

3 linux系统，文件映射 + SSE指令 + 精打细算 + 双线程 = 时间：0.103至0.109
4 一个同学说，linux系统，文件映射 + SSE指令 + 精打细算 + 单线程可以达到 0.088，可惜比赛结束了，不能再提交了。

aliceZOOZ 2007-08-04

打赏
举报

这个也很赞，通过andps来完成index的更新，避免了对四个数的单独操作，也减少了branch

whycadi 2007-08-04

打赏
举报

//我用汇编编的乘法部分，在AMD上比直接c语言的快一倍左右，我用P43.0则使用浮点运算的C语言代码要比AMD1.9G快一些，不过SSE的则相当。不知道扣肉上会是什么样
/*
 * SSE inline-assembly scan for the largest product of four consecutive floats.
 *
 * Processes `data` four positions per iteration for i in [0, num), keeping a
 * per-lane running maximum in xmm3 and the matching group index in xmm7, then
 * folds the four lanes into maxlist[threadnum] / poslist[threadnum], where
 * threadnum is the calling OpenMP thread number.
 *
 * `data` and `num` are not declared in this snippet and the `start` parameter
 * is not used by the body.
 *
 * Returns poslist[threadnum] after the update.
 *
 * NOTE(review): xmm3/xmm7 are expected to keep their values between the
 * separate _asm blocks and across the C for-loop -- confirm the compiler does
 * not use these registers in between.
 * NOTE(review): the running maximum starts at 0.0 and maxlist[threadnum] is
 * only overwritten when exceeded, so the caller must initialise maxlist.
 * NOTE(review): each iteration loads 4 floats starting at data[i+3], i.e. up
 * to data[i+6] -- confirm `data` is padded beyond `num`.
 */
int FindMax2sub(int start,float* maxlist,int* poslist){

int i;
int threadnum;
threadnum=omp_get_thread_num();
// Clear the per-lane maximum (xmm3) and the per-lane index (xmm7).
_asm{ XORPS xmm3,xmm3
xorps xmm7,xmm7

}

for(i=0;i<num;i+=4){

_asm{
// Treat four consecutive numbers as one group.
// Load four overlapping groups (1234, 2345, 3456, 4567) and multiply them
// lane-wise, which yields four products of four consecutive numbers.
mov ecx,[i]
movups xmm2,[data+ecx*4]
movups xmm1,[data+ecx*4+4]
// Load the raw 32-bit pattern of the integer i into the low lane of xmm6.
movss xmm6,[i]

mulps xmm1,xmm2
movups xmm2,[data+ecx*4+8]
movups xmm5,[data+ecx*4+12]
shufps xmm6,xmm6,0
// xmm6 now holds the current group index in every lane, e.g. for the group
// with index 1 (numbers 1 2 3 4) xmm6 is 1 1 1 1.

mulps xmm2,xmm5
mulps xmm1,xmm2
movaps xmm4,xmm3
// Compare the running maximum with the current products: lanes where the
// current product is larger become all ones, the others all zeros.
cmpltps xmm4,xmm1
// Keep the group index only in the lanes that beat the maximum; clear the rest.
andps xmm6,xmm4
// Update each lane of the running maximum.
maxps xmm3,xmm1
// Update the index. A lane whose product beat the maximum still holds the
// group index in xmm6, otherwise 0. Because indices only increase, maxps
// carries every non-zero index in xmm6 over into xmm7.
// NOTE(review): maxps compares the integer bit patterns as floats -- confirm
// this ordering holds for the index range in use.
maxps xmm7,xmm6
}

}
__declspec(align(16))int v[4];
__declspec(align(16))float f[4];
// Spill the per-lane indices and maxima to aligned arrays.
_asm{
movaps [v],xmm7
movaps [f],xmm3
}
// Reduce the four lanes to one maximum. v[i] is the group index recorded for
// lane i; the actual position is that index plus the lane offset i.
for(i=0;i<4;i++){
if(f[i]>maxlist[threadnum]){
maxlist[threadnum]=f[i];
poslist[threadnum]=v[i]+i;
}
}

return poslist[threadnum];
}

aliceZOOZ 2007-08-03

打赏
举报

这题应该把精力放在内存/文件读取，而不是在计算上。
文件读取的速度太慢了。这种剪枝恰恰可以省下一些内存/文件的读取。真是很巧妙

aliceZOOZ 2007-08-03

打赏
举报

赞，这个剪枝的思路很好。

前面的那个没有用SSE的，和后面那个用SSE的，都是一样的速度吗？

huanyun 2007-08-03

打赏
举报

/*
 * Scalar scan with pruning over the windows starting at
 * pParam->pArray[nBeg .. nEnd-1].
 *
 * Stores the largest four-element product found in pParam->fProduct and its
 * start index (relative to pParam->pArray) in pParam->nIndex. Each window
 * reads three elements past its start, so the array must extend at least
 * three elements beyond nEnd-1.
 */
__inline void XGetMaxMulC_Fast(THREADPARAM* pParam)
{
    float* const pBase = pParam->pArray + pParam->nBeg;
    const int nTotal = (pParam->nEnd - pParam->nBeg);
    /* Threshold factor: best product divided by 10000^3. */
    const float fScale = 1.0/10000.0/10000.0/10000.0;

    float fBest = X_MIN_FLAT;
    float fThreshold = X_MIN_FLAT;
    int nBest = 0;
    int nPos = 0;

    while(nPos < nTotal)
    {
        const float* pWin = pBase + nPos;

        /* When the last element of the window is below the threshold, all
           four windows containing it are skipped. */
        if(pWin[3] < fThreshold)
        {
            nPos += 4;
            continue;
        }

        const float fCur = pWin[0] * pWin[1] * pWin[2] * pWin[3];

        if(fBest < fCur)
        {
            fBest = fCur;
            fThreshold = fBest*fScale;
            nBest = nPos;
        }

        ++nPos;
    }

    pParam->nIndex = nBest + pParam->nBeg;
    pParam->fProduct = fBest;
}

/*
 * Plain scalar scan: evaluate the product of four consecutive floats for
 * every start index in [pParam->nBeg, pParam->nEnd) and record the largest
 * product in pParam->fProduct and its start index in pParam->nIndex.
 *
 * With an empty range the result is index 0 and product X_MIN_FLAT.
 */
__inline void XGetMaxMulC(THREADPARAM* pParam)
{
    const float* const pData = pParam->pArray;
    const int nLast = pParam->nEnd;
    float fBest = X_MIN_FLAT;
    int nBest = 0;
    int nPos = pParam->nBeg;

    while(nPos < nLast)
    {
        const float* pWin = pData + nPos;
        const float fCur = pWin[0] * pWin[1] * pWin[2] * pWin[3];

        /* Strict comparison: on ties the earliest index is kept. */
        if(fBest < fCur)
        {
            fBest = fCur;
            nBest = nPos;
        }

        ++nPos;
    }

    pParam->nIndex = nBest;
    pParam->fProduct = fBest;
}

/*
 * SSE scan for the largest product of four consecutive floats.
 *
 * The input is processed in rounds of SSE_ROUND_SIZE4 start positions. Only
 * (nEnd - nBeg) / SSE_ROUND_SIZE4 full rounds are handled; any remainder is
 * not scanned by this function. Scanning always starts at pParam->pArray
 * itself (nBeg is used only to compute the count), and the stored nIndex is
 * relative to pParam->pArray.
 *
 * pParam->pArray is read with _mm_load_ps, so it must be 16-byte aligned.
 * Each round reads up to element SSE_ROUND_SIZE4 + 4 counted from the start
 * of that round, so the buffer needs padding after the last round.
 *
 * Results are stored in pParam->fProduct and pParam->nIndex.
 */
__inline void XGetMaxMulSSE1(THREADPARAM* pParam)
{
int nCount = (pParam->nEnd - pParam->nBeg) / SSE_ROUND_SIZE4;
int i=0,j=0,k=0,nIndex=0;
float fProduct = X_MIN_FLAT;
float *pArray = pParam->pArray,*pMul;
// Scratch area for one round; pMul views it as a flat float array.
__m128 fMul[SSE_ROUND_SIZE+4]={0};
pMul = (float*)fMul;

for(j = 0; j < nCount; ++j)
{
// Stage 1 seed: pair products a[k]*a[k+1] for k = 0..3.
fMul[0] = _mm_mul_ps(_mm_load_ps(pArray), _mm_loadu_ps(pArray + 1));

for(i = 0; i < SSE_ROUND_SIZE; i+=4)
{
// Stage 1: pair products for the next four vectors. fMul[i] already holds
// pair products (from the seed above or from fMul[i+4] of the previous pass).
fMul[i+1] = _mm_mul_ps(_mm_load_ps(pArray + (i << 2) + 4), _mm_loadu_ps(pArray + (i << 2) + 5));
fMul[i+2] = _mm_mul_ps(_mm_load_ps(pArray + (i << 2) + 8), _mm_loadu_ps(pArray + (i << 2) + 9));
fMul[i+3] = _mm_mul_ps(_mm_load_ps(pArray + (i << 2) + 12), _mm_loadu_ps(pArray + (i << 2) + 13));
fMul[i+4] = _mm_mul_ps(_mm_load_ps(pArray + (i << 2) + 16), _mm_loadu_ps(pArray + (i << 2) + 17));

// Stage 2: multiply each pair product by the pair product two positions
// further on, giving a[k]*a[k+1]*a[k+2]*a[k+3]. The order matters: every
// unaligned load below spans into the following vector, which must still
// contain pair products when it is read.
fMul[i] = _mm_mul_ps(fMul[i], _mm_loadu_ps(pMul + (i << 2) + 2));
fMul[i+1] = _mm_mul_ps(fMul[i+1], _mm_loadu_ps(pMul + (i << 2) + 6));
fMul[i+2] = _mm_mul_ps(fMul[i+2], _mm_loadu_ps(pMul + (i << 2) + 10));
fMul[i+3] = _mm_mul_ps(fMul[i+3], _mm_loadu_ps(pMul + (i << 2) + 14));
}

// Scalar reduction over the SSE_ROUND_SIZE4 products of this round; the
// strict '<' keeps the earliest index on ties.
for( i = 0; i < SSE_ROUND_SIZE4; ++i)
{
if(fProduct < pMul[i])
{
fProduct = pMul[i];
nIndex = i + j * SSE_ROUND_SIZE4;
}
}

pArray += SSE_ROUND_SIZE4;
}

pParam->nIndex = nIndex;
pParam->fProduct = fProduct;
}

huanyun 2007-08-03

打赏
举报

文件名: answer1_SSE_0.109.cpp

说明: 使用异步文件IO,使用SSE2优化代码

源码:
/*////////////////////////////////////////
// 版权所有(C) 2000-2008 邓辉 //
// Email： denghui0815@hotmail.com //
// 文件名： answer1.cpp //
// 说明： Intel优化大赛参赛作品 //
////////////////////////////////////////*/
#include <mathimf.h>
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
#include <time.h>
#include <tmmintrin.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <process.h>

#pragma warning( disable : 175)
#pragma warning( disable : 1684)

/* Number of buffer slots cycled between the reading thread and the worker. */
#define MAX_DEQUE 4
/* Capacity of the g_ThreadParam array. */
#define MAX_THREAD 32
/* Number of __m128 vectors handled per round in XGetMaxMulSSE1. */
#define SSE_ROUND_SIZE 256
/* Number of floats (start positions) per SSE round: four per vector. */
#define SSE_ROUND_SIZE4 (SSE_ROUND_SIZE*4)
/* Number of floats read from the file per slot: 32 SSE rounds. */
#define READ_ONE_SIZE (SSE_ROUND_SIZE4*32)
/* Initial value for running maxima: the most negative finite float. */
#define X_MIN_FLAT -3.402823466e+38F

#ifdef TEST_TIME
__int64 g_xFrequency;

void InitFrequency(void)
{
QueryPerformanceFrequency( (LARGE_INTEGER *)&g_xFrequency );
}

/*
 * Return the current performance-counter reading converted to microseconds.
 * g_xFrequency must have been filled in by InitFrequency() beforehand.
 */
__int64 XGetTickCount(void)
{
    LARGE_INTEGER liNow;
    double dTicks;
    double dFrequency;

    liNow.QuadPart = -1;                 /* value used if the query leaves it untouched */
    QueryPerformanceCounter(&liNow);

    dTicks = (double)(liNow.QuadPart);
    dFrequency = (double)(g_xFrequency);

    /* ticks * 1e6 / (ticks per second) = microseconds */
    return (__int64)(1000000 * dTicks / dFrequency);
}
#endif

/* Input range and result of one max-product scan. */
typedef struct tagThreadParam
{
    int nBeg;        /* first start index to scan */
    int nEnd;        /* one past the last start index to scan */
    int nIndex;      /* out: start index of the best window found */
    float fProduct;  /* out: product of the best window found */
    float* pArray;   /* data to scan */
}THREADPARAM;

/* One buffer slot handed from the reading thread to the worker thread. */
typedef struct tagDequeParam
{
    float* pArray;        /* slot buffer */
    int nCount;           /* number of start positions to scan in this slot */
    int nOffset;          /* index in the whole file of the slot's first element */
    volatile int nState;  /* 0 = free for the reader, 1 = filled and ready for the worker */
}DEQUEPARAM;

/* Declared thread count; not referenced by the code shown in this file. */
const int g_nThread = 2;

/* Scan parameters: entry 0 is used by the worker thread (XThreadWork),
   entry 1 by main() for the tail of the file. */
THREADPARAM g_ThreadParam[MAX_THREAD]={0};
/* Buffer slots exchanged between main() (producer) and XThreadWork (consumer).
   XInitDequeWork initialises the first MAX_DEQUE entries. */
DEQUEPARAM g_DequeParam[MAX_DEQUE+1]={0};

/* Scan kernels; each reads its range from pParam and writes nIndex/fProduct back. */
void XGetMaxMulC(THREADPARAM* pParam);
void XGetMaxMulC_Fast(THREADPARAM* pParam);
void XGetMaxMulSSE1(THREADPARAM* pParam);

/*
 * Round a pointer up to the next multiple of nAlign bytes.
 *
 * pVoid  - pointer to align.
 * nAlign - alignment in bytes; must be a power of two for the mask arithmetic
 *          to be correct. Values <= 0 return pVoid unchanged.
 *
 * Returns the smallest address >= pVoid that is a multiple of nAlign. The
 * caller must have reserved up to nAlign-1 spare bytes after pVoid.
 */
void* XAlignPtr(void* pVoid, int nAlign)
{
    if(nAlign <= 0)
        return pVoid;

    /* Use size_t rather than long for the address arithmetic: long is only
       32 bits wide on 64-bit Windows and would truncate the pointer. */
    const size_t nMask = (size_t)nAlign - 1;

    return (void*)(((size_t)pVoid + nMask) & ~nMask);
}

/* Best product found so far and the file index of its first element. */
float g_fProduct=X_MIN_FLAT;
int g_nIndex=0;
/* Count of slots consumed by the worker thread (XThreadWork). */
volatile int g_nPopIndex=0;
/* Count of slots published by main(). */
volatile int g_nPushIndex=0;
/* Cleared by main() to make the worker thread leave its polling loop. */
volatile BOOL g_bRun = TRUE;

/*
 * Worker thread: polls the g_DequeParam slots in order, scans each slot that
 * main() has marked ready (nState == 1) with XGetMaxMulSSE1, merges the result
 * into g_fProduct / g_nIndex and then releases the slot (nState = 0).
 *
 * The loop busy-waits (no sleep or blocking call) until g_bRun becomes FALSE.
 * pParam is not used. Always returns 0.
 */
unsigned int __stdcall XThreadWork(void* pParam)
{
// Restrict this thread to the processor selected by affinity mask 2
// (main() restricts itself to mask 1).
SetThreadAffinityMask(GetCurrentThread(), 2);

int nPop = g_nPopIndex = 0;

while(g_bRun)
{
if(g_DequeParam[nPop].nState == 1)
{
g_ThreadParam[0].pArray = g_DequeParam[nPop].pArray;
g_ThreadParam[0].nBeg = 0;
g_ThreadParam[0].nEnd = g_DequeParam[nPop].nCount;

XGetMaxMulSSE1(g_ThreadParam);

if(g_fProduct < g_ThreadParam[0].fProduct)
{
g_fProduct = g_ThreadParam[0].fProduct;
// Convert the slot-relative index to an index in the whole file.
g_nIndex = g_ThreadParam[0].nIndex + g_DequeParam[nPop].nOffset;
}

// Release the slot only after the global result has been updated; main()
// waits for nState == 0 on every slot before reading g_fProduct.
g_DequeParam[nPop].nState = 0;
++g_nPopIndex;
nPop = g_nPopIndex % MAX_DEQUE;
}
}

return 0;
}

void XInitDequeWork(int nNum, float* pArray, int nOneCount)
{
for(int i = 0; i < MAX_DEQUE; ++i)
{
g_DequeParam[i].pArray = pArray + (nOneCount + 256) * i;
g_DequeParam[i].nCount = nOneCount;
g_DequeParam[i].nState = 0;
}
}

int main(int argc, char* argv[])
{
#ifdef TEST_TIME
InitFrequency();

__int64 start = XGetTickCount();
#endif

struct stat buf ={0};
float *pMemory=NULL, *pArray=NULL;
int nNum=0,nRet=0;
char szInput[MAX_PATH] = "r:\\test.dat";
if(argc > 1) strcpy(szInput, argv[1]);

HANDLE hProcess = GetCurrentProcess();

SetPriorityClass(hProcess, REALTIME_PRIORITY_CLASS);

SetThreadAffinityMask(GetCurrentThread(), 1);

HANDLE hThread = (HANDLE)_beginthreadex(NULL, 0, XThreadWork, NULL, 0, NULL);

if(stat(szInput, &buf))
{
printf("The file does not exist\n");
exit(0);
}

nNum = buf.st_size/sizeof(float);

if(nNum < 4)
{
printf("乘积=%f 首数的序号=%d\n", 0, 0);
}
else
{
int nTime = (nNum-3)/READ_ONE_SIZE;
int nLast = (nNum-3) - nTime*READ_ONE_SIZE;

pMemory = (float *)malloc((READ_ONE_SIZE + 256)*(MAX_DEQUE + 2)* sizeof(float));
pArray = (float *)XAlignPtr(pMemory, 128);

XInitDequeWork(READ_ONE_SIZE, pArray, READ_ONE_SIZE);

FILE *fp=fopen(szInput,"rb");
float fTmp[3];
int i,nPush;
fread(fTmp, sizeof(float), 3, fp);

for(i=0; i < nTime; ++i)
{
nPush = g_nPushIndex % MAX_DEQUE;
while(g_DequeParam[nPush].nState != 0) nRet += i;
g_DequeParam[nPush].nOffset = i * READ_ONE_SIZE;
g_DequeParam[nPush].pArray[0] = fTmp[0];
g_DequeParam[nPush].pArray[1] = fTmp[1];
g_DequeParam[nPush].pArray[2] = fTmp[2];
fread(g_DequeParam[nPush].pArray+3, sizeof(float), READ_ONE_SIZE, fp);
fTmp[0] = g_DequeParam[nPush].pArray[READ_ONE_SIZE];
fTmp[1] = g_DequeParam[nPush].pArray[READ_ONE_SIZE+1];
fTmp[2] = g_DequeParam[nPush].pArray[READ_ONE_SIZE+2];
g_DequeParam[nPush].nState = 1;
++g_nPushIndex;
}

if(nLast > 0)
{
g_ThreadParam[1].pArray = pArray + (READ_ONE_SIZE + 256) * MAX_DEQUE;
g_ThreadParam[1].nBeg = 0;
g_ThreadParam[1].nEnd = nLast;
g_ThreadParam[1].pArray[0] = fTmp[0];
g_ThreadParam[1].pArray[1] = fTmp[1];
g_ThreadParam[1].pArray[2] = fTmp[2];
fread(g_ThreadParam[1].pArray+3, sizeof(float), nLast, fp);
XGetMaxMulC(g_ThreadParam+1);
}

for(int i=0; i < MAX_DEQUE; ++i) while(g_DequeParam[i].nState != 0);

if(nLast > 0 && g_fProduct < g_ThreadParam[1].fProduct)
{
g_fProduct = g_ThreadParam[1].fProduct;
g_nIndex = g_ThreadParam[1].nIndex + nTime * READ_ONE_SIZE;
}

fclose(fp);

#ifdef TEST_TIME
printf("乘积=%f 首数的序号=%d 时间=%10.9f秒\n", g_fProduct, g_nIndex, (double)(XGetTickCount()-start)/1000000);
#else
printf("乘积=%f 首数的序号=%d\n", g_fProduct, g_nIndex);
#endif
}

free(pMemory);

g_bRun = FALSE;

return 0;
}