找到一段优化的好东东,对比赛应该有用
代替 dist = sqrt(distx*distx + disty*disty + distz*distz) 的 SSE 优化版本(每次同时处理 4 个 float):
/*
 * Element-wise Euclidean length of three float arrays, computed with SSE:
 *
 *     pResult[i] = sqrt(pArray1[i]^2 + pArray2[i]^2 + pArray3[i]^2)
 *
 * for every i in [0, nSize).
 *
 * The arrays do not have to be 16-byte aligned and nSize does not have to be
 * a multiple of 4: full groups of four floats are handled with packed
 * instructions, and the remaining 0-3 elements with the scalar SSE forms.
 * A non-positive nSize leaves pResult untouched.
 *
 * Each iteration loads all of its inputs before storing its result, so
 * pResult may be the same array as one of the sources (in-place use).
 */
void CSSETestDlg(
    float* pArray1,  // [in]  source array 1
    float* pArray2,  // [in]  source array 2
    float* pArray3,  // [in]  source array 3
    float* pResult,  // [out] array that receives the results
    int nSize)       // [in]  number of elements in each array
{
    if (nSize <= 0) {
        return;
    }

    /* Number of elements covered by whole 4-float vectors. */
    const int nVec = nSize - (nSize % 4);
    int i = 0;

    for (; i < nVec; i += 4) {
        /* Unaligned loads: the caller's buffers carry no alignment promise. */
        const __m128 a = _mm_loadu_ps(pArray1 + i);
        const __m128 b = _mm_loadu_ps(pArray2 + i);
        const __m128 c = _mm_loadu_ps(pArray3 + i);

        /* Same association as (a*a + b*b) + c*c. */
        __m128 sum = _mm_add_ps(_mm_mul_ps(a, a), _mm_mul_ps(b, b));
        sum = _mm_add_ps(sum, _mm_mul_ps(c, c));

        _mm_storeu_ps(pResult + i, _mm_sqrt_ps(sum));
    }

    /*
     * Tail: the last nSize % 4 elements. The scalar SSE instructions perform
     * the same single-precision operations as the packed loop above.
     */
    for (; i < nSize; i++) {
        const __m128 a = _mm_load_ss(pArray1 + i);
        const __m128 b = _mm_load_ss(pArray2 + i);
        const __m128 c = _mm_load_ss(pArray3 + i);

        __m128 sum = _mm_add_ss(_mm_mul_ss(a, a), _mm_mul_ss(b, b));
        sum = _mm_add_ss(sum, _mm_mul_ss(c, c));

        _mm_store_ss(pResult + i, _mm_sqrt_ss(sum));
    }
}