SSE 的代码速度就一定更快吗?(附测试代码)不解。
#include "windows.h"
#include <xmmintrin.h>
/*
 * Copy nSize bytes from pSrc to pDest, moving 16 bytes at a time through an
 * SSE register and finishing the remaining (nSize % 16) bytes with memcpy().
 *
 * pDest  destination buffer, at least nSize bytes.
 * pSrc   source buffer, at least nSize bytes.
 * nSize  number of bytes to copy; values <= 0 copy nothing.
 *
 * Unaligned loads and stores are used, so neither pointer has to be 16-byte
 * aligned. As with memcpy(), the buffers should not overlap.
 *
 * The loop is written with intrinsics rather than an __asm block so that it
 * also builds for targets where MSVC has no inline assembler (e.g. x64).
 * No EMMS is issued: only XMM registers are touched, never the MMX/x87 state.
 */
void MemCopySSE(LPVOID pDest, LPVOID pSrc, int nSize)
{
    BYTE *pbDst = (BYTE *)pDest;
    const BYTE *pbSrc = (const BYTE *)pSrc;
    int nBlocks;
    int nTail;

    /*
     * A negative count would otherwise become a negative block counter (the
     * loop would run until it wrapped to zero) and a huge size_t for memcpy().
     */
    if (nSize <= 0)
    {
        return;
    }

    nBlocks = nSize >> 4;   /* whole 16-byte blocks */
    nTail = nSize & 15;     /* bytes left after the last whole block */

    for (; nBlocks > 0; --nBlocks)
    {
        _mm_storeu_ps((float *)pbDst, _mm_loadu_ps((const float *)pbSrc));
        pbSrc += 16;
        pbDst += 16;
    }

    if (nTail)
    {
        /* Both cursors already point just past the last whole block. */
        memcpy(pbDst, pbSrc, (size_t)nTail);
    }
}
/*
 * Copy nBytes bytes from pSrc to pDest using SSE intrinsics.
 *
 * pDest   destination buffer, at least nBytes bytes.
 * pSrc    source buffer, at least nBytes bytes.
 * nBytes  number of bytes to copy.
 *
 * Steps:
 *   1. Copy a short prefix with memcpy() so that pDest becomes 16-byte
 *      aligned (required by the aligned stores below).
 *   2. Copy the bulk in 64-byte chunks: four 16-byte loads followed by four
 *      aligned 16-byte stores. Aligned loads are used when pSrc happens to be
 *      16-byte aligned as well, unaligned loads otherwise.
 *   3. Copy whatever is left (< 64 bytes) with memcpy().
 *
 * As with memcpy(), the buffers should not overlap.
 */
void memcpy_sse(BYTE* pDest, const BYTE* pSrc, size_t nBytes )
{
    /* Bytes needed to bring pDest up to the next 16-byte boundary (0..15). */
    size_t nHead = (size_t)((16 - ((uintptr_t)pDest & 15)) & 15);
    size_t nChunks;
    size_t i;

    if (nBytes == 0)
    {
        return;
    }

    /*
     * If the request ends before the alignment boundary there is nothing for
     * the SSE loop to do. Copying the full prefix here would overrun both
     * buffers and make the remaining byte count wrap around.
     */
    if (nHead >= nBytes)
    {
        memcpy(pDest, pSrc, nBytes);
        return;
    }

    memcpy(pDest, pSrc, nHead);
    pDest += nHead;
    pSrc += nHead;
    nBytes -= nHead;

    /* size_t counters: no truncation or sign flip for very large copies. */
    nChunks = nBytes >> 6;  /* number of 64-byte chunks */
    nBytes &= 63;           /* tail left for the final memcpy() */

    if (((uintptr_t)pSrc & 15) == 0)
    {
        /* Source and destination are both 16-byte aligned. */
        for (i = nChunks; i > 0; --i)
        {
            __m128 tmp0 = _mm_load_ps((const float *)(pSrc + 0));
            __m128 tmp1 = _mm_load_ps((const float *)(pSrc + 16));
            __m128 tmp2 = _mm_load_ps((const float *)(pSrc + 32));
            __m128 tmp3 = _mm_load_ps((const float *)(pSrc + 48));
            _mm_store_ps((float *)(pDest + 0), tmp0);
            _mm_store_ps((float *)(pDest + 16), tmp1);
            _mm_store_ps((float *)(pDest + 32), tmp2);
            _mm_store_ps((float *)(pDest + 48), tmp3);
            pSrc += 64;
            pDest += 64;
        }
    }
    else
    {
        /* Only the destination is aligned: unaligned loads, aligned stores. */
        for (i = nChunks; i > 0; --i)
        {
            __m128 tmp0 = _mm_loadu_ps((const float *)(pSrc + 0));
            __m128 tmp1 = _mm_loadu_ps((const float *)(pSrc + 16));
            __m128 tmp2 = _mm_loadu_ps((const float *)(pSrc + 32));
            __m128 tmp3 = _mm_loadu_ps((const float *)(pSrc + 48));
            _mm_store_ps((float *)(pDest + 0), tmp0);
            _mm_store_ps((float *)(pDest + 16), tmp1);
            _mm_store_ps((float *)(pDest + 32), tmp2);
            _mm_store_ps((float *)(pDest + 48), tmp3);
            pSrc += 64;
            pDest += 64;
        }
    }

    memcpy(pDest, pSrc, nBytes);
}
/*
 * Benchmark driver: copies a 5120-element DWORD array 100000 times with
 * memcpy(), MemCopySSE() and memcpy_sse() in turn, timing each run with
 * GetTickCount(), and shows the three elapsed times (milliseconds, in that
 * order, separated by '-') in a message box.
 */
int _tmain(int argc, _TCHAR* argv[])
{
    enum { ELEMENT_COUNT = 5120, ITERATIONS = 100000 };

    DWORD dwSrc[ELEMENT_COUNT];
    DWORD dwDes[ELEMENT_COUNT];
    DWORD dwStart;
    DWORD dwMemcpyMs;
    DWORD dwMemCopySSEMs;
    DWORD dwMemcpySseMs;
    TCHAR s[64];
    int i;

    (void)argc;
    (void)argv;

    /* Give the source defined contents; it was previously copied uninitialized. */
    for (i = 0; i < ELEMENT_COUNT; i++)
    {
        dwSrc[i] = (DWORD)i;
    }

    dwStart = GetTickCount();
    for (i = 0; i < ITERATIONS; i++)
    {
        memcpy(dwDes, dwSrc, sizeof(dwSrc));
    }
    dwMemcpyMs = GetTickCount() - dwStart;

    dwStart = GetTickCount();
    for (i = 0; i < ITERATIONS; i++)
    {
        MemCopySSE(dwDes, dwSrc, (int)sizeof(dwSrc));
    }
    dwMemCopySSEMs = GetTickCount() - dwStart;

    dwStart = GetTickCount();
    for (i = 0; i < ITERATIONS; i++)
    {
        memcpy_sse((BYTE *)dwDes, (const BYTE *)dwSrc, sizeof(dwSrc));
    }
    dwMemcpySseMs = GetTickCount() - dwStart;

    /* DWORD is an unsigned long, so print it with %lu rather than %d. */
    _stprintf(s, _T("%lu-%lu-%lu"),
              (unsigned long)dwMemcpyMs,
              (unsigned long)dwMemCopySSEMs,
              (unsigned long)dwMemcpySseMs);
    MessageBox(NULL, s, NULL, MB_OK);
    return 0;
}
如题,本人的机器 CPU 为 Pentium D 2.8GHz,系统为 Windows XP SP3。测试结果为 531-578-1703(单位为毫秒,依次对应 memcpy、MemCopySSE、memcpy_sse)。memcpy 代码摘自网页。大家的测试结果是怎样的呢?