有人用过 SSE 吗?为什么我发现 SSE 居然比 C 还慢?帖子内有代码,大家可以自己编译运行试试看!
梁跃 2006-04-11 09:47:37 下面的代码用于把向量归一化为单位向量,可以直接编译运行。我发现 C 的版本要远快于 SSE 的版本,不知道是程序写得有问题,还是 SSE 的确不行!
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <conio.h>
/*
 * Three-component float vector (x, y, z).
 * Declared with MSVC's align(16) attribute, i.e. the 16-byte alignment that
 * aligned SSE memory operands such as movaps require.
 * Note that the array itself holds only three floats (12 bytes); there is no
 * fourth element behind it.
 */
typedef _declspec(align(16)) float vec3_t[3];
/*
 * Normalize a 3-component vector in place using SSE (MSVC 32-bit inline asm).
 *
 * vec: pointer to three floats (x, y, z); overwritten with vec / |vec|.
 *
 * vec3_t holds only three floats (12 bytes), so the vector is loaded and
 * stored as 8 + 4 bytes instead of with a 16-byte movaps, which would read
 * and write 4 bytes past the end of the array and feed an undefined fourth
 * lane through the multiply/divide. The fourth lane is forced to 0 here.
 *
 * A zero-length input is not special-cased: the division by 0 yields
 * non-finite components.
 */
inline void vec_normalize_sse(vec3_t vec)
{
    _asm {
        mov     esi, vec

        /* Load x, y, z without touching memory beyond the 12-byte array. */
        movss   xmm0, dword ptr [esi+8]     /* xmm0 = [z, 0, 0, 0] */
        movlhps xmm0, xmm0                  /* xmm0 = [z, 0, z, 0] */
        movlps  xmm0, qword ptr [esi]       /* xmm0 = [x, y, z, 0] */

        /* Square every lane. */
        movaps  xmm1, xmm0
        mulps   xmm1, xmm1                  /* xmm1 = [xx, yy, zz, 0] */

        /* Two shuffled copies so that lane 0 ends up as xx + yy + zz. */
        movaps  xmm2, xmm1
        shufps  xmm2, xmm1, 0xe1            /* xmm2 = [yy, xx, zz, 0] */
        movaps  xmm3, xmm1
        shufps  xmm3, xmm1, 0xc6            /* xmm3 = [zz, yy, xx, 0] */
        addps   xmm1, xmm2
        addps   xmm1, xmm3                  /* xmm1 lane 0 = xx + yy + zz */

        /* Broadcast the squared length, take the root, divide. */
        shufps  xmm1, xmm1, 0x00
        sqrtps  xmm1, xmm1                  /* every lane = |vec| */
        divps   xmm0, xmm1                  /* xmm0 = [x, y, z, 0] / |vec| */

        /* Store x, y, z only (8 + 4 bytes). */
        movlps  qword ptr [esi], xmm0
        movhlps xmm0, xmm0                  /* bring z down to lane 0 */
        movss   dword ptr [esi+8], xmm0
    }
}
/*
 * Normalize a 3-component vector in place (plain C reference version).
 *
 * vec: three floats (x, y, z); each component is multiplied by 1 / |vec|.
 * A zero-length input is not special-cased.
 */
inline void vec_normalize_c(vec3_t vec)
{
    float sq_len, norm, scale;
    int k;

    /* Squared Euclidean length, summed left to right. */
    sq_len = vec[0]*vec[0] + vec[1]*vec[1] + vec[2]*vec[2];

    /* One square root and one division, then three multiplies. */
    norm = (float)sqrt(sq_len);
    scale = 1.0f/norm;

    for (k = 0; k < 3; k++) {
        vec[k] *= scale;
    }
}
/*
 * Benchmark driver: normalizes the vector (1, 2, 3) ten million times with
 * the SSE routine and then with the C routine, printing for each pass the
 * elapsed clock() ticks and the resulting vector. Waits for a key press
 * before exiting.
 */
int main()
{
    vec3_t v;
    int iterations, remaining, t_begin, t_end;

    iterations = 10000000;

    /* Pass 1: SSE implementation. */
    v[0] = 1.0f;
    v[1] = 2.0f;
    v[2] = 3.0f;
    remaining = iterations;
    t_begin = clock();
    while (remaining > 0) {
        vec_normalize_sse(v);
        remaining--;
    }
    t_end = clock();
    printf("sse = %d, %f, %f, %f\n", t_end - t_begin, v[0], v[1], v[2]);

    /* Pass 2: C implementation, starting from the same input vector. */
    v[0] = 1.0f;
    v[1] = 2.0f;
    v[2] = 3.0f;
    remaining = iterations;
    t_begin = clock();
    while (remaining > 0) {
        vec_normalize_c(v);
        remaining--;
    }
    t_end = clock();
    printf("c = %d, %f, %f, %f\n", t_end - t_begin, v[0], v[1], v[2]);

    /* Keep the console window open until a key is pressed. */
    getch();
    return 0;
}