65,182
社区成员




/*
 * Accumulate one 16-bin histogram into another: y[k] += x[k] for k = 0..15.
 * Bins are uint16_t, so every sum is reduced modulo 65536 when stored.
 */
static inline void histogram_add( const uint16_t x[16], uint16_t y[16] )
{
    const uint16_t *src = x;
    uint16_t *dst = y;
    uint16_t * const end = y + 16;

    while ( dst != end ) {
        *dst = (uint16_t) ( *dst + *src );
        ++dst;
        ++src;
    }
}
/*
 * Scaled accumulate of one 16-bin histogram into another:
 * y[i] += a * x[i] for i = 0..15, with each result reduced modulo 65536
 * when it is stored back into the uint16_t bin.
 *
 * a: scale factor applied to every bin of x.
 * x: source histogram (read only).
 * y: destination histogram, updated in place.
 */
static inline void histogram_muladd( const uint16_t a, const uint16_t x[16],
        uint16_t y[16] )
{
    for ( int i = 0; i < 16; ++i ) {
        /*
         * Both operands are uint16_t and would be promoted to signed int,
         * so a plain a * x[i] overflows int (undefined behaviour) once the
         * product exceeds INT_MAX, e.g. 65535 * 65535.  Multiplying in
         * uint32_t keeps the arithmetic well defined and yields the same
         * low 16 bits.
         */
        const uint32_t product = (uint32_t) a * x[i];
        y[i] = (uint16_t) ( y[i] + product );
    }
}
1>ctmf.c(169): error C2718: “__m256i”: 具有 __declspec(align('32')) 的实参将不被对齐
1>ctmf.c(169): error C2718: “__m256i”: 具有 __declspec(align('32')) 的实参将不被对齐
1>ctmf.c(169): error C2440: “=”: 无法从“int”转换为“__m256i”
1>
请问应该怎么修改?
#if defined(__SSE2__)
/*
 * SSE2 variant of histogram_add: adds the 16 uint16_t bins of x into y
 * with two _mm_add_epi16 operations, one per group of eight bins.
 * NOTE(review): both arrays are accessed through __m128i pointers, so they
 * presumably must be 16-byte aligned -- confirm at the call sites.
 */
static inline void histogram_add( const uint16_t x[16], uint16_t y[16] )
{
    const __m128i *src = (const __m128i *) x;
    __m128i *dst = (__m128i *) y;

    dst[0] = _mm_add_epi16( dst[0], src[0] );   /* bins 0..7  */
    dst[1] = _mm_add_epi16( dst[1], src[1] );   /* bins 8..15 */
}
#elif defined(__MMX__)
/*
 * MMX variant of histogram_add: adds the 16 uint16_t bins of x into y,
 * four bins at a time, using _mm_add_pi16 on 64-bit lanes.
 * Note that this function does not call _mm_empty() itself.
 */
static inline void histogram_add( const uint16_t x[16], uint16_t y[16] )
{
    const __m64 *src = (const __m64 *) x;
    __m64 *dst = (__m64 *) y;
    int lane;

    /* Four 64-bit lanes cover bins 0..3, 4..7, 8..11 and 12..15. */
    for ( lane = 0; lane < 4; ++lane ) {
        dst[lane] = _mm_add_pi16( dst[lane], src[lane] );
    }
}
#elif defined(__ALTIVEC__)
/*
 * AltiVec variant of histogram_add: adds the 16 unsigned 16-bit bins of x
 * into the matching bins of y with two vec_add calls, one on the vector
 * starting at index 0 and one on the vector starting at index 8.
 * NOTE(review): both arrays are dereferenced as `vector unsigned short`, so
 * they presumably must satisfy that type's alignment -- confirm at the
 * call sites.
 */
static inline void histogram_add( const uint16_t x[16], uint16_t y[16] )
{
/* First vector: elements starting at index 0. */
*(vector unsigned short*) &y[0] = vec_add( *(vector unsigned short*) &y[0], *(vector unsigned short*) &x[0] );
/* Second vector: elements starting at index 8. */
*(vector unsigned short*) &y[8] = vec_add( *(vector unsigned short*) &y[8], *(vector unsigned short*) &x[8] );
}
#else
[/quote]
我这个从0.07s提升到0.05s了,不知道算不算提升……Release下面提示了0.01s。
#include <intrin.h>
typedef unsigned __int16 WORD;
/*
 * Adds the 16 WORD bins of x into y using SSE2: two aligned 128-bit loads
 * per array, two _mm_add_epi16 additions, and two aligned 128-bit stores
 * back into y.
 * _mm_load_si128 / _mm_store_si128 are the aligned forms, so x and y must
 * be 16-byte aligned.
 */
__inline void histogram_add_Intrinsic_SSE2(const WORD x[16], WORD y[16])
{
    const __m128i *src = (const __m128i *)x;
    __m128i *dst = (__m128i *)y;

    /* Both sums are computed before anything is written back to y. */
    const __m128i sum_lo = _mm_add_epi16(_mm_load_si128(&src[0]), _mm_load_si128(&dst[0]));
    const __m128i sum_hi = _mm_add_epi16(_mm_load_si128(&src[1]), _mm_load_si128(&dst[1]));

    _mm_store_si128(&dst[0], sum_lo);   /* bins 0..7  */
    _mm_store_si128(&dst[1], sum_hi);   /* bins 8..15 */
}
/*
 * Adds the 16 WORD bins of x into y using AVX2: one aligned 256-bit load
 * per array, a single _mm256_add_epi16, and one aligned 256-bit store
 * back into y.
 * _mm256_load_si256 / _mm256_store_si256 are the aligned forms, so x and y
 * must be 32-byte aligned.
 */
__inline void histogram_add_Intrinsic_AVX2(const WORD x[16], WORD y[16])
{
    __m256i *dst = (__m256i *)y;
    const __m256i addend = _mm256_load_si256((const __m256i *)x);
    const __m256i sum = _mm256_add_epi16(addend, _mm256_load_si256(dst));

    _mm256_store_si256(dst, sum);
}
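注意:SSE2 版本传入的数据需要 16 字节对齐,AVX2 版本传入的数据需要 32 字节对齐;否则要改用非对齐的加载/存储指令(_mm_loadu_si128 / _mm_storeu_si128,_mm256_loadu_si256 / _mm256_storeu_si256)。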
#if defined(__SSE2__)
/*
 * SSE2 variant of histogram_add: adds the 16 uint16_t bins of x into y
 * with two _mm_add_epi16 operations, one per group of eight bins.
 * NOTE(review): both arrays are accessed through __m128i pointers, so they
 * presumably must be 16-byte aligned -- confirm at the call sites.
 */
static inline void histogram_add( const uint16_t x[16], uint16_t y[16] )
{
    const __m128i *src = (const __m128i *) x;
    __m128i *dst = (__m128i *) y;

    dst[0] = _mm_add_epi16( dst[0], src[0] );   /* bins 0..7  */
    dst[1] = _mm_add_epi16( dst[1], src[1] );   /* bins 8..15 */
}
#elif defined(__MMX__)
/*
 * MMX variant of histogram_add: adds the 16 uint16_t bins of x into y,
 * four bins at a time, using _mm_add_pi16 on 64-bit lanes.
 * Note that this function does not call _mm_empty() itself.
 */
static inline void histogram_add( const uint16_t x[16], uint16_t y[16] )
{
    const __m64 *src = (const __m64 *) x;
    __m64 *dst = (__m64 *) y;
    int lane;

    /* Four 64-bit lanes cover bins 0..3, 4..7, 8..11 and 12..15. */
    for ( lane = 0; lane < 4; ++lane ) {
        dst[lane] = _mm_add_pi16( dst[lane], src[lane] );
    }
}
#elif defined(__ALTIVEC__)
/*
 * AltiVec variant of histogram_add: adds the 16 unsigned 16-bit bins of x
 * into the matching bins of y with two vec_add calls, one on the vector
 * starting at index 0 and one on the vector starting at index 8.
 * NOTE(review): both arrays are dereferenced as `vector unsigned short`, so
 * they presumably must satisfy that type's alignment -- confirm at the
 * call sites.
 */
static inline void histogram_add( const uint16_t x[16], uint16_t y[16] )
{
/* First vector: elements starting at index 0. */
*(vector unsigned short*) &y[0] = vec_add( *(vector unsigned short*) &y[0], *(vector unsigned short*) &x[0] );
/* Second vector: elements starting at index 8. */
*(vector unsigned short*) &y[8] = vec_add( *(vector unsigned short*) &y[8], *(vector unsigned short*) &x[8] );
}
#else