[讨论]复数乘法SSE,SSE2优化
float版的SSE优化来源于AMD官方,double版的SSE2优化
源代码(包含初级版的FFT大数乘法,源代码已经测试exe):
http://www.77studio.net/files/cmplx_test.rar
SSE2版本没有经过测试(我的CPU不支持),欢迎测试,讨论
下面是float版的复数乘法,一次同时计算4个:
__declspec(naked)
void cmplx_multiply_sse(float *x, float *y, int num_cmplx_elem, float *prod)
{
__asm
{
;
; TO ASSEMBLE INTO *.obj DO THE FOLLOWING:
; ml.exe -coff -c cmplx_multiply_sse.asm
;
;.586
;.K3D
;.XMM
;_TEXT SEGMENT
;PUBLIC _cmplx_multiply_sse
;_cmplx_multiply_sse PROC NEAR
;==============================================================================
; INSTRUCTIONS BELOW SAVE THE REGISTER STATE WITH WHICH THIS ROUTINE WAS ENTERED
; REGISTERS (EAX, ECX, EDX ARE CONSIDERED VOLATILE AND ASSUMED TO BE CHANGED)
; WHILE THE REGISTERS BELOW MUST BE PRESERVED IF THE USER IS CHANGING THEM
push ebp
mov ebp, esp
;==============================================================================
; parameters passed into routine:
; [ebp+8] = ->x
; [ebp+12] = ->y
; [ebp+16] = num_cmplx_elem
; [ebp+20] = ->prod
;==============================================================================
push ebx ; preserve contents in ebx,esi, and edi on stack
push esi ;
push edi ;
;===============================================================================
; THE CODE BELOW PUTS THE FLOATING POINT SIGN MASK
; [800000000000000800000000000000h]
; TO FLIP THE SIGN OF PACKED SINGLE PRECISION NUMBERS BY USING XORPS
;==============================================================================
mov eax, esp ; Copy stack pointer into EAX.
mov ebx, 16
sub esp, 32 ; Subtract 32 bytes from stack pointer.
and eax, 15 ; AND old stack pointer address with 15 to
; determine # of bytes the address is past a
; 16-byte-aligned address.
sub ebx, eax ; EBX = # of bytes above ESP to next
; 16-byte-aligned address
mov edi, 0h ; EDI = 00000000h
mov esi, 80000000h ; ESI = 80000000h
shr ebx, 2 ; EBX = # of DWORDs past 16-byte-aligned address
mov [esp+4*ebx+12], esi ; Move into address esp+4*ebx the single-precision
mov [esp+4*ebx+8], edi ; floating-point sign mask.
mov [esp+4*ebx+4], esi
mov [esp+4*ebx], edi
;==============================================================================
; THE 4 ASM LINES BELOW LOAD THE FUNCTION s ARGUMENTS INTO GENERAL-PURPOSE
; REGISTERS (GPRS)
; esi = address of array "x"
; edi = address of array "y"
; ecx = # of cmplx products to compute
; eax = address of product to which results are stored
;==============================================================================
mov esi, [ebp+8] ; esi = ->x
mov edi, [ebp+12] ; edi = ->y
mov ecx, [ebp+16] ; ecx = num_cmplx_elem
mov eax, [ebp+20] ; eax = ->prod
;==============================================================================
; THE 6 ASM LINES BELOW OFFSET THE ADDRESS TO THE ARRAYS x[] AND y[] SUCH
; THAT THEY CAN BE ACCESSED IN THE MOST EFFICIENT MANNER AS ILLUSTRATED
; BELOW IN THE LOOP mult4cmplxnum_loop WITH THE MINIMUM NUMBER OF
; ADDRESS INCREMENTS
;==============================================================================
mov edx, ecx ; edx = num_cmplx_elem
neg ecx ; ecx = -num_cmplx_elem
shl edx, 3 ; edx = 8 * num_cmplx_elem = # bytes in x[] and y[] to multiply
add esi, edx ; esi = -> to last element of x[] to multiply
add edi, edx ; edi = -> to last element of y[] to multiply
add eax, edx ; eax = -> end of prod[] to calculate
;==============================================================================
; THIS LOOP MULTIPLIES 4 COMPLEX #s FROM "x[]" UPON 4 COMPLEX #s FROM "y[]"
; AND RETURNS THE PRODUCT IN "prod[]".
;==============================================================================
ALIGN 16 ; Align address of loop to a 16-byte boundary.
eight_cmplx_prod_loop:
///*
movaps xmm0, [esi+ecx*8] ; xmm0=[x1i,x1r,x0i,x0r]
movaps xmm1, [esi+ecx*8+16] ; xmm1=[x3i,x3r,x2i,x2r]
movaps xmm4, [edi+ecx*8] ; xmm4=[y1i,y1r,y0i,y0r]
movaps xmm5, [edi+ecx*8+16] ; xmm5=[y3i,y3r,y2i,y2r]
movaps xmm2, xmm0 ; xmm2=[x1i,x1r,x0i,x0r]
movaps xmm3, xmm1 ; xmm3=[x3i,x3r,x2i,x2r]
movaps xmm6, xmm4 ; xmm6=[y1i,y1r,y0i,y0r]
movaps xmm7, xmm5 ; xmm7=[y3i,y3r,y2i,y2r]
//*/
/*
// 数据无法对齐时可以用这个
movups xmm0, [esi+ecx*8] ; xmm0=[x1i,x1r,x0i,x0r]
movups xmm1, [esi+ecx*8+16] ; xmm1=[x3i,x3r,x2i,x2r]
movups xmm4, [edi+ecx*8] ; xmm4=[y1i,y1r,y0i,y0r]
movups xmm5, [edi+ecx*8+16] ; xmm5=[y3i,y3r,y2i,y2r]
movups xmm2, xmm0 ; xmm2=[x1i,x1r,x0i,x0r]
movups xmm3, xmm1 ; xmm3=[x3i,x3r,x2i,x2r]
movups xmm6, xmm4 ; xmm6=[y1i,y1r,y0i,y0r]
movups xmm7, xmm5 ; xmm7=[y3i,y3r,y2i,y2r]
//*/
shufps xmm0, xmm0, 10100000b ; xmm0=[x1r,x1r,x0r,x0r]
shufps xmm1, xmm1, 10100000b ; xmm1=[x3r,x3r,x2r,x2r]
shufps xmm2, xmm2, 11110101b ; xmm2=[x1i,x1i,x0i,x0i]
shufps xmm3, xmm3, 11110101b ; xmm3=[x3i,x3i,x2i,x2i]
xorps xmm6, [esp+4*ebx] ; xmm6=[-y1i,y1r,-y0i,y0r]
xorps xmm7, [esp+4*ebx] ; xmm7=[-y3i,y3r,-y2i,y2r]
mulps xmm0, xmm4 ; xmm0=[x1r*y1i,x1r*y1r,x0r*y0i,x0r*y0r]
mulps xmm1, xmm5 ; xmm1=[x3r*y3i,x3r*y3r,x2r*y2i,x2r*y2r]
shufps xmm7, xmm7, 10110001b ; xmm7=[y3r,-y3i,y2r,-y2i]
mulps xmm2, xmm6 ; xmm2=[x1i*y1r,-x1i*y1i,x0i*y0r,-x0i*y0i]
mulps xmm3, xmm7 ; xmm3=[x3i*y3r,-x3i*y3i,x2i*y2r,-x2i*y2i]
addps xmm0, xmm2 ; xmm0=[x1r*y1i+x1i*y1r,x1r*y1r-x1i*y1i,
; x0r*y0i+x0i*y0r,x0r*y0r-x0i*y0i]
addps xmm1, xmm3 ; xmm1=[x3r*y3i+x3i*y3r,x3r*y3r-x3i*y3i,
; x2r*y2i+x2i*y2r,x2r*y2r-x2i*y2i]
movntps [eax+ecx*8], xmm0 ; Stream XMM0 and XMM1 to representative
movntps [eax+ecx*8+16], xmm1 ; memory address of prod[].
add ecx, 4 ; ECX = ECX + 4, 一次计算了4个(float)复数
jnz eight_cmplx_prod_loop
sfence ; Finish all memory writes.
;==============================================================================
; INSTRUCTIONS BELOW RESTORE THE REGISTER STATE WITH WHICH THIS ROUTINE WAS
; ENTERED
; REGISTERS EAX, ECX, AND EDX ARE CONSIDERED VOLATILE AND ASSUMED TO BE CHANGED
; WHILE THE REGISTERS BELOW MUST BE PRESERVED IF THE USER IS CHANGING THEM
add esp, 32
pop edi
pop esi
pop ebx
mov esp, ebp
pop ebp
;==============================================================================
ret
}
}