--- trunk/xvidcore/src/plugins/x86_asm/plugin_ssim-a.asm 2006/10/11 13:55:32 1730 +++ branches/release-1_2-branch/xvidcore/src/plugins/x86_asm/plugin_ssim-a.asm 2008/11/28 16:54:45 1820 @@ -24,122 +24,109 @@ BITS 32 -%macro cglobal 1 - %ifdef PREFIX - %ifdef MARK_FUNCS - global _%1:function %1.endfunc-%1 - %define %1 _%1:function %1.endfunc-%1 - %else - global _%1 - %define %1 _%1 - %endif - %else - %ifdef MARK_FUNCS - global %1:function %1.endfunc-%1 - %else - global %1 - %endif - %endif -%endmacro +%include "nasm.inc" -%macro ACC_ROW 1 - movq %1,[ecx] +%macro ACC_ROW 2 + movq %1,[ TMP0] + movq %2,[TMP0+TMP1] psadbw %1,mm0 - add ecx, edx -%endmacro - - ;load a dq from mem to a xmm reg -%macro LOAD_XMM 2 - movdqu %1,[%2] - ;movhps %1,[%2+8] -%endmacro - -%macro WRITE_XMM 2 - ;movlps [%1],%2 - ;movhps [%1+8],%2 - movdqu [%1],%2 + psadbw %2,mm0 + lea TMP0, [TMP0+2*TMP1] + paddw %1, %2 %endmacro %macro CONSIM_1x8_SSE2 0 - LOAD_XMM xmm0,ecx - LOAD_XMM xmm1,edx - pxor xmm2,xmm2 + movdqu xmm0,[TMP0] + movdqu xmm1,[TMP1] ;unpack to words punpcklbw xmm0,xmm2 punpcklbw xmm1,xmm2 - ;devo - psubw xmm0,xmm6 - movaps xmm2,xmm0 - pmaddwd xmm2,xmm0 - paddd xmm3,xmm2 - - ;devc - psubw xmm1,xmm7 - movaps xmm2,xmm1 - pmaddwd xmm2,xmm1 - paddd xmm4,xmm2 - - ;corr - pmaddwd xmm1,xmm0 - paddd xmm5,xmm1 -%endmacro + movaps xmm3,xmm0 + movaps xmm4,xmm1 + pmaddwd xmm0,xmm0;orig + pmaddwd xmm1,xmm1;comp + pmaddwd xmm3,xmm4;corr + + paddd xmm5,xmm0 + paddd xmm6,xmm1 + paddd xmm7,xmm3 +%endmacro %macro CONSIM_1x8_MMX 0 - movq mm0,[ecx];orig - movq mm1,[edx];comp - pxor mm2,mm2;null vector - - ;unpack low half of qw to words - punpcklbw mm0,mm2 - punpcklbw mm1,mm2 - - ;devo - psubw mm0,mm6 - movq mm2,mm0 - pmaddwd mm2,mm0 - paddd mm3,mm2; - - ;devc - psubw mm1,mm7 - movq mm2,mm1 - pmaddwd mm2,mm1 - paddd mm4,mm2 + movq mm0,[TMP0];orig + movq mm1,[TMP1];comp + + ;unpack low half of qw to words + punpcklbw mm0,mm2 + punpcklbw mm1,mm2 + + movq mm3,mm0 + pmaddwd mm3,mm0 + paddd mm5,mm3; + + movq mm4,mm1 + pmaddwd mm4,mm1 + paddd mm6,mm4; - ;corr pmaddwd mm1,mm0 - paddd mm5,mm1 + paddd mm7,mm1 - movq mm0,[ecx] - movq mm1,[edx] - pxor mm2,mm2;null vector - - ;unpack high half of qw to words - punpckhbw mm0,mm2 - punpckhbw mm1,mm2 - - ;devo - psubw mm0,mm6 - movq mm2,mm0 - pmaddwd mm2,mm0 - paddd mm3,mm2; - - ;devc - psubw mm1,mm7 - movq mm2,mm1 - pmaddwd mm2,mm1 - paddd mm4,mm2 + movq mm0,[TMP0];orig + movq mm1,[TMP1];comp + + ;unpack high half of qw to words + punpckhbw mm0,mm2 + punpckhbw mm1,mm2 + + movq mm3,mm0 + pmaddwd mm3,mm0 + paddd mm5,mm3; + + movq mm4,mm1 + pmaddwd mm4,mm1 + paddd mm6,mm4; - ;corr pmaddwd mm1,mm0 - paddd mm5,mm1 + paddd mm7,mm1 %endmacro +%macro CONSIM_WRITEOUT 3 + mov eax,prm4d;lumo + mul eax; lumo^2 + add eax, 32 + shr eax, 6; 64*lum0^2 + movd TMP0d,%1 + sub TMP0d, eax + + mov TMP1,prm6; pdevo + mov dword [TMP1],TMP0d + + mov eax,prm5d ;lumc + mul eax; lumc^2 + add eax, 32 + shr eax, 6; 64*lumc^2 + movd TMP0d,%2 + sub TMP0d, eax + + mov TMP1,prm7; pdevc + mov dword [TMP1],TMP0d + mov eax,prm4d;lumo + mul prm5d; lumo*lumc, should fit in _EAX + add eax, 32 + shr eax, 6; 64*lumo*lumc + movd TMP0d,%3 + sub TMP0d, eax + + mov TMP1,prm8; pcorr + mov dword [TMP1],TMP0d +%endmacro -SECTION .text + +SECTION .rotext align=SECTION_ALIGN cglobal lum_8x8_mmx cglobal consim_sse2 @@ -147,161 +134,142 @@ ;int lum_8x8_c(uint8_t* ptr, uint32_t stride) -ALIGN 16 +ALIGN SECTION_ALIGN lum_8x8_mmx: - mov ecx, [esp + 4] ;ptr - mov edx, [esp + 8];stride + mov TMP0, prm1 ;ptr + mov TMP1, prm2 ;stride pxor mm0,mm0 - ACC_ROW mm1 - ACC_ROW mm2 - paddw mm1 ,mm2 + ACC_ROW mm1, mm2 - ACC_ROW mm3 - ACC_ROW mm4 - paddw mm3 ,mm4 - - ACC_ROW mm5 - ACC_ROW mm6 - paddw mm5, mm6 - - ACC_ROW mm7 - ACC_ROW mm4 - paddw mm7, mm4 + ACC_ROW mm3, mm4 + + ACC_ROW mm5, mm6 + + ACC_ROW mm7, mm4 paddw mm1, mm3 paddw mm5, mm7 paddw mm1, mm5 movd eax,mm1 - emms ret -.endfunc - -ALIGN 16 -consim_mmx: - mov ecx,[esp+4] ;ptro - pxor mm6,mm6; - - mov edx,[esp+8] ;ptrc - pxor mm3,mm3;devo - pxor mm4,mm4;devc - movd mm6,[esp + 16];lumo - pxor mm7,mm7 - mov eax,[esp+12];stride - movd mm7,[esp + 20];lumc - pshufw mm6,mm6,00000000b - pxor mm5,mm5;corr - pshufw mm7,mm7,00000000b - - CONSIM_1x8_MMX - add ecx,eax - add edx,eax - CONSIM_1x8_MMX - add ecx,eax - add edx,eax - CONSIM_1x8_MMX - add ecx,eax - add edx,eax - CONSIM_1x8_MMX - add ecx,eax - add edx,eax - CONSIM_1x8_MMX - add ecx,eax - add edx,eax - CONSIM_1x8_MMX - add ecx,eax - add edx,eax - CONSIM_1x8_MMX - add ecx,eax - add edx,eax - CONSIM_1x8_MMX - - pshufw mm0,mm3,01001110b - paddd mm3,mm0 - pshufw mm1,mm4,01001110b - paddd mm4,mm1 - pshufw mm2,mm5,01001110b - paddd mm5,mm2 - - ;load target pointer - mov ecx,[esp + 24]; pdevo - movd [ecx],mm3 - mov edx,[esp + 28]; pdevc - movd [edx],mm4 - mov eax,[esp + 32]; corr - movd [eax],mm5 - emms - ret -.endfunc +ENDFUNC +ALIGN SECTION_ALIGN consim_sse2: - mov ecx,[esp+4] ;ptro - pxor xmm6,xmm6; - mov edx,[esp+8] ;ptrc - pxor xmm3,xmm3;devo - pxor xmm4,xmm4;devc - movd xmm6,[esp + 16];lumo - pxor xmm7,xmm7 - mov eax,[esp+12];stride - movd xmm7,[esp + 20];lumc - pxor xmm5,xmm5;corr + mov TMP0,prm1 ;ptro + mov TMP1,prm2 ;ptrc + mov _EAX, prm3 ;stride + + pxor xmm2,xmm2;null vektor + pxor xmm5,xmm5;devo + pxor xmm6,xmm6;devc + pxor xmm7,xmm7;corr ;broadcast lumo/c - ;punpcklbw xmm6,xmm6 + punpcklbw xmm6,xmm6 punpcklwd xmm6,xmm6 pshufd xmm6,xmm6,00000000b;or shufps - ;punpcklbw xmm7,xmm7 + punpcklbw xmm7,xmm7 punpcklwd xmm7,xmm7 pshufd xmm7,xmm7,00000000b CONSIM_1x8_SSE2 - add ecx,eax - add edx,eax + add TMP0,_EAX + add TMP1,_EAX CONSIM_1x8_SSE2 - add ecx,eax - add edx,eax + add TMP0,_EAX + add TMP1,_EAX CONSIM_1x8_SSE2 - add ecx,eax - add edx,eax + add TMP0,_EAX + add TMP1,_EAX CONSIM_1x8_SSE2 - add ecx,eax - add edx,eax + add TMP0,_EAX + add TMP1,_EAX CONSIM_1x8_SSE2 - add ecx,eax - add edx,eax + add TMP0,_EAX + add TMP1,_EAX CONSIM_1x8_SSE2 - add ecx,eax - add edx,eax + add TMP0,_EAX + add TMP1,_EAX CONSIM_1x8_SSE2 - add ecx,eax - add edx,eax + add TMP0,_EAX + add TMP1,_EAX CONSIM_1x8_SSE2 -;accumulate xmm3-5 - pshufd xmm0, xmm3, 0EH ; Get bit 64-127 from xmm1 (or use movhlps) - paddd xmm3, xmm0 ; Sums are in 2 dwords - pshufd xmm0, xmm3, 01H ; Get bit 32-63 from xmm0 - paddd xmm3, xmm0 ; Sum is in one dword - - pshufd xmm1, xmm4, 0EH ; Get bit 64-127 from xmm1 (or use movhlps) - paddd xmm4, xmm1 ; Sums are in 2 dwords - pshufd xmm1, xmm4, 01H ; Get bit 32-63 from xmm0 - paddd xmm4, xmm1 ; Sum is in one dword - - pshufd xmm2, xmm5, 0EH ; Get bit 64-127 from xmm1 (or use movhlps) - paddd xmm5, xmm2 ; Sums are in 2 dwords - pshufd xmm2, xmm5, 01H ; Get bit 32-63 from xmm0 - paddd xmm5, xmm2 ; Sum is in one dword - - - ;load target pointer - mov ecx,[esp + 24]; pdevo - movd [ecx],xmm3 - mov edx,[esp + 28]; pdevc - movd [edx],xmm4 - mov eax,[esp + 32]; corr - movd [eax],xmm5 + ;accumulate xmm5-7 + pshufd xmm0, xmm5, 0x0E + paddd xmm5, xmm0 + pshufd xmm0, xmm5, 0x01 + paddd xmm5, xmm0 + + pshufd xmm1, xmm6, 0x0E + paddd xmm6, xmm1 + pshufd xmm1, xmm6, 0x01 + paddd xmm6, xmm1 + + pshufd xmm2, xmm7, 0x0E + paddd xmm7, xmm2 + pshufd xmm2, xmm7, 0x01 + paddd xmm7, xmm2 + + CONSIM_WRITEOUT xmm5,xmm6,xmm7 ret -.endfunc \ No newline at end of file +ENDFUNC + + + + + +ALIGN SECTION_ALIGN +consim_mmx: + mov TMP0,prm1 ;ptro + mov TMP1,prm2 ;ptrc + mov _EAX,prm3;stride + pxor mm2,mm2;null + pxor mm5,mm5;devo + pxor mm6,mm6;devc + pxor mm7,mm7;corr + + CONSIM_1x8_MMX + add TMP0,_EAX + add TMP1,_EAX + CONSIM_1x8_MMX + add TMP0,_EAX + add TMP1,_EAX + CONSIM_1x8_MMX + add TMP0,_EAX + add TMP1,_EAX + CONSIM_1x8_MMX + add TMP0,_EAX + add TMP1,_EAX + CONSIM_1x8_MMX + add TMP0,_EAX + add TMP1,_EAX + CONSIM_1x8_MMX + add TMP0,_EAX + add TMP1,_EAX + CONSIM_1x8_MMX + add TMP0,_EAX + add TMP1,_EAX + CONSIM_1x8_MMX + + movq mm0,mm5 + psrlq mm0,32 + paddd mm5,mm0 + movq mm1,mm6 + psrlq mm1,32 + paddd mm6,mm1 + movq mm2,mm7 + psrlq mm2,32 + paddd mm7,mm2 + + CONSIM_WRITEOUT mm5,mm6,mm7 + ret +ENDFUNC + +%ifidn __OUTPUT_FORMAT__,elf +section ".note.GNU-stack" noalloc noexec nowrite progbits +%endif