--- trunk/xvidcore/src/motion/x86_asm/sad_sse2.asm	2004/08/22 11:46:10	1535
+++ trunk/xvidcore/src/motion/x86_asm/sad_sse2.asm	2008/11/26 01:04:34	1795
@@ -20,52 +20,33 @@
 ; * along with this program; if not, write to the Free Software
 ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ; *
-; * $Id: sad_sse2.asm,v 1.11 2004-08-22 11:46:10 edgomez Exp $
+; * $Id: sad_sse2.asm,v 1.16 2008-11-26 01:04:34 Isibaar Exp $
 ; *
 ; ***************************************************************************/
 
-BITS 32
-
-%macro cglobal 1
-  %ifdef PREFIX
-    %ifdef MARK_FUNCS
-      global _%1:function
-      %define %1 _%1:function
-    %else
-      global _%1
-      %define %1 _%1
-    %endif
-  %else
-    %ifdef MARK_FUNCS
-      global %1:function
-    %else
-      global %1
-    %endif
-  %endif
-%endmacro
+%include "nasm.inc"
 
 ;=============================================================================
 ; Read only data
 ;=============================================================================
 
-%ifdef FORMAT_COFF
-SECTION .rodata
-%else
-SECTION .rodata align=16
-%endif
+DATA
 
-ALIGN 64
+ALIGN SECTION_ALIGN
 zero times 4 dd 0
 
 ;=============================================================================
 ; Code
 ;=============================================================================
 
-SECTION .text
+SECTION .rotext align=SECTION_ALIGN
 
 cglobal sad16_sse2
 cglobal dev16_sse2
 
+cglobal sad16_sse3
+cglobal dev16_sse3
+
 ;-----------------------------------------------------------------------------
 ; uint32_t sad16_sse2 (const uint8_t * const cur, <- assumed aligned!
 ;                      const uint8_t * const ref,
@@ -74,50 +55,61 @@
 ;-----------------------------------------------------------------------------
 
-%macro SAD_16x16_SSE2 0
-  movdqu xmm0, [edx]
-  movdqu xmm1, [edx+ecx]
-  lea edx,[edx+2*ecx]
-  movdqa xmm2, [eax]
-  movdqa xmm3, [eax+ecx]
-  lea eax,[eax+2*ecx]
+%macro SAD_16x16_SSE2 1
+  %1 xmm0, [TMP1]
+  %1 xmm1, [TMP1+TMP0]
+  lea TMP1,[TMP1+2*TMP0]
+  movdqa xmm2, [_EAX]
+  movdqa xmm3, [_EAX+TMP0]
+  lea _EAX,[_EAX+2*TMP0]
   psadbw xmm0, xmm2
   paddusw xmm6,xmm0
   psadbw xmm1, xmm3
   paddusw xmm6,xmm1
 %endmacro
 
-ALIGN 16
-sad16_sse2:
-  mov eax, [esp+ 4] ; cur (assumed aligned)
-  mov edx, [esp+ 8] ; ref
-  mov ecx, [esp+12] ; stride
+%macro SAD16_SSE2_SSE3 1
+  mov _EAX, prm1 ; cur (assumed aligned)
+  mov TMP1, prm2 ; ref
+  mov TMP0, prm3 ; stride
 
   pxor xmm6, xmm6 ; accum
 
-  SAD_16x16_SSE2
-  SAD_16x16_SSE2
-  SAD_16x16_SSE2
-  SAD_16x16_SSE2
-  SAD_16x16_SSE2
-  SAD_16x16_SSE2
-  SAD_16x16_SSE2
-  SAD_16x16_SSE2
+  SAD_16x16_SSE2 %1
+  SAD_16x16_SSE2 %1
+  SAD_16x16_SSE2 %1
+  SAD_16x16_SSE2 %1
+  SAD_16x16_SSE2 %1
+  SAD_16x16_SSE2 %1
+  SAD_16x16_SSE2 %1
+  SAD_16x16_SSE2 %1
 
   pshufd xmm5, xmm6, 00000010b
   paddusw xmm6, xmm5
   pextrw eax, xmm6, 0
 
   ret
+%endmacro
+
+ALIGN SECTION_ALIGN
+sad16_sse2:
+  SAD16_SSE2_SSE3 movdqu
+ENDFUNC
+
+
+ALIGN SECTION_ALIGN
+sad16_sse3:
+  SAD16_SSE2_SSE3 lddqu
+ENDFUNC
 
 ;-----------------------------------------------------------------------------
 ; uint32_t dev16_sse2(const uint8_t * const cur, const uint32_t stride);
 ;-----------------------------------------------------------------------------
 
-%macro MEAN_16x16_SSE2 0 ; eax: src, ecx:stride, mm7: zero or mean => mm6: result
-  movdqu xmm0, [eax]
-  movdqu xmm1, [eax+ecx]
-  lea eax, [eax+2*ecx] ; + 2*stride
+%macro MEAN_16x16_SSE2 1 ; _EAX: src, TMP0:stride, mm7: zero or mean => mm6: result
+  %1 xmm0, [_EAX]
+  %1 xmm1, [_EAX+TMP0]
+  lea _EAX, [_EAX+2*TMP0] ; + 2*stride
   psadbw xmm0, xmm7
   paddusw xmm6, xmm0
   psadbw xmm1, xmm7
@@ -125,25 +117,24 @@
 %endmacro
 
-ALIGN 16
-dev16_sse2:
-  mov eax, [esp+ 4] ; src
-  mov ecx, [esp+ 8] ; stride
+%macro MEAN16_SSE2_SSE3 1
+  mov _EAX, prm1 ; src
+  mov TMP0, prm2 ; stride
 
   pxor xmm6, xmm6 ; accum
   pxor xmm7, xmm7 ; zero
 
-  MEAN_16x16_SSE2
-  MEAN_16x16_SSE2
-  MEAN_16x16_SSE2
-  MEAN_16x16_SSE2
-
-  MEAN_16x16_SSE2
-  MEAN_16x16_SSE2
-  MEAN_16x16_SSE2
-  MEAN_16x16_SSE2
+  MEAN_16x16_SSE2 %1
+  MEAN_16x16_SSE2 %1
+  MEAN_16x16_SSE2 %1
+  MEAN_16x16_SSE2 %1
+
+  MEAN_16x16_SSE2 %1
+  MEAN_16x16_SSE2 %1
+  MEAN_16x16_SSE2 %1
+  MEAN_16x16_SSE2 %1
 
-  mov eax, [esp+ 4] ; src again
+  mov _EAX, prm1 ; src again
 
   pshufd xmm7, xmm6, 10b
   paddusw xmm7, xmm6
@@ -153,17 +144,34 @@
   packuswb xmm7, xmm7
   pshufd xmm7, xmm7, 00000000b
 
-  MEAN_16x16_SSE2
-  MEAN_16x16_SSE2
-  MEAN_16x16_SSE2
-  MEAN_16x16_SSE2
-
-  MEAN_16x16_SSE2
-  MEAN_16x16_SSE2
-  MEAN_16x16_SSE2
-  MEAN_16x16_SSE2
+  MEAN_16x16_SSE2 %1
+  MEAN_16x16_SSE2 %1
+  MEAN_16x16_SSE2 %1
+  MEAN_16x16_SSE2 %1
+
+  MEAN_16x16_SSE2 %1
+  MEAN_16x16_SSE2 %1
+  MEAN_16x16_SSE2 %1
+  MEAN_16x16_SSE2 %1
 
   pshufd xmm7, xmm6, 10b
   paddusw xmm7, xmm6
 
   pextrw eax, xmm7, 0
   ret
+%endmacro
+
+ALIGN SECTION_ALIGN
+dev16_sse2:
+  MEAN16_SSE2_SSE3 movdqu
+ENDFUNC
+
+ALIGN SECTION_ALIGN
+dev16_sse3:
+  MEAN16_SSE2_SSE3 lddqu
+ENDFUNC
+
+
+%ifidn __OUTPUT_FORMAT__,elf
+section ".note.GNU-stack" noalloc noexec nowrite progbits
+%endif
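For reference, here is a minimal scalar sketch of what the sad16_* entry points compute, following the C prototype in the comment block above: the sum of absolute differences between a 16x16 cur block and a 16x16 ref block. The asm handles two rows per SAD_16x16_SSE2 invocation (psadbw + paddusw), eight invocations per call; the sse2 and sse3 variants differ only in the unaligned load passed into the macro (movdqu vs. lddqu, the SSE3 load that is cheaper on some CPUs when a load splits a cache line). The helper name sad16_ref is hypothetical, not part of xvidcore.

#include <stdint.h>
#include <stdlib.h>

/* Scalar equivalent of sad16_sse2/sad16_sse3: sum of absolute
 * differences over a 16x16 block. Both pointers advance by the
 * same stride, as in the asm where cur and ref step 2*stride per
 * SAD_16x16_SSE2 invocation. */
static uint32_t sad16_ref(const uint8_t *cur, const uint8_t *ref,
                          uint32_t stride)
{
    uint32_t sad = 0;
    for (int y = 0; y < 16; y++, cur += stride, ref += stride)
        for (int x = 0; x < 16; x++)
            sad += abs((int)cur[x] - (int)ref[x]);
    return sad;
}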
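dev16_* works in two passes, as the second and third hunks show: pass one runs psadbw against a zeroed xmm7 to sum the block's pixels and derive the mean (the packuswb/pshufd sequence then broadcasts the mean byte across the register), and pass two runs psadbw against that broadcast mean. A scalar sketch under the same caveats: dev16_ref is a hypothetical name, and since the instructions that reduce the pixel sum to a mean fall between the hunks shown here, the truncating divide below is an assumption.

#include <stdint.h>
#include <stdlib.h>

/* Scalar equivalent of dev16_sse2/dev16_sse3: deviation of a
 * 16x16 block from its mean. */
static uint32_t dev16_ref(const uint8_t *cur, uint32_t stride)
{
    const uint8_t *p = cur;
    uint32_t sum = 0, dev = 0;
    for (int y = 0; y < 16; y++, p += stride)   /* pass 1: pixel sum */
        for (int x = 0; x < 16; x++)
            sum += p[x];
    uint8_t mean = (uint8_t)(sum / 256);        /* assumed truncation */
    for (int y = 0; y < 16; y++, cur += stride) /* pass 2: |pixel - mean| */
        for (int x = 0; x < 16; x++)
            dev += abs((int)cur[x] - (int)mean);
    return dev;
}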