--- trunk/xvidcore/src/motion/x86_asm/sad_mmx.asm 2005/02/19 23:20:27 1599 +++ trunk/xvidcore/src/motion/x86_asm/sad_mmx.asm 2008/12/04 14:41:50 1844 @@ -20,41 +20,19 @@ ; * along with this program; if not, write to the Free Software ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ; * -; * $Id: sad_mmx.asm,v 1.17 2005-02-19 23:20:27 suxen_drol Exp $ +; * $Id: sad_mmx.asm,v 1.21 2008-12-04 14:41:50 Isibaar Exp $ ; * ; ***************************************************************************/ -BITS 32 - -%macro cglobal 1 - %ifdef PREFIX - %ifdef MARK_FUNCS - global _%1:function %1.endfunc-%1 - %define %1 _%1:function %1.endfunc-%1 - %else - global _%1 - %define %1 _%1 - %endif - %else - %ifdef MARK_FUNCS - global %1:function %1.endfunc-%1 - %else - global %1 - %endif - %endif -%endmacro +%include "nasm.inc" ;============================================================================= ; Read only data ;============================================================================= -%ifdef FORMAT_COFF -SECTION .rodata -%else -SECTION .rodata align=16 -%endif +DATA -ALIGN 16 +ALIGN SECTION_ALIGN mmx_one: times 4 dw 1 @@ -63,15 +41,15 @@ ;============================================================================= %macro SAD_16x16_MMX 0 - movq mm0, [eax] - movq mm1, [edx] + movq mm0, [_EAX] + movq mm1, [TMP1] - movq mm2, [eax+8] - movq mm3, [edx+8] + movq mm2, [_EAX+8] + movq mm3, [TMP1+8] movq mm4, mm0 psubusb mm0, mm1 - lea eax, [eax+ecx] + lea _EAX, [_EAX+TMP0] movq mm5, mm2 psubusb mm2, mm3 @@ -84,7 +62,7 @@ punpcklbw mm0,mm7 movq mm3, mm2 punpckhbw mm1,mm7 - lea edx, [edx+ecx] + lea TMP1, [TMP1+TMP0] punpcklbw mm2,mm7 paddusw mm0, mm1 punpckhbw mm3,mm7 @@ -95,14 +73,14 @@ %endmacro %macro SAD_8x8_MMX 0 - movq mm0, [eax] - movq mm1, [edx] + movq mm0, [_EAX] + movq mm1, [TMP1] - movq mm2, [eax+ecx] - movq mm3, [edx+ecx] + movq mm2, [_EAX+TMP0] + movq mm3, [TMP1+TMP0] - lea eax,[eax+2*ecx] - lea edx,[edx+2*ecx] + lea _EAX,[_EAX+2*TMP0] + lea TMP1,[TMP1+2*TMP0] movq mm4, mm0 psubusb mm0, mm1 @@ -128,16 +106,16 @@ %macro SADV_16x16_MMX 0 - movq mm0, [eax] - movq mm1, [edx] + movq mm0, [_EAX] + movq mm1, [TMP1] - movq mm2, [eax+8] + movq mm2, [_EAX+8] movq mm4, mm0 - movq mm3, [edx+8] + movq mm3, [TMP1+8] psubusb mm0, mm1 psubusb mm1, mm4 - lea eax,[eax+ecx] + lea _EAX,[_EAX+TMP0] por mm0, mm1 movq mm4, mm2 @@ -155,19 +133,19 @@ punpckhbw mm3,mm7 paddusw mm5, mm0 paddusw mm2,mm3 - lea edx,[edx+ecx] + lea TMP1,[TMP1+TMP0] paddusw mm6, mm2 %endmacro %macro SADBI_16x16_MMX 2 ; SADBI_16x16_MMX( int_ptr_offset, bool_increment_ptr ); - movq mm0, [edx+%1] - movq mm2, [ebx+%1] + movq mm0, [TMP1+%1] + movq mm2, [_EBX+%1] movq mm1, mm0 movq mm3, mm2 %if %2 != 0 - add edx, ecx + add TMP1, TMP0 %endif punpcklbw mm0, mm7 @@ -176,7 +154,7 @@ punpckhbw mm3, mm7 %if %2 != 0 - add ebx, ecx + add _EBX, TMP0 %endif paddusw mm0, mm2 ; mm01 = ref1 + ref2 @@ -186,13 +164,13 @@ psrlw mm0, 1 ; mm01 >>= 1 psrlw mm1, 1 - movq mm2, [eax+%1] + movq mm2, [_EAX+%1] movq mm3, mm2 punpcklbw mm2, mm7 ; mm23 = src punpckhbw mm3, mm7 %if %2 != 0 - add eax, ecx + add _EAX, TMP0 %endif movq mm4, mm0 @@ -210,9 +188,9 @@ %endmacro %macro MEAN_16x16_MMX 0 - movq mm0, [eax] - movq mm2, [eax+8] - lea eax, [eax+ecx] + movq mm0, [_EAX] + movq mm2, [_EAX+8] + lea _EAX, [_EAX+TMP0] movq mm1, mm0 punpcklbw mm0, mm7 movq mm3, mm2 @@ -226,9 +204,9 @@ %endmacro %macro ABS_16x16_MMX 0 - movq mm0, [eax] - movq mm2, [eax+8] - lea eax, [eax+ecx] + movq mm0, [_EAX] + movq mm2, [_EAX+8] + lea _EAX, [_EAX+TMP0] movq mm1, mm0 movq mm3, mm2 punpcklbw mm0, mm7 @@ -264,7 +242,7 @@ ; Code ;============================================================================= -SECTION .text +TEXT cglobal sad16_mmx cglobal sad16v_mmx @@ -286,12 +264,12 @@ ; ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN sad16_mmx: - mov eax, [esp+ 4] ; Src1 - mov edx, [esp+ 8] ; Src2 - mov ecx, [esp+12] ; Stride + mov _EAX, prm1 ; Src1 + mov TMP1, prm2 ; Src2 + mov TMP0, prm3 ; Stride pxor mm6, mm6 ; accum pxor mm7, mm7 ; zero @@ -322,7 +300,7 @@ movd eax, mm6 ret -.endfunc +ENDFUNC ;----------------------------------------------------------------------------- ; @@ -332,12 +310,12 @@ ; ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN sad8_mmx: - mov eax, [esp+ 4] ; Src1 - mov edx, [esp+ 8] ; Src2 - mov ecx, [esp+12] ; Stride + mov _EAX, prm1 ; Src1 + mov TMP1, prm2 ; Src2 + mov TMP0, prm3 ; Stride pxor mm6, mm6 ; accum pxor mm7, mm7 ; zero @@ -355,7 +333,7 @@ movd eax, mm6 ret -.endfunc +ENDFUNC ;----------------------------------------------------------------------------- ; @@ -366,16 +344,20 @@ ; ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN sad16v_mmx: - push ebx - push edi - - mov eax, [esp + 8 + 4] ; Src1 - mov edx, [esp + 8 + 8] ; Src2 - mov ecx, [esp + 8 + 12] ; Stride - mov ebx, [esp + 8 + 16] ; sad ptr + mov _EAX, prm1 ; Src1 + mov TMP1, prm2 ; Src2 + mov TMP0, prm3 ; Stride + + push _EBX + push _EDI +%ifdef ARCH_IS_X86_64 + mov _EBX, prm4 +%else + mov _EBX, [_ESP + 8 + 16] ; sad ptr +%endif pxor mm5, mm5 ; accum pxor mm6, mm6 ; accum @@ -402,8 +384,8 @@ paddd mm5, mm2 paddd mm6, mm3 - movd [ebx], mm5 - movd [ebx + 4], mm6 + movd [_EBX], mm5 + movd [_EBX + 4], mm6 paddd mm5, mm6 @@ -433,20 +415,20 @@ paddd mm5, mm2 paddd mm6, mm3 - movd [ebx + 8], mm5 - movd [ebx + 12], mm6 + movd [_EBX + 8], mm5 + movd [_EBX + 12], mm6 paddd mm5, mm6 movd eax, mm5 - add eax, edi + add _EAX, _EDI - pop edi - pop ebx + pop _EDI + pop _EBX ret -.endfunc +ENDFUNC ;----------------------------------------------------------------------------- ; @@ -457,17 +439,22 @@ ; ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN sad16bi_mmx: - push ebx - mov eax, [esp+4+ 4] ; Src - mov edx, [esp+4+ 8] ; Ref1 - mov ebx, [esp+4+12] ; Ref2 - mov ecx, [esp+4+16] ; Stride + mov _EAX, prm1 ; Src + mov TMP1, prm2 ; Ref1 + mov TMP0, prm4 ; Stride + + push _EBX +%ifdef ARCH_IS_X86_64 + mov _EBX, prm3 ; Ref2 +%else + mov _EBX, [_ESP+4+12] ; Ref2 +%endif pxor mm6, mm6 ; accum2 pxor mm7, mm7 -.Loop +.Loop: SADBI_16x16_MMX 0, 0 SADBI_16x16_MMX 8, 1 SADBI_16x16_MMX 0, 0 @@ -508,10 +495,10 @@ paddd mm6, mm7 movd eax, mm6 - pop ebx + pop _EBX ret -.endfunc +ENDFUNC ;----------------------------------------------------------------------------- ; @@ -522,17 +509,22 @@ ; ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN sad8bi_mmx: - push ebx - mov eax, [esp+4+ 4] ; Src - mov edx, [esp+4+ 8] ; Ref1 - mov ebx, [esp+4+12] ; Ref2 - mov ecx, [esp+4+16] ; Stride + mov _EAX, prm1 ; Src + mov TMP1, prm2 ; Ref1 + mov TMP0, prm4 ; Stride + + push _EBX +%ifdef ARCH_IS_X86_64 + mov _EBX, prm3 +%else + mov _EBX, [_ESP+4+12] ; Ref2 +%endif pxor mm6, mm6 ; accum2 pxor mm7, mm7 -.Loop +.Loop: SADBI_16x16_MMX 0, 1 SADBI_16x16_MMX 0, 1 SADBI_16x16_MMX 0, 1 @@ -548,9 +540,9 @@ paddd mm6, mm7 movd eax, mm6 - pop ebx + pop _EBX ret -.endfunc +ENDFUNC ;----------------------------------------------------------------------------- ; @@ -559,10 +551,10 @@ ; ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN dev16_mmx: - mov eax, [esp+ 4] ; Src - mov ecx, [esp+ 8] ; Stride + mov _EAX, prm1 ; Src + mov TMP0, prm2 ; Stride pxor mm7, mm7 ; zero pxor mm5, mm5 ; accum1 @@ -602,7 +594,7 @@ ; mm5 is the new accum pxor mm5, mm5 - mov eax, [esp+ 4] ; Src + mov _EAX, prm1 ; Src ABS_16x16_MMX ABS_16x16_MMX @@ -630,7 +622,7 @@ movd eax, mm6 ret -.endfunc +ENDFUNC ;----------------------------------------------------------------------------- ; @@ -652,22 +644,20 @@ %endmacro sse8_16bit_mmx: - push esi - push edi ;; Load the function params - mov esi, [esp+8+4] - mov edi, [esp+8+8] - mov edx, [esp+8+12] + mov _EAX, prm1 + mov TMP0, prm2 + mov TMP1, prm3 ;; Reset the sse accumulator pxor mm2, mm2 ;; Let's go %rep 8 - ROW_SSE_16bit_MMX esi, edi - lea esi, [esi+edx] - lea edi, [edi+edx] + ROW_SSE_16bit_MMX _EAX, TMP0 + lea _EAX, [_EAX+TMP1] + lea TMP0, [TMP0+TMP1] %endrep ;; Finish adding each dword of the accumulator @@ -677,10 +667,8 @@ movd eax, mm2 ;; All done - pop edi - pop esi ret -.endfunc +ENDFUNC ;----------------------------------------------------------------------------- ; @@ -714,13 +702,11 @@ %endmacro sse8_8bit_mmx: - push esi - push edi ;; Load the function params - mov esi, [esp+8+4] - mov edi, [esp+8+8] - mov edx, [esp+8+12] + mov _EAX, prm1 + mov TMP0, prm2 + mov TMP1, prm3 ;; Reset the sse accumulator pxor mm6, mm6 @@ -730,9 +716,9 @@ ;; Let's go %rep 8 - ROW_SSE_8bit_MMX esi, edi - lea esi, [esi+edx] - lea edi, [edi+edx] + ROW_SSE_8bit_MMX _EAX, TMP0 + lea _EAX, [_EAX+TMP1] + lea TMP0, [TMP0+TMP1] %endrep ;; Finish adding each dword of the accumulator @@ -742,8 +728,11 @@ movd eax, mm6 ;; All done - pop edi - pop esi ret -.endfunc +ENDFUNC + + +%ifidn __OUTPUT_FORMAT__,elf +section ".note.GNU-stack" noalloc noexec nowrite progbits +%endif