--- trunk/xvidcore/src/image/x86_asm/postprocessing_sse2.asm	2008/11/14 15:43:28	1794
+++ trunk/xvidcore/src/image/x86_asm/postprocessing_sse2.asm	2008/11/26 01:04:34	1795
@@ -22,39 +22,13 @@
 ; *
 ; *************************************************************************/
 
-BITS 32
-
-%macro cglobal 1
-	%ifdef PREFIX
-		%ifdef MARK_FUNCS
-			global _%1:function %1.endfunc-%1
-			%define %1 _%1:function %1.endfunc-%1
-			%define ENDFUNC .endfunc
-		%else
-			global _%1
-			%define %1 _%1
-			%define ENDFUNC
-		%endif
-	%else
-		%ifdef MARK_FUNCS
-			global %1:function %1.endfunc-%1
-			%define ENDFUNC .endfunc
-		%else
-			global %1
-			%define ENDFUNC
-		%endif
-	%endif
-%endmacro
+%include "nasm.inc"
 
 ;===========================================================================
 ; read only data
 ;===========================================================================
 
-%ifdef FORMAT_COFF
-SECTION .rodata
-%else
-SECTION .rodata align=16
-%endif
+DATA
 
 xmm_0x80:
 	times 16 db 0x80
@@ -63,7 +37,7 @@
 ; Code
 ;=============================================================================
 
-SECTION .text
+SECTION .rotext align=SECTION_ALIGN
 
 cglobal image_brightness_sse2
 
@@ -90,34 +64,40 @@
 	mov [%1 + 15], %2
 %endmacro
 
-ALIGN 16
+ALIGN SECTION_ALIGN
 image_brightness_sse2:
-  push esi
-  push edi               ; 8 bytes offset for push
-  sub esp, 32            ; 32 bytes for local data (16 bytes will be used, 16 bytes more to align correctly mod 16)
+  mov eax, prm5d         ; brightness offset value
+  mov TMP1, prm1         ; Dst
+  mov TMP0, prm2         ; stride
+
+  push _ESI
+  push _EDI              ; 8 bytes offset for push
+  sub _ESP, 32           ; 32 bytes for local data (16 bytes will be used, 16 bytes more to align correctly mod 16)
 
   movdqa xmm6, [xmm_0x80]
 
   ; Create a offset...offset vector
-  mov eax, [esp+8+32+20] ; brightness offset value
-  mov edx, esp           ; edx will be esp aligned mod 16
-  add edx, 15            ; edx = esp + 15
-  and edx, ~15           ; edx = (esp + 15)&(~15)
-  CREATE_OFFSET_VECTOR edx, al
-  movdqa xmm7, [edx]
-
-  mov edx, [esp+8+32+4]  ; Dst
-  mov ecx, [esp+8+32+8]  ; stride
-  mov esi, [esp+8+32+12] ; width
-  mov edi, [esp+8+32+16] ; height
+  mov _ESI, _ESP         ; _ESI will be esp aligned mod 16 (TMP1 must stay = Dst)
+  add _ESI, 15           ; _ESI = esp + 15
+  and _ESI, ~15          ; _ESI = (esp + 15)&(~15)
+  CREATE_OFFSET_VECTOR _ESI, al
+  movdqa xmm7, [_ESI]
+
+%ifdef ARCH_IS_X86_64
+  mov _ESI, prm3         ; width
+  mov _EDI, prm4         ; height
+%else
+  mov _ESI, [_ESP+8+32+12] ; width
+  mov _EDI, [_ESP+8+32+16] ; height
+%endif
 
 .yloop:
-  xor eax, eax
+  xor _EAX, _EAX
 
 .xloop:
-  movdqa xmm0, [edx + eax]
-  movdqa xmm1, [edx + eax + 16]   ; xmm0 = [dst]
+  movdqa xmm0, [TMP1 + _EAX]
+  movdqa xmm1, [TMP1 + _EAX + 16] ; xmm0 = [dst]
 
   paddb xmm0, xmm6                ; unsigned -> signed domain
   paddb xmm1, xmm6
 
@@ -126,20 +106,20 @@
   psubb xmm0, xmm6
   psubb xmm1, xmm6                ; signed -> unsigned domain
 
-  movdqa [edx + eax], xmm0
-  movdqa [edx + eax + 16], xmm1   ; [dst] = xmm0
+  movdqa [TMP1 + _EAX], xmm0
+  movdqa [TMP1 + _EAX + 16], xmm1 ; [dst] = xmm0
 
-  add eax, 32
-  cmp eax, esi
+  add _EAX, 32
+  cmp _EAX, _ESI
   jl .xloop
 
-  add edx, ecx                    ; dst += stride
-  sub edi, 1
+  add TMP1, TMP0                  ; dst += stride
+  sub _EDI, 1
   jg .yloop
 
-  add esp, 32
-  pop edi
-  pop esi
+  add _ESP, 32
+  pop _EDI
+  pop _ESI
 
   ret
 ENDFUNC
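
For reference, the routine being ported implements a per-pixel brightness adjustment with unsigned saturation. Below is a minimal scalar C sketch of the same operation; the parameter order follows the prm1..prm5 loads in the listing above (Dst, stride, width, height, brightness offset), but the exact C-level prototype and the name image_brightness_ref are illustrative assumptions, not taken from the xvidcore headers.

/* Hypothetical scalar equivalent of image_brightness_sse2: add a signed
 * brightness offset to every pixel byte, clamping the result to [0, 255].
 * Prototype inferred from the prm1..prm5 usage in the diff above. */
#include <stdint.h>

static void image_brightness_ref(uint8_t *dst, int stride,
                                 int width, int height, int offset)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int v = dst[x] + offset;   /* may fall outside [0, 255] */
            dst[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
        dst += stride;                 /* advance one row */
    }
}

Unlike this reference loop, the SSE2 version processes 32 bytes per .xloop iteration (add _EAX, 32), so it additionally assumes the width is a multiple of 32 and that each row of Dst is 16-byte aligned, since movdqa faults on unaligned addresses.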
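
The paddb/psubb pair with the xmm_0x80 constant is the standard bias trick for applying a signed offset to unsigned bytes with saturation: adding 0x80 to every byte maps unsigned pixels into the signed domain, a signed saturating add applies the offset vector built by CREATE_OFFSET_VECTOR, and subtracting 0x80 maps back. The saturating add itself sits in the unchanged middle of the loop that the diff does not show, so its exact form (presumably paddsb against xmm7) is an inference. A sketch of the sequence with SSE2 intrinsics, assuming the offset byte has already been broadcast to all 16 lanes:

#include <emmintrin.h>

/* Mirrors the presumed paddb / paddsb / psubb sequence for one 16-byte
 * vector of pixels; `offsets` holds the brightness offset in every byte,
 * like the vector CREATE_OFFSET_VECTOR builds on the stack. */
static __m128i brightness_16px(__m128i pixels, __m128i offsets)
{
    const __m128i bias = _mm_set1_epi8((char)0x80);
    __m128i s = _mm_add_epi8(pixels, bias);  /* unsigned -> signed domain (paddb) */
    s = _mm_adds_epi8(s, offsets);           /* saturating signed add (paddsb)    */
    return _mm_sub_epi8(s, bias);            /* signed -> unsigned domain (psubb) */
}

Biasing by 0x80 shifts the clamping points so that signed saturation at -128/+127 lands exactly on the unsigned limits 0/255 once the bias is removed.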