[svn] / tags / branch-release-1-0 / xvidcore / src / motion / x86_asm / sad_3dne.asm Repository:
ViewVC logotype

View of /tags/branch-release-1-0/xvidcore/src/motion/x86_asm/sad_3dne.asm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 908 - (download) (annotate)
Thu Mar 6 21:12:04 2003 UTC (16 years ago)
File size: 10285 byte(s)
This commit was manufactured by cvs2svn to create tag 'branch-release-1-0'.
;/**************************************************************************
; *
; *	XVID MPEG-4 VIDEO CODEC
; *	xmm sum of absolute difference
; *
; *	This program is an implementation of a part of one or more MPEG-4
; *	Video tools as specified in ISO/IEC 14496-2 standard.  Those intending
; *	to use this software module in hardware or software products are
; *	advised that its use may infringe existing patents or copyrights, and
; *	any such use would be at such party's own risk.  The original
; *	developer of this software module and his/her company, and subsequent
; *	editors and their companies, will have no liability for use of this
; *	software or modifications or derivatives thereof.
; *
; *	This program is free software; you can redistribute it and/or modify
; *	it under the terms of the GNU General Public License as published by
; *	the Free Software Foundation; either version 2 of the License, or
; *	(at your option) any later version.
; *
; *	This program is distributed in the hope that it will be useful,
; *	but WITHOUT ANY WARRANTY; without even the implied warranty of 
; *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *	GNU General Public License for more details.
; *
; *	You should have received a copy of the GNU General Public License
; *	along with this program; if not, write to the Free Software
; *	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; *
; *************************************************************************/
;
; These 3dne functions are compatible with iSSE, but are optimized
; specifically for K7 (Athlon) pipelines.
;
;------------------------------------------------------------------------------
; 09.12.2002  Athlon optimizations contributed by Jaan Kalda 
;------------------------------------------------------------------------------

bits 32

; cglobal <symbol>
; Exports <symbol>. When PREFIX is defined, the exported name carries a
; leading underscore and <symbol> is %define'd to that underscored name, so
; later uses of the plain identifier (including its label) resolve to it.
%macro cglobal 1
	%ifndef PREFIX
		global %1
	%else
		global _%1
		%define %1 _%1
	%endif
%endmacro

; Data section. When FORMAT_COFF is defined the section directive is written
; without the align= qualifier; otherwise 16-byte section alignment is
; requested explicitly.
%ifdef FORMAT_COFF
section .data data
%else
section .data data align=16
%endif

align 16
; Four 16-bit words, each equal to 1, placed on a 16-byte boundary.
; NOTE(review): none of the routines visible in this file reference mmx_one.
mmx_one	times 4	dw 1

section .text

; Entry points exported from this file. Each name goes through the cglobal
; macro, which exports it as-is or with a leading underscore when PREFIX
; is defined.
cglobal  sad16_3dne
cglobal  sad8_3dne
cglobal  sad16bi_3dne
cglobal  sad8bi_3dne
cglobal  dev16_3dne

;===========================================================================
;
; uint32_t sad16_3dne(const uint8_t * const cur,
;					const uint8_t * const ref,
;					const uint32_t stride,
;					const uint32_t best_sad);
;
;===========================================================================
; optimization: 21% faster
; SAD_16x16_SSE <pass>
;
; Computes the sum of absolute differences for four consecutive 16-byte rows.
;
; In:    eax = first block pointer, edx = second block pointer,
;        ecx = stride, ebx = 3*stride (all set up by sad16_3dne).
;        For pass != 0: mm1 and mm5 hold the two partial sums left behind by
;        the previous expansion of this macro.
; Out:   mm5 = SAD of rows 0 and 1, mm1 = SAD of rows 2 and 3 (left separate;
;        the next pass or the caller adds them together).
;        eax and edx are advanced by 4*stride.
; Stack: pass 0 reserves 12 bytes (sub esp,12). A non-zero pass first adds
;        mm5 into mm1 and then stores that previous-pass total as a dword
;        at [esp+4*(pass-1)].
; Clobb: mm0-mm7.
;
; The loads, psadbw operations and stack spill are interleaved by hand; the
; spill and the paddd of the previous totals are placed just before mm5 and
; mm1 are overwritten with new row data.
%macro SAD_16x16_SSE 1
    ; row 0: low 8 bytes -> mm7, high 8 bytes -> mm6
    movq mm7, [eax]    
    movq mm6, [eax+8]
    psadbw mm7, [edx]
    psadbw mm6, [edx+8]
%if (%1) 
	; fold the previous pass: mm1 = (rows 2+3) + (rows 0+1), before mm5 is reloaded
	paddd mm1,mm5
%endif
    ; row 1: low -> mm5, high -> mm4
    movq mm5, [eax+ecx]
    movq mm4, [eax+ecx+8]
    psadbw mm5, [edx+ecx]
    psadbw mm4, [edx+ecx+8]
    ; row 2: low -> mm3, high -> mm2
    movq mm3, [eax+2*ecx]
    movq mm2, [eax+2*ecx+8]
    psadbw mm3, [edx+2*ecx]
    psadbw mm2, [edx+2*ecx+8]
%if (%1) 
	; spill the previous pass total before mm1 is reloaded below
	movd [esp+4*(%1-1)],mm1
%else 
    ; first pass: reserve three dword slots for the totals of passes 0..2
    sub	esp,byte 12
%endif
    ; row 3 (ebx = 3*stride): low -> mm1, high -> mm0
    movq mm1, [eax+ebx]
    movq mm0, [eax+ebx+8]
    psadbw mm1, [edx+ebx]
    psadbw mm0, [edx+ebx+8]
    ; step both pointers down four rows
    lea eax,[eax+4*ecx]
    lea edx,[edx+4*ecx]
    ; reduce: per-row sums first, then pair the rows
    paddd mm7,mm6
    paddd mm5,mm4
    paddd mm3,mm2
    paddd mm1,mm0
    paddd mm5,mm7
    paddd mm1,mm3
%endmacro
 
; sad16_3dne: sum of absolute differences over a 16x16 byte block.
;
; Stack arguments (read relative to esp on entry):
;   [esp+ 4] first block pointer, [esp+ 8] second block pointer,
;   [esp+12] stride.
; The prototype comment above lists a fourth argument (best_sad); this
; routine never reads it.
; Out:   eax = total SAD of the 16 rows.
; Clobb: ecx, edx, mm0-mm7, flags. ebx is saved and restored.
; The routine does not execute emms.
align 16
sad16_3dne:

    mov eax, [esp+ 4] ; Src1
    mov edx, [esp+ 8] ; Src2
    mov ecx, [esp+12] ; Stride
    push ebx          ; preserve the caller's ebx
    lea	ebx,[2*ecx+ecx] ; ebx = 3*stride, used by the macro to address row 3
    ; Four passes of four rows each. Pass 0 reserves 12 bytes of stack;
    ; passes 1..3 store the totals of passes 0..2 at [esp], [esp+4], [esp+8].
    SAD_16x16_SSE 0
    SAD_16x16_SSE 1
    SAD_16x16_SSE 2
    SAD_16x16_SSE 3
    ; ecx = total of passes 0..2, read back from the stack slots
    mov	ecx,[esp]
    add ecx,[esp+4]
    add ecx,[esp+8]    
	; mm1 = total of pass 3 (rows 2+3 plus rows 0+1 of that pass)
	paddd mm1,mm5
    ; the saved ebx sits just above the 12-byte scratch area
    mov	ebx,[esp+12]
    ; release the scratch area (12) and the saved-ebx slot (4)
    add esp,byte 4+12
    movd eax, mm1
    add eax,ecx
    ret


;===========================================================================
;
; uint32_t sad8_3dne(const uint8_t * const cur,
;					const uint8_t * const ref,
;					const uint32_t stride);
;
;===========================================================================
; sad8_3dne: sum of absolute differences over an 8x8 byte block.
;
; Stack arguments: [esp+4] first block pointer, [esp+8] second block pointer,
;                  [esp+12] stride.
; Out:   eax = total SAD of the 8 rows.
; Clobb: ecx, edx, mm0-mm7, flags. ebx is saved and restored.
; The routine does not execute emms.
;
; Rows are addressed directly from the two base pointers using multiples of
; the stride held in ecx/ebx; the pointers themselves are never advanced.
; The [byte ...] forms make NASM emit an explicit 8-bit displacement
; (presumably to control instruction length/alignment -- TODO confirm).
align 16
sad8_3dne:

    mov eax, [esp+ 4] ; Src1
    mov ecx, [esp+12] ; Stride
    mov edx, [esp+ 8] ; Src2
    push ebx          ; preserve the caller's ebx
    lea ebx, [ecx+2*ecx] ; ebx = 3*stride
    
    movq mm0, [byte eax] ; row 0
    psadbw mm0, [byte edx]
    movq mm1, [eax+ecx] ; row 1
    psadbw mm1, [edx+ecx]

    movq mm2, [eax+2*ecx] ; row 2
    psadbw mm2, [edx+2*ecx]
    movq mm3, [eax+ebx] ; row 3
    psadbw mm3, [edx+ebx]

    paddd mm0,mm1 ; mm0 = rows 0+1

    movq mm4, [byte eax+4*ecx] ; row 4
    psadbw mm4, [edx+4*ecx]
    movq mm5, [eax+2*ebx] ; row 6 (2 * 3*stride)
    psadbw mm5, [edx+2*ebx]

    paddd mm2,mm3 ; mm2 = rows 2+3
    paddd mm0,mm2 ; mm0 = rows 0..3

    lea ebx, [ebx+4*ecx] ; ebx = 3*stride + 4*stride = 7*stride
    lea ecx,[ecx+4*ecx] ; ecx = 5*stride
    movq mm6, [eax+ecx] ; row 5
    psadbw mm6, [edx+ecx]
    movq mm7, [eax+ebx] ; row 7
    psadbw mm7, [edx+ebx]
    paddd mm4,mm5    ; mm4 = rows 4+6
    paddd mm6,mm7 ; mm6 = rows 5+7
    paddd mm0,mm4
    mov ebx,[esp] ; restore ebx ...
    add esp,byte 4 ; ... and drop its stack slot
    paddd mm0,mm6 ; mm0 = all eight rows
    movd eax, mm0

    ret


;===========================================================================
;
; uint32_t sad16bi_3dne(const uint8_t * const cur,
;					const uint8_t * const ref1,
;					const uint8_t * const ref2,
;					const uint32_t stride);
;
;===========================================================================
;optimization: 14% faster
; SADBI_16x16_SSE0
;
; Opening step of the 16-wide bidirectional SAD: processes the first two rows.
; For each row, the bytes at edx and ebx are averaged with pavgb (rounded
; unsigned byte average) and the row at eax is compared against that average
; with psadbw.
;
; In:    eax, edx, ebx = the three row pointers (sad16bi_3dne loads them
;        from its Src, Ref1 and Ref2 arguments), ecx = stride.
; Out:   mm5/mm6 = SAD of row 0 (low/high 8 bytes),
;        mm0/mm1 = SAD of row 1 (low/high 8 bytes), not yet accumulated.
;        eax, edx and ebx are each advanced by 2*stride.
; Clobb: mm2, mm3, flags.
;
; The [byte reg] forms make NASM emit an explicit 8-bit displacement
; (presumably for instruction length/alignment -- TODO confirm).
%macro SADBI_16x16_SSE0 0
    ; row 0: load both halves from edx
    movq mm2, [edx]
    movq mm3, [edx+8]

    movq mm5, [byte eax]
    movq mm6, [eax+8]
    ; average with the matching bytes at ebx
    pavgb mm2, [byte ebx]
    pavgb mm3, [ebx+8]
    
    add edx, ecx    
    ; mm5/mm6 = SAD of the eax row against the averaged row
    psadbw mm5, mm2
    psadbw mm6, mm3

    add eax, ecx
    add ebx, ecx
    ; row 1: same sequence, results go to mm0/mm1
    movq mm2, [byte edx]

    movq mm3, [edx+8]
    movq mm0, [byte eax]

    movq mm1, [eax+8]
    pavgb mm2, [byte ebx]

    pavgb mm3, [ebx+8]
    add edx, ecx
    add eax, ecx

    add ebx, ecx
    psadbw mm0, mm2
    psadbw mm1, mm3

%endmacro
; SADBI_16x16_SSE
;
; One further row of the 16-wide bidirectional SAD.
;
; In:    eax, edx, ebx = current row pointers, ecx = stride,
;        mm5/mm6 = running totals, mm0/mm1 = SAD of the previous row that
;        has not been accumulated yet.
; Out:   mm5/mm6 += previous mm0/mm1 (paddusw, unsigned saturating words),
;        mm0/mm1 = SAD of this row (low/high 8 bytes) against the pavgb
;        average of the rows at edx and ebx.
;        eax, edx and ebx are each advanced by one stride.
; Clobb: mm2, mm3, flags.
%macro SADBI_16x16_SSE 0
    movq mm2, [byte edx]
    movq mm3, [edx+8]
    ; accumulate the previous row while the loads are in flight
    paddusw mm5,mm0
    paddusw mm6,mm1
    movq mm0, [eax]
    movq mm1, [eax+8] 
    ; mm2/mm3 = rounded average of the edx row and the ebx row
    pavgb mm2, [ebx]    
    pavgb mm3, [ebx+8]
    add edx, ecx
    add eax, ecx
    add ebx, ecx
    ; mm0/mm1 = SAD of the eax row against that average
    psadbw mm0, mm2
    psadbw mm1, mm3    
%endmacro

; sad16bi_3dne: SAD of a 16x16 block against the rounded average (pavgb)
; of two reference blocks.
;
; Stack arguments: [esp+4] cur, [esp+8] ref1, [esp+12] ref2, [esp+16] stride.
; Out:   eax = low dword of the accumulated SAD register.
; Clobb: ecx, edx, mm0-mm3, mm5, mm6, flags. ebx is saved and restored.
; The routine does not execute emms.
align 16
sad16bi_3dne:
    mov     eax, [esp+ 4]           ; cur
    mov     edx, [esp+ 8]           ; ref1
    mov     ecx, [esp+16]           ; stride
    push    ebx
    mov     ebx, [esp+4+12]         ; ref2 (offset +4 accounts for the push)

    ; rows 0 and 1
    SADBI_16x16_SSE0

    ; rows 2..15: each expansion accumulates the previous row and starts
    ; the next one
%rep 14
    SADBI_16x16_SSE
%endrep

    ; accumulate the last row, which is still pending in mm0/mm1
    paddusw mm5, mm0
    paddusw mm6, mm1

    pop     ebx
    paddusw mm6, mm5                ; combine low-half and high-half totals
    movd    eax, mm6
    ret
;=========================================================================== 
; 
; uint32_t sad8bi_3dne(const uint8_t * const cur, 
; const uint8_t * const ref1, 
; const uint8_t * const ref2, 
; const uint32_t stride); 
; 
;=========================================================================== 

; SADBI_8x8_3dne
;
; One two-row step of the 8-wide bidirectional SAD.
;
; In:    edx, eax = pointers to the two rows that are averaged (pavgb),
;        ebx = pointer to the rows compared against that average,
;        ecx = stride,
;        mm5/mm6 = running totals, mm0/mm1 = SADs of the previous two rows
;        that have not been accumulated yet.
; Out:   mm5/mm6 += previous mm0/mm1 (paddusw),
;        mm0/mm1 = SADs of the two rows handled here.
;        edx, eax and ebx are each advanced by 2*stride.
; Clobb: mm2, mm3.
;
; NOTE(review): nothing in the visible part of this file invokes this macro;
; sad8bi_3dne spells the same instruction sequence out by hand.
%macro SADBI_8x8_3dne 0 
   ; mm2/mm3 = rounded byte average of the edx rows and the eax rows
   movq mm2, [edx] 
   movq mm3, [edx+ecx] 
   pavgb mm2, [eax] 
   pavgb mm3, [eax+ecx] 
   lea edx, [edx+2*ecx] 
   lea eax, [eax+2*ecx] 
   ; accumulate the previous pair before mm0/mm1 are reloaded
   paddusw mm5,mm0 
   paddusw mm6,mm1 
   movq mm0, [ebx] 
   movq mm1, [ebx+ecx] 
   lea ebx, [ebx+2*ecx] 
   ; mm0/mm1 = SAD of the ebx rows against the averaged rows
   psadbw mm0, mm2 
   psadbw mm1, mm3 
%endmacro 

; sad8bi_3dne: SAD of an 8x8 block against the rounded average (pavgb)
; of two reference blocks.
;
; Stack arguments: [esp+4] cur, [esp+8] ref1, [esp+12] ref2, [esp+16] stride.
; Out:   eax = low dword of the accumulated SAD register.
; Clobb: ecx, edx, mm0-mm3, mm5, mm6, flags. ebx is saved and restored.
; The routine does not execute emms.
;
; Rows are handled two at a time: even rows accumulate in mm5, odd rows
; in mm6, and the two totals are added at the end.
align 16
sad8bi_3dne:
    mov     eax, [esp+12]           ; ref2
    mov     edx, [esp+ 8]           ; ref1
    mov     ecx, [esp+16]           ; stride
    push    ebx
    mov     ebx, [esp+4+ 4]         ; cur (offset +4 accounts for the push)

    ; rows 0/1: results seed the running totals mm5/mm6
    movq    mm2, [edx]
    movq    mm3, [edx+ecx]
    pavgb   mm2, [eax]
    pavgb   mm3, [eax+ecx]
    lea     edx, [edx+2*ecx]
    lea     eax, [eax+2*ecx]
    movq    mm5, [ebx]
    movq    mm6, [ebx+ecx]
    lea     ebx, [ebx+2*ecx]
    psadbw  mm5, mm2
    psadbw  mm6, mm3

    ; rows 2/3: results stay pending in mm0/mm1
    movq    mm2, [edx]
    movq    mm3, [edx+ecx]
    pavgb   mm2, [eax]
    pavgb   mm3, [eax+ecx]
    lea     edx, [edx+2*ecx]
    lea     eax, [eax+2*ecx]
    movq    mm0, [ebx]
    movq    mm1, [ebx+ecx]
    lea     ebx, [ebx+2*ecx]
    psadbw  mm0, mm2
    psadbw  mm1, mm3

    ; rows 4/5: accumulate the pending pair and compute the next one
    SADBI_8x8_3dne

    ; rows 6/7: last pair, so the pointers are not advanced any further
    movq    mm2, [edx]
    movq    mm3, [edx+ecx]
    pavgb   mm2, [eax]
    pavgb   mm3, [eax+ecx]
    paddusw mm5, mm0
    paddusw mm6, mm1
    movq    mm0, [ebx]
    movq    mm1, [ebx+ecx]
    psadbw  mm0, mm2
    psadbw  mm1, mm3
    paddusw mm5, mm0
    paddusw mm6, mm1

    paddusw mm6, mm5                ; even-row total + odd-row total
    mov     ebx, [esp]              ; restore ebx ...
    add     esp, byte 4             ; ... and drop its stack slot
    movd    eax, mm6
    ret


;===========================================================================
;
; uint32_t dev16_3dne(const uint8_t * const cur,
;					const uint32_t stride);
;
;===========================================================================
; optimization: 25 % faster
; ABS_16x16_SSE <stage>
;
; Building block for dev16_3dne: accumulates psadbw of 16-byte rows against
; the constant held in mm4 (all zero bytes on the first pass, so the result
; is a plain byte sum; the broadcast mean on the second pass).
;
; Common inputs: eax = current row pointer, ecx = stride, edx = 3*stride,
;                mm4 = 8 bytes to compare every row against.
; Six accumulators are used: mm7, mm6, mm5, mm3, mm2, mm1. mm0 is a temporary
; that also carries a row loaded ahead of time from one stage to the next.
;
;   stage 0: initialises the six accumulators from rows 0..2 (low/high
;            halves), preloads the low half of row 3 into mm0 and advances
;            eax by 3*stride.
;   stage 1: finishes the preloaded row into mm7/mm6, adds the next two rows
;            into mm5/mm3 and mm2/mm1, advances eax by 3*stride and preloads
;            the low half of the following row into mm0.
;   stage 2: finishes the preloaded row into mm7/mm6; no further loads ahead.
;
; "mov esi,esi" has no architectural effect, and the [byte ...] / [dword ...]
; forms make NASM emit an explicit displacement of that size; both are
; presumably instruction-length padding for decode alignment -- TODO confirm.
%macro ABS_16x16_SSE 1
%if (%1 == 0)
    ; row 0 -> mm7 (low), mm6 (high)
    movq mm7, [eax]
    psadbw mm7, mm4
    mov	esi,esi
    movq mm6, [eax+8]
    ; row 1 -> mm5 (low), mm3 (high)
    movq mm5, [eax+ecx]
    movq mm3, [eax+ecx+8]
    psadbw mm6, mm4

    ; row 2 -> mm2 (low), mm1 (high)
    movq mm2, [byte eax+2*ecx]
    psadbw mm5, mm4
    movq mm1, [eax+2*ecx+8]
    psadbw mm3, mm4

    ; preload row 3 (low half); its psadbw is done by the next stage
    movq mm0, [dword eax+edx]
    psadbw mm2, mm4
    add	eax,edx
    psadbw mm1, mm4
%endif
%if (%1 == 1)    
    ; finish the preloaded row: low half into mm7, high half into mm6
    psadbw mm0, mm4
    paddd mm7, mm0
    movq mm0, [eax+8]
    psadbw mm0, mm4
    paddd mm6, mm0

    ; next row -> mm5 (low), mm3 (high)
    movq mm0, [byte eax+ecx]
    psadbw mm0, mm4
    
    paddd mm5, mm0
    movq mm0, [eax+ecx+8]

    psadbw mm0, mm4
    paddd mm3, mm0
    ; row after that -> mm2 (low), mm1 (high)
    movq mm0, [eax+2*ecx]
    psadbw mm0, mm4
    paddd mm2, mm0

    movq mm0, [eax+2*ecx+8]
    add	eax,edx
    psadbw mm0, mm4
    paddd mm1, mm0
    ; preload the low half of the following row for the next stage
    movq mm0, [eax]
%endif
%if (%1 == 2)
    ; finish the last preloaded row
    psadbw mm0, mm4
    paddd mm7, mm0
    movq mm0, [eax+8]
    psadbw mm0, mm4
    paddd mm6, mm0
%endif
%endmacro

; dev16_3dne: deviation of a 16x16 byte block from its mean.
;
; Stack arguments: [esp+4] cur, [esp+8] stride.
; Out:   eax = sum over the 256 bytes of |byte - mean|, where mean is bits
;        8..15 of the block's byte sum (the sum divided by 256, truncated).
; Clobb: ecx, edx, mm0-mm7, flags. Nothing is pushed; esp is unchanged.
; The routine does not execute emms.
;
; The block is traversed twice with the same macro sequence
; (stage 0, four times stage 1, stage 2 = 16 rows): once against zero to get
; the byte sum, once against the broadcast mean to get the deviation.
align 16
dev16_3dne:

    mov     eax, [esp+ 4]           ; cur
    mov     ecx, [esp+ 8]           ; stride
    lea     edx, [ecx+2*ecx]        ; edx = 3*stride

    pxor    mm4, mm4                ; compare against zero: psadbw yields byte sums
align 8
    ; ---- pass 1: sum of all 256 bytes ----
    ABS_16x16_SSE 0
%rep 4
    ABS_16x16_SSE 1
%endrep
    paddd   mm1, mm2
    paddd   mm3, mm5
    ABS_16x16_SSE 2
    paddd   mm7, mm6
    paddd   mm1, mm3
    mov     eax, [esp+ 4]           ; rewind to the first row for pass 2
    paddd   mm7, mm1                ; mm7 = block sum

    ; Duplicate each byte of the sum, then broadcast word 1 (which holds
    ; bits 8..15 of the sum in both of its bytes) to all four words of mm4.
    punpcklbw mm7, mm7
    pshufw  mm4, mm7, 055h          ; every byte of mm4 = mean

    ; mm1 is reloaded by ABS_16x16_SSE 0 before it is read again
    pxor    mm1, mm1

    ; ---- pass 2: sum of |byte - mean| ----
    ABS_16x16_SSE 0
%rep 4
    ABS_16x16_SSE 1
%endrep
    paddd   mm1, mm2
    paddd   mm3, mm5
    ABS_16x16_SSE 2
    paddd   mm7, mm6
    paddd   mm1, mm3
    paddd   mm7, mm1
    movd    eax, mm7
    ret

No admin address has been configured
ViewVC Help
Powered by ViewVC 1.0.4