;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - 8<->16 bit transfer functions -
; *
; *  Copyright (C) 2001      Peter Ross <pross@xvid.org>
; *                2001-2008 Michael Militzer <michael@xvid.org>
; *                2002      Pascal Massimino <skal@planet-d.net>
; *
; *  This program is free software; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: mem_transfer_mmx.asm,v 1.20 2008-11-26 01:04:34 Isibaar Exp $
; *
; ***************************************************************************/
|
|
%include "nasm.inc"

;=============================================================================
; Read only data
;=============================================================================

DATA

ALIGN SECTION_ALIGN
mmx_one:
  dw 1, 1, 1, 1

;=============================================================================
; Code
;=============================================================================

SECTION .rotext align=SECTION_ALIGN

cglobal transfer_8to16copy_mmx
cglobal transfer_16to8copy_mmx
cglobal transfer_8to16sub_mmx
cglobal transfer_8to16subro_mmx
cglobal transfer_8to16sub2_mmx
cglobal transfer_8to16sub2_xmm
cglobal transfer_8to16sub2ro_xmm
cglobal transfer_16to8add_mmx
cglobal transfer8x8_copy_mmx
cglobal transfer8x4_copy_mmx
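
; Note: prm1..prm5, TMP0/TMP1, _EAX/_EBX/_ESI/_ESP, DATA, ALIGN/SECTION_ALIGN,
; cglobal and ENDFUNC are helpers provided by nasm.inc that abstract over the
; 32-bit and 64-bit calling conventions (on x86-64 the prmN names map to
; argument registers; on ia32 they expand to stack loads). This description
; is inferred from their use below rather than from nasm.inc itself.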

;-----------------------------------------------------------------------------
;
; void transfer_8to16copy_mmx(int16_t * const dst,
;                             const uint8_t * const src,
;                             uint32_t stride);
;
;-----------------------------------------------------------------------------
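
; Each COPY_8_TO_16 invocation below converts two 8-pixel rows: punpcklbw /
; punpckhbw interleave the packed bytes with the zeroed mm7, which widens
; the unsigned 8-bit pixels to 16-bit words; the low and high halves of each
; row are then stored as four quadwords (32 bytes per row pair) in dst.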
|
|
|
|
%macro COPY_8_TO_16 1
  movq mm0, [_EAX]
  movq mm1, [_EAX+TMP1]
  movq mm2, mm0
  movq mm3, mm1
  punpcklbw mm0, mm7
  movq [TMP0+%1*32], mm0
  punpcklbw mm1, mm7
  movq [TMP0+%1*32+16], mm1
  punpckhbw mm2, mm7
  punpckhbw mm3, mm7
  lea _EAX, [_EAX+2*TMP1]
  movq [TMP0+%1*32+8], mm2
  movq [TMP0+%1*32+24], mm3
%endmacro

ALIGN SECTION_ALIGN
transfer_8to16copy_mmx:

  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride

  pxor mm7, mm7 ; mm7 = zero

  COPY_8_TO_16 0
  COPY_8_TO_16 1
  COPY_8_TO_16 2
  COPY_8_TO_16 3
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer_16to8copy_mmx(uint8_t * const dst,
;                             const int16_t * const src,
;                             uint32_t stride);
;
;-----------------------------------------------------------------------------
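
; COPY_16_TO_8 below packs two rows of 16-bit values back down to bytes:
; packuswb merges two quadwords of words into one quadword of unsigned
; bytes, saturating values outside [0,255] instead of wrapping.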
|
|
|
|
%macro COPY_16_TO_8 1
  movq mm0, [_EAX+%1*32]
  movq mm1, [_EAX+%1*32+8]
  packuswb mm0, mm1
  movq [TMP0], mm0
  movq mm2, [_EAX+%1*32+16]
  movq mm3, [_EAX+%1*32+24]
  packuswb mm2, mm3
  movq [TMP0+TMP1], mm2
%endmacro

ALIGN SECTION_ALIGN
transfer_16to8copy_mmx:

  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride

  COPY_16_TO_8 0
  lea TMP0,[TMP0+2*TMP1]
  COPY_16_TO_8 1
  lea TMP0,[TMP0+2*TMP1]
  COPY_16_TO_8 2
  lea TMP0,[TMP0+2*TMP1]
  COPY_16_TO_8 3
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer_8to16sub_mmx(int16_t * const dct,
;                            uint8_t * const cur,
;                            const uint8_t * const ref,
;                            const uint32_t stride);
;
;-----------------------------------------------------------------------------
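
; COPY_8_TO_16_SUB handles two rows per invocation: cur and ref are widened
; to 16 bits and psubsw stores the saturated difference cur - ref into the
; dct block. With the macro's second argument == 1 the ref rows are also
; written back over cur (the behaviour transfer_8to16sub_mmx needs for
; motion compensation); with 0, cur is left untouched, which is what the
; read-only variant transfer_8to16subro_mmx uses.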
|
; when the second argument == 1, the reference (_EBX) block is copied to current (_EAX)
%macro COPY_8_TO_16_SUB 2
  movq mm0, [_EAX]      ; cur
  movq mm2, [_EAX+TMP1]
  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  movq mm4, [_EBX]      ; ref
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  movq mm5, [_EBX+TMP1] ; ref

  movq mm6, mm4
%if %2 == 1
  movq [_EAX], mm4
  movq [_EAX+TMP1], mm5
%endif
  punpcklbw mm4, mm7
  punpckhbw mm6, mm7
  psubsw mm0, mm4
  psubsw mm1, mm6
  movq mm6, mm5
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  lea _EAX, [_EAX+2*TMP1]
  psubsw mm3, mm6
  lea _EBX, [_EBX+2*TMP1]

  movq [TMP0+%1*32+ 0], mm0 ; dst
  movq [TMP0+%1*32+ 8], mm1
  movq [TMP0+%1*32+16], mm2
  movq [TMP0+%1*32+24], mm3
%endmacro

ALIGN SECTION_ALIGN
transfer_8to16sub_mmx:
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Cur
  mov TMP1, prm4 ; Stride

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm3
%else
  mov _EBX, [_ESP+4+12] ; Ref
%endif
  pxor mm7, mm7

  COPY_8_TO_16_SUB 0, 1
  COPY_8_TO_16_SUB 1, 1
  COPY_8_TO_16_SUB 2, 1
  COPY_8_TO_16_SUB 3, 1

  pop _EBX
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer_8to16subro_mmx(int16_t * const dct,
;                              const uint8_t * const cur,
;                              const uint8_t * const ref,
;                              const uint32_t stride);
;
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
transfer_8to16subro_mmx:
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Cur
  mov TMP1, prm4 ; Stride

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm3
%else
  mov _EBX, [_ESP+4+12] ; Ref
%endif
  pxor mm7, mm7

  COPY_8_TO_16_SUB 0, 0
  COPY_8_TO_16_SUB 1, 0
  COPY_8_TO_16_SUB 2, 0
  COPY_8_TO_16_SUB 3, 0

  pop _EBX
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer_8to16sub2_mmx(int16_t * const dct,
;                             uint8_t * const cur,
;                             const uint8_t * ref1,
;                             const uint8_t * ref2,
;                             const uint32_t stride)
;
;-----------------------------------------------------------------------------
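
; Plain MMX has no byte-average instruction, so COPY_8_TO_16_SUB2_MMX
; computes (ref1 + ref2 + 1) >> 1 by widening both references to words,
; adding them plus the mmx_one rounding constant, and shifting right by
; one. The rounded average is packed back, written over cur, and then
; subtracted from the original cur pixels to form the dct rows.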

%macro COPY_8_TO_16_SUB2_MMX 1
  movq mm0, [_EAX]      ; cur
  movq mm2, [_EAX+TMP1]

  ; mm4 <- (ref1+ref2+1) / 2
  movq mm4, [_EBX]      ; ref1
  movq mm1, [_ESI]      ; ref2
  movq mm6, mm4
  movq mm3, mm1
  punpcklbw mm4, mm7
  punpcklbw mm1, mm7
  punpckhbw mm6, mm7
  punpckhbw mm3, mm7
  paddusw mm4, mm1
  paddusw mm6, mm3
  paddusw mm4, [mmx_one]
  paddusw mm6, [mmx_one]
  psrlw mm4, 1
  psrlw mm6, 1
  packuswb mm4, mm6
  movq [_EAX], mm4

  ; mm5 <- (ref1+ref2+1) / 2
  movq mm5, [_EBX+TMP1] ; ref1
  movq mm1, [_ESI+TMP1] ; ref2
  movq mm6, mm5
  movq mm3, mm1
  punpcklbw mm5, mm7
  punpcklbw mm1, mm7
  punpckhbw mm6, mm7
  punpckhbw mm3, mm7
  paddusw mm5, mm1
  paddusw mm6, mm3
  paddusw mm5, [mmx_one]
  paddusw mm6, [mmx_one]
  lea _ESI, [_ESI+2*TMP1]
  psrlw mm5, 1
  psrlw mm6, 1
  packuswb mm5, mm6
  movq [_EAX+TMP1], mm5

  movq mm1, mm0
  movq mm3, mm2
  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7

  movq mm6, mm4
  punpcklbw mm4, mm7
  punpckhbw mm6, mm7
  psubsw mm0, mm4
  psubsw mm1, mm6
  movq mm6, mm5
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  lea _EAX, [_EAX+2*TMP1]
  psubsw mm3, mm6
  lea _EBX, [_EBX+2*TMP1]

  movq [TMP0+%1*32+ 0], mm0 ; dst
  movq [TMP0+%1*32+ 8], mm1
  movq [TMP0+%1*32+16], mm2
  movq [TMP0+%1*32+24], mm3
%endmacro

ALIGN SECTION_ALIGN
transfer_8to16sub2_mmx:
  mov TMP0, prm1   ; Dst
  mov TMP1d, prm5d ; Stride
  mov _EAX, prm2   ; Cur

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm3
%else
  mov _EBX, [_ESP+4+12] ; Ref1
%endif

  push _ESI
%ifdef ARCH_IS_X86_64
  mov _ESI, prm4
%else
  mov _ESI, [_ESP+8+16] ; Ref2
%endif

  pxor mm7, mm7

  COPY_8_TO_16_SUB2_MMX 0
  COPY_8_TO_16_SUB2_MMX 1
  COPY_8_TO_16_SUB2_MMX 2
  COPY_8_TO_16_SUB2_MMX 3

  pop _ESI
  pop _EBX
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer_8to16sub2_xmm(int16_t * const dct,
;                             uint8_t * const cur,
;                             const uint8_t * ref1,
;                             const uint8_t * ref2,
;                             const uint32_t stride)
;
;-----------------------------------------------------------------------------
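
; The _xmm variant relies on pavgb (introduced with the SSE / extended-MMX
; instruction sets on PentiumIII/Athlon-class chips): one instruction yields
; the rounded byte average (ref1 + ref2 + 1) >> 1 directly, replacing the
; whole widen/add/shift/pack sequence of the MMX version above.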

%macro COPY_8_TO_16_SUB2_SSE 1
  movq mm0, [_EAX]      ; cur
  movq mm2, [_EAX+TMP1]
  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  movq mm4, [_EBX]       ; ref1
  pavgb mm4, [_ESI]      ; ref2
  movq [_EAX], mm4
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  movq mm5, [_EBX+TMP1]  ; ref
  pavgb mm5, [_ESI+TMP1] ; ref2
  movq [_EAX+TMP1], mm5

  movq mm6, mm4
  punpcklbw mm4, mm7
  punpckhbw mm6, mm7
  psubsw mm0, mm4
  psubsw mm1, mm6
  lea _ESI, [_ESI+2*TMP1]
  movq mm6, mm5
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  lea _EAX, [_EAX+2*TMP1]
  psubsw mm3, mm6
  lea _EBX, [_EBX+2*TMP1]

  movq [TMP0+%1*32+ 0], mm0 ; dst
  movq [TMP0+%1*32+ 8], mm1
  movq [TMP0+%1*32+16], mm2
  movq [TMP0+%1*32+24], mm3
%endmacro

ALIGN SECTION_ALIGN
transfer_8to16sub2_xmm:
  mov TMP0, prm1   ; Dst
  mov _EAX, prm2   ; Cur
  mov TMP1d, prm5d ; Stride

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm3 ; Ref1
%else
  mov _EBX, [_ESP+4+12] ; Ref1
%endif

  push _ESI
%ifdef ARCH_IS_X86_64
  mov _ESI, prm4 ; Ref2
%else
  mov _ESI, [_ESP+8+16] ; Ref2
%endif

  pxor mm7, mm7

  COPY_8_TO_16_SUB2_SSE 0
  COPY_8_TO_16_SUB2_SSE 1
  COPY_8_TO_16_SUB2_SSE 2
  COPY_8_TO_16_SUB2_SSE 3

  pop _ESI
  pop _EBX
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer_8to16sub2ro_xmm(int16_t * const dct,
;                               const uint8_t * const cur,
;                               const uint8_t * ref1,
;                               const uint8_t * ref2,
;                               const uint32_t stride)
;
;-----------------------------------------------------------------------------
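
; Read-only variant of transfer_8to16sub2_xmm: the same averaging and
; subtraction, but the averaged prediction is never written back to cur,
; so the current frame buffer is left untouched.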
|
|
|
|
%macro COPY_8_TO_16_SUB2RO_SSE 1
  movq mm0, [_EAX]      ; cur
  movq mm2, [_EAX+TMP1]
  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  movq mm4, [_EBX]       ; ref1
  pavgb mm4, [_ESI]      ; ref2
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  movq mm5, [_EBX+TMP1]  ; ref
  pavgb mm5, [_ESI+TMP1] ; ref2

  movq mm6, mm4
  punpcklbw mm4, mm7
  punpckhbw mm6, mm7
  psubsw mm0, mm4
  psubsw mm1, mm6
  lea _ESI, [_ESI+2*TMP1]
  movq mm6, mm5
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  lea _EAX, [_EAX+2*TMP1]
  psubsw mm3, mm6
  lea _EBX, [_EBX+2*TMP1]

  movq [TMP0+%1*32+ 0], mm0 ; dst
  movq [TMP0+%1*32+ 8], mm1
  movq [TMP0+%1*32+16], mm2
  movq [TMP0+%1*32+24], mm3
%endmacro

ALIGN SECTION_ALIGN
transfer_8to16sub2ro_xmm:
  pxor mm7, mm7
  mov TMP0, prm1   ; Dst
  mov _EAX, prm2   ; Cur
  mov TMP1d, prm5d ; Stride

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm3
%else
  mov _EBX, [_ESP+4+12] ; Ref1
%endif

  push _ESI
%ifdef ARCH_IS_X86_64
  mov _ESI, prm4
%else
  mov _ESI, [_ESP+8+16] ; Ref2
%endif

  COPY_8_TO_16_SUB2RO_SSE 0
  COPY_8_TO_16_SUB2RO_SSE 1
  COPY_8_TO_16_SUB2RO_SSE 2
  COPY_8_TO_16_SUB2RO_SSE 3

  pop _ESI
  pop _EBX
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer_16to8add_mmx(uint8_t * const dst,
;                            const int16_t * const src,
;                            uint32_t stride);
;
;-----------------------------------------------------------------------------
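
; COPY_16_TO_8_ADD implements the "add residual to prediction" step: two
; rows of dst pixels are widened to words, the 16-bit src values are added
; with signed saturation (paddsw), and packuswb clamps the result back
; into the [0,255] pixel range before it is stored.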
|
|
|
|
%macro COPY_16_TO_8_ADD 1
  movq mm0, [TMP0]
  movq mm2, [TMP0+TMP1]
  movq mm1, mm0
  movq mm3, mm2
  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  paddsw mm0, [_EAX+%1*32+ 0]
  paddsw mm1, [_EAX+%1*32+ 8]
  paddsw mm2, [_EAX+%1*32+16]
  paddsw mm3, [_EAX+%1*32+24]
  packuswb mm0, mm1
  movq [TMP0], mm0
  packuswb mm2, mm3
  movq [TMP0+TMP1], mm2
%endmacro

ALIGN SECTION_ALIGN
transfer_16to8add_mmx:
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride
  pxor mm7, mm7

  COPY_16_TO_8_ADD 0
  lea TMP0,[TMP0+2*TMP1]
  COPY_16_TO_8_ADD 1
  lea TMP0,[TMP0+2*TMP1]
  COPY_16_TO_8_ADD 2
  lea TMP0,[TMP0+2*TMP1]
  COPY_16_TO_8_ADD 3
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer8x8_copy_mmx(uint8_t * const dst,
;                           const uint8_t * const src,
;                           const uint32_t stride);
;
;
;-----------------------------------------------------------------------------
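
; COPY_8_TO_8 moves two 8-byte rows per invocation with plain movq loads
; and stores; it advances the source pointer itself (lea _EAX) while the
; callers step dst by 2*stride between invocations. transfer8x8_copy_mmx
; and transfer8x4_copy_mmx below share this macro (four and two
; invocations respectively).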

%macro COPY_8_TO_8 0
  movq mm0, [_EAX]
  movq mm1, [_EAX+TMP1]
  movq [TMP0], mm0
  lea _EAX, [_EAX+2*TMP1]
  movq [TMP0+TMP1], mm1
%endmacro

ALIGN SECTION_ALIGN
transfer8x8_copy_mmx:
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride

  COPY_8_TO_8
  lea TMP0,[TMP0+2*TMP1]
  COPY_8_TO_8
  lea TMP0,[TMP0+2*TMP1]
  COPY_8_TO_8
  lea TMP0,[TMP0+2*TMP1]
  COPY_8_TO_8
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer8x4_copy_mmx(uint8_t * const dst,
;                           const uint8_t * const src,
;                           const uint32_t stride);
;
;
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
transfer8x4_copy_mmx:
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride

  COPY_8_TO_8
  lea TMP0,[TMP0+2*TMP1]
  COPY_8_TO_8
  ret
ENDFUNC
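
; The ELF note section below tells GNU ld that this object file does not
; require an executable stack.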
|
%ifidn __OUTPUT_FORMAT__,elf
section ".note.GNU-stack" noalloc noexec nowrite progbits
%endif