Diff of /branches/dev-api-4/xvidcore/src/utils/x86_asm/mem_transfer_mmx.asm

-trunk/xvidcore/src/utils/x86_asm/mem_transfer_mmx.asm
revision 216, Sun Jun 16 17:25:18 2002 UTC
+branches/dev-api-4/xvidcore/src/utils/x86_asm/mem_transfer_mmx.asm
revision 1192, Tue Oct 28 22:23:03 2003 UTC
 Line 1
- ;/**************************************************************************
+ ;/****************************************************************************
  ; *
  ; *     XVID MPEG-4 VIDEO CODEC
- ; *     mmx 8bit<->16bit transfers
+ ; *  - 8<->16 bit transfer functions -
  ; *
- ; *     This program is an implementation of a part of one or more MPEG-4
+ ; *  Copyright (C) 2001 Peter Ross <pross@xvid.org>
- ; *     Video tools as specified in ISO/IEC 14496-2 standard.  Those intending
+ ; *                2001 Michael Militzer <isibaar@xvid.org>
- ; *     to use this software module in hardware or software products are
+ ; *                2002 Pascal Massimino <skal@planet-d.net>
- ; *     advised that its use may infringe existing patents or copyrights, and
- ; *     any such use would be at such party's own risk.  The original
- ; *     developer of this software module and his/her company, and subsequent
- ; *     editors and their companies, will have no liability for use of this
- ; *     software or modifications or derivatives thereof.
  ; *
  ; *     This program is free software; you can redistribute it and/or modify
  ; *     it under the terms of the GNU General Public License as published by
-Line 24
+Line 19
  ; *
  ; *     You should have received a copy of the GNU General Public License
  ; *     along with this program; if not, write to the Free Software
- ; *     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ ; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  ; *
- ; *************************************************************************/
+ ; * $Id: mem_transfer_mmx.asm,v 1.9.2.1 2003-10-28 22:23:03 edgomez Exp $
- ;/**************************************************************************
- ; *
- ; *     History:
  ; *
- ; * 07.01.2002  merge functions from compensate_mmx; rename functions
+ ; ***************************************************************************/
- ; *     07.11.2001      initial version; (c)2001 peter ross <pross@cs.rmit.edu.au>
- ; *
- ; *************************************************************************/
- bits 32
+ BITS 32
  %macro cglobal 1
          %ifdef PREFIX
-Line 49
+Line 36
          %endif
  %endmacro
+ ;=============================================================================
+ ; Code
+ ;=============================================================================
- section .text
+ SECTION .text
+ cglobal transfer_8to16copy_mmx
+ cglobal transfer_16to8copy_mmx
+ cglobal transfer_8to16sub_mmx
+ cglobal transfer_8to16subro_mmx
+ cglobal transfer_8to16sub2_mmx
+ cglobal transfer_8to16sub2_xmm
+ cglobal transfer_16to8add_mmx
+ cglobal transfer8x8_copy_mmx
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
  ;
  ; void transfer_8to16copy_mmx(int16_t * const dst,
  ;                                                       const uint8_t * const src,
  ;                                                       uint32_t stride);
  ;
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
- align 16
- cglobal transfer_8to16copy_mmx
- transfer_8to16copy_mmx
-                 push    esi
-                 push    edi
-                 mov     edi, [esp + 8 + 4]              ; dst
-                 mov     esi, [esp + 8 + 8]              ; src
-                 mov ecx, [esp + 8 + 12]         ; stride
-                 pxor mm7, mm7                           ; mm7 = zero
-                 mov eax, 8
+ %macro COPY_8_TO_16 1
+   movq mm0, [eax]
- .loop
+   movq mm1, [eax+edx]
-                 movq mm0, [esi]
+   movq mm2, mm0
-                 movq mm1, mm0
+   movq mm3, mm1
-                 punpcklbw mm0, mm7              ; mm01 = unpack([src])
+   punpcklbw mm0, mm7
-                 punpckhbw mm1, mm7
+   movq [ecx+%1*32], mm0
+   punpcklbw mm1, mm7
-                 movq [edi], mm0                 ; [dst] = mm01
+   movq [ecx+%1*32+16], mm1
-                 movq [edi + 8], mm1
+   punpckhbw mm2, mm7
+   punpckhbw mm3, mm7
+   lea eax, [eax+2*edx]
+   movq [ecx+%1*32+8], mm2
+   movq [ecx+%1*32+24], mm3
+ %endmacro
-                 add edi, 16
+ ALIGN 16
-                 add esi, ecx
+ transfer_8to16copy_mmx:
-                 dec eax
-                 jnz .loop
-                 pop edi
+   mov ecx, [esp+ 4] ; Dst
-                 pop esi
+   mov eax, [esp+ 8] ; Src
+   mov edx, [esp+12] ; Stride
+   pxor mm7, mm7
+   COPY_8_TO_16 0
+   COPY_8_TO_16 1
+   COPY_8_TO_16 2
+   COPY_8_TO_16 3
                  ret
+ ;-----------------------------------------------------------------------------
- ;===========================================================================
  ;
  ; void transfer_16to8copy_mmx(uint8_t * const dst,
  ;                                                       const int16_t * const src,
  ;                                                       uint32_t stride);
  ;
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
- align 16
- cglobal transfer_16to8copy_mmx
- transfer_16to8copy_mmx
-                 push    esi
-                 push    edi
-                 mov     edi, [esp + 8 + 4]              ; dst
+ %macro COPY_16_TO_8 1
-                 mov     esi, [esp + 8 + 8]              ; src
+   movq mm0, [eax+%1*32]
-                 mov ecx, [esp + 8 + 12]         ; stride
+   movq mm1, [eax+%1*32+8]
+   packuswb mm0, mm1
-                 mov eax, 8
+   movq [ecx], mm0
+   movq mm2, [eax+%1*32+16]
- .loop
+   movq mm3, [eax+%1*32+24]
-                 movq mm0, [esi]
+   packuswb mm2, mm3
-                 packuswb mm0, [esi + 8]         ; mm0 = pack([src])
+   movq [ecx+edx], mm2
+ %endmacro
-                 movq [edi], mm0                         ; [dst] = mm0
-                 add esi, 16
-                 add edi, ecx
-                 dec eax
-                 jnz .loop
-                 pop edi
+ ALIGN 16
-                 pop esi
+ transfer_16to8copy_mmx:
+   mov ecx, [esp+ 4] ; Dst
+   mov eax, [esp+ 8] ; Src
+   mov edx, [esp+12] ; Stride
+   COPY_16_TO_8 0
+   lea ecx,[ecx+2*edx]
+   COPY_16_TO_8 1
+   lea ecx,[ecx+2*edx]
+   COPY_16_TO_8 2
+   lea ecx,[ecx+2*edx]
+   COPY_16_TO_8 3
                  ret
+ ;-----------------------------------------------------------------------------
- ;===========================================================================
  ;
  ; void transfer_8to16sub_mmx(int16_t * const dct,
  ;                               uint8_t * const cur,
  ;                               const uint8_t * const ref,
  ;                               const uint32_t stride);
  ;
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
- ;/**************************************************************************
- ; *
- ; *     History:
- ; *
- ; * 27.12.2001  renamed from 'compensate' to 'transfer_8to16sub'
- ; * 02.12.2001  loop unrolled, code runs 10% faster now (Isibaar)
- ; * 30.11.2001  16 pixels are processed per iteration (Isibaar)
- ; * 30.11.2001  .text missing
- ; *     06.11.2001      inital version; (c)2001 peter ross <pross@cs.rmit.edu.au>
- ; *
- ; *************************************************************************/
- align 16
- cglobal transfer_8to16sub_mmx
- transfer_8to16sub_mmx
-                 push    esi
-                 push    edi
-                 push    ebx
-                 mov     edi, [esp + 12 + 4]             ; dct [out]
-                 mov     edx, [esp + 12 + 8]             ; cur [in/out]
-                 mov     esi, [esp + 12 + 12]            ; ref [in]
-                 mov ecx, [esp + 12 + 16]                ; stride [in]
-                 mov eax, edx                            ; cur -> eax
+ ; when second argument == 1, the reference (ebx) block is copied to the current (eax) block
-                 mov ebx, esi                            ; ref -> ebx
+ %macro COPY_8_TO_16_SUB 2
-                 add eax, ecx                            ; cur + stride
+   movq mm0, [eax]      ; cur
-                 add ebx, ecx                            ; ref + stride
+   movq mm2, [eax+edx]
-                 shl ecx, 1
-                 pxor mm7, mm7                   ; mm7 = zero
-                 movq mm0, [edx]                 ; mm01 = [cur]
                  movq mm1, mm0
-                 punpcklbw mm0, mm7
-                 punpckhbw mm1, mm7
-                 movq mm4, [eax]
-                 movq mm5, mm4
-                 punpcklbw mm4, mm7
-                 punpckhbw mm5, mm7
-                 movq mm2, [esi]                 ; mm23 = [ref]
                  movq mm3, mm2
-                 movq mm6, [ebx]
-                 movq [edx], mm2                 ; [cur] = [ref]
-                 movq [eax], mm6
-                 punpcklbw mm2, mm7
-                 punpckhbw mm3, mm7
-                 psubsw mm0, mm2                 ; mm01 -= mm23
-                 movq mm2, mm6
-                 punpcklbw mm2, mm7
-                 punpckhbw mm6, mm7
-                 psubsw mm1, mm3
-                 psubsw mm4, mm2
-                 psubsw mm5, mm6
-                 movq [edi], mm0                 ; dct[] = mm01
-                 movq [edi + 8], mm1
-                 movq [edi + 16], mm4
-                 movq [edi + 24], mm5
-                 add edx, ecx
-                 add esi, ecx
-                 add eax, ecx
-                 add ebx, ecx
-                 movq mm0, [edx]                 ; mm01 = [cur]
-                 movq mm1, mm0
                  punpcklbw mm0, mm7
-                 punpckhbw mm1, mm7
-                 movq mm4, [eax]
-                 movq mm5, mm4
-                 punpcklbw mm4, mm7
-                 punpckhbw mm5, mm7
-                 movq mm2, [esi]                 ; mm23 = [ref]
-                 movq mm3, mm2
-                 movq mm6, [ebx]
-                 movq [edx], mm2                 ; [cur] = [ref]
-                 movq [eax], mm6
                  punpcklbw mm2, mm7
+   movq mm4, [ebx]      ; ref
+   punpckhbw mm1, mm7
                  punpckhbw mm3, mm7
+   movq mm5, [ebx+edx]  ; ref
-                 psubsw mm0, mm2                 ; mm01 -= mm23
+   movq mm6, mm4
+ %if %2 == 1
-                 movq mm2, mm6
+   movq [eax], mm4
+   movq [eax+edx], mm5
-                 punpcklbw mm2, mm7
+ %endif
+   punpcklbw mm4, mm7
                  punpckhbw mm6, mm7
+   psubsw mm0, mm4
+   psubsw mm1, mm6
+   movq mm6, mm5
+   punpcklbw mm5, mm7
+   punpckhbw mm6, mm7
+   psubsw mm2, mm5
+   lea eax, [eax+2*edx]
+   psubsw mm3, mm6
+   lea ebx,[ebx+2*edx]
+   movq [ecx+%1*32+ 0], mm0 ; dst
+   movq [ecx+%1*32+ 8], mm1
+   movq [ecx+%1*32+16], mm2
+   movq [ecx+%1*32+24], mm3
+ %endmacro
-                 psubsw mm1, mm3
+ ALIGN 16
+ transfer_8to16sub_mmx:
-                 psubsw mm4, mm2
+   mov ecx, [esp  + 4] ; Dst
-                 psubsw mm5, mm6
+   mov eax, [esp  + 8] ; Cur
+   push ebx
+   mov ebx, [esp+4+12] ; Ref
+   mov edx, [esp+4+16] ; Stride
+   pxor mm7, mm7
-                 movq [edi + 32], mm0                    ; dct[] = mm01
+   COPY_8_TO_16_SUB 0, 1
-                 movq [edi + 40], mm1
+   COPY_8_TO_16_SUB 1, 1
-                 movq [edi + 48], mm4
+   COPY_8_TO_16_SUB 2, 1
-                 movq [edi + 56], mm5
+   COPY_8_TO_16_SUB 3, 1
-                 add edx, ecx
-                 add esi, ecx
-                 add eax, ecx
-                 add ebx, ecx
-                 movq mm0, [edx]                 ; mm01 = [cur]
+   pop ebx
-                 movq mm1, mm0
+   ret
-                 punpcklbw mm0, mm7
-                 punpckhbw mm1, mm7
-                 movq mm4, [eax]
+ ALIGN 16
-                 movq mm5, mm4
+ transfer_8to16subro_mmx:
+   mov ecx, [esp  + 4] ; Dst
+   mov eax, [esp  + 8] ; Cur
+   push ebx
+   mov ebx, [esp+4+12] ; Ref
+   mov edx, [esp+4+16] ; Stride
+   pxor mm7, mm7
-                 punpcklbw mm4, mm7
+   COPY_8_TO_16_SUB 0, 0
-                 punpckhbw mm5, mm7
+   COPY_8_TO_16_SUB 1, 0
+   COPY_8_TO_16_SUB 2, 0
+   COPY_8_TO_16_SUB 3, 0
-                 movq mm2, [esi]                 ; mm23 = [ref]
+   pop ebx
-                 movq mm3, mm2
+   ret
-                 movq mm6, [ebx]
-                 movq [edx], mm2                 ; [cur] = [ref]
+ ;-----------------------------------------------------------------------------
-                 movq [eax], mm6
+ ;
+ ; void transfer_8to16sub2_mmx(int16_t * const dct,
+ ;                               uint8_t * const cur,
+ ;                               const uint8_t * ref1,
+ ;                               const uint8_t * ref2,
+ ;                               const uint32_t stride);
+ ;
+ ;-----------------------------------------------------------------------------
-                 punpcklbw mm2, mm7
+ %macro COPY_8_TO_16_SUB2_MMX 1
+   movq mm0, [eax]      ; cur
+   movq mm2, [eax+edx]
+   ; mm4 <- (ref1+ref2) / 2, truncated -- NOTE: the +1 rounding term of (ref1+ref2+1)/2 is never added before the shift
+   movq mm4, [ebx]      ; ref1
+   movq mm1, [esi]      ; ref2
+   movq mm6, mm4
+   movq mm3, mm1
+   punpcklbw mm4, mm7
+   punpcklbw mm1, mm7
+   punpckhbw mm6, mm7
                  punpckhbw mm3, mm7
+   paddusw mm4, mm1
-                 psubsw mm0, mm2                 ; mm01 -= mm23
+   paddusw mm6, mm3
+   psrlw mm4, 1
-                 movq mm2, mm6
+   psrlw mm6, 1
+   packuswb mm4, mm6
-                 punpcklbw mm2, mm7
+     ; mm5 <- (ref1+ref2) / 2, truncated -- NOTE: the +1 rounding term of (ref1+ref2+1)/2 is never added before the shift
+   movq mm5, [ebx+edx]  ; ref1
+   movq mm1, [esi+edx]  ; ref2
+   movq mm6, mm5
+   movq mm3, mm1
+   punpcklbw mm5, mm7
+   punpcklbw mm1, mm7
                  punpckhbw mm6, mm7
+   punpckhbw mm3, mm7
+   paddusw mm5, mm1
+   paddusw mm6, mm3
+   lea esi, [esi+2*edx]
+   psrlw mm5, 1
+   psrlw mm6, 1
+   packuswb mm5, mm6
-                 psubsw mm1, mm3
-                 psubsw mm4, mm2
-                 psubsw mm5, mm6
-                 movq [edi + 64], mm0                    ; dct[] = mm01
-                 movq [edi + 72], mm1
-                 movq [edi + 80], mm4
-                 movq [edi + 88], mm5
-                 add edx, ecx
-                 add esi, ecx
-                 add eax, ecx
-                 add ebx, ecx
-                 movq mm0, [edx]                 ; mm01 = [cur]
                  movq mm1, mm0
-                 punpcklbw mm0, mm7
-                 punpckhbw mm1, mm7
-                 movq mm4, [eax]
-                 movq mm5, mm4
-                 punpcklbw mm4, mm7
-                 punpckhbw mm5, mm7
-                 movq mm2, [esi]                 ; mm23 = [ref]
                  movq mm3, mm2
+   punpcklbw mm0, mm7
-                 movq mm6, [ebx]
-                 movq [edx], mm2                 ; [cur] = [ref]
-                 movq [eax], mm6
                  punpcklbw mm2, mm7
+   punpckhbw mm1, mm7
                  punpckhbw mm3, mm7
-                 psubsw mm0, mm2                 ; mm01 -= mm23
+   movq mm6, mm4
+   punpcklbw mm4, mm7
-                 movq mm2, mm6
-                 punpcklbw mm2, mm7
                  punpckhbw mm6, mm7
+   psubsw mm0, mm4
+   psubsw mm1, mm6
+   movq mm6, mm5
+   punpcklbw mm5, mm7
+   punpckhbw mm6, mm7
+   psubsw mm2, mm5
+   lea eax, [eax+2*edx]
+   psubsw mm3, mm6
+   lea ebx, [ebx+2*edx]
+   movq [ecx+%1*32+ 0], mm0 ; dst
+   movq [ecx+%1*32+ 8], mm1
+   movq [ecx+%1*32+16], mm2
+   movq [ecx+%1*32+24], mm3
+ %endmacro
-                 psubsw mm1, mm3
+ ALIGN 16
+ transfer_8to16sub2_mmx:
-                 psubsw mm4, mm2
+   mov ecx, [esp  + 4] ; Dst
-                 psubsw mm5, mm6
+   mov eax, [esp  + 8] ; Cur
+   push ebx
+   mov ebx, [esp+4+12] ; Ref1
+   push esi
+   mov esi, [esp+8+16] ; Ref2
+   mov edx, [esp+8+20] ; Stride
+   pxor mm7, mm7
-                 movq [edi + 96], mm0                    ; dct[] = mm01
+   COPY_8_TO_16_SUB2_MMX 0
-                 movq [edi + 104], mm1
+   COPY_8_TO_16_SUB2_MMX 1
-                 movq [edi + 112], mm4
+   COPY_8_TO_16_SUB2_MMX 2
-                 movq [edi + 120], mm5
+   COPY_8_TO_16_SUB2_MMX 3
-                 pop ebx
-                 pop edi
                  pop esi
+   pop ebx
                  ret
+ ;-----------------------------------------------------------------------------
- ;===========================================================================
  ;
  ; void transfer_8to16sub2_xmm(int16_t * const dct,
  ;                                                         uint8_t * const cur,
  ;                                                         const uint8_t * ref1,
  ;                                                         const uint8_t * ref2,
- ;                                                         const uint32_t stride);
+ ;                               const uint32_t stride);
  ;
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
- align 16
- cglobal transfer_8to16sub2_xmm
- transfer_8to16sub2_xmm
-                 push edi
-                 push esi
-                 push ebx
-                 mov edi, [esp + 12 +  4] ; edi = &dct
-                 mov esi, [esp + 12 +  8] ; esi = &cur
-                 mov ebx, [esp + 12 + 12] ; ebx = &ref1
-                 mov edx, [esp + 12 + 16] ; edx = &ref2
-                 mov eax, [esp + 12 + 20] ; eax = stride
-                 pxor mm7, mm7   ; mm7 = 0
-                 shl eax, 1      ; eax = stride<<1
-                 ; Row processing
-                 ; One row at a time
-                 movq mm0, [esi + 0] ; mm0 = cur row
-                 movq mm2, [ebx + 0]     ; mm2 = ref1 row
-                 movq mm3, [edx + 0]     ; mm3 = ref2 row
-                 movq mm1, mm0   ; mm1 = cur row
-                 pavgb mm2, mm3          ; mm2 = (ref1 + ref2 + 1)/2 (== avg)
-                 punpcklbw mm0, mm7      ; mm0 = cur(3-0) <-> 16bit
-                 movq mm3,mm2            ; mm3 = avg
-                 punpckhbw mm1, mm7      ; mm1 = cur(7-4) <-> 16bit
-                 punpcklbw mm2, mm7      ; mm2 = avg(3-0) <-> 16bit
-                 punpckhbw mm3, mm7      ; mm3 = avg(7-4) <-> 16bit
-                 psubw mm0, mm2          ; mm0 = cur(3-0) - avg(3-0)
-                 psubw mm1, mm3          ; mm1 = cur(7-4) - avg(7-4)
-                 movq [edi + 0], mm0 ; dct(3-0) = mm0
-                 movq [edi + 8], mm1 ; dct(7-4) = mm1
-                 ; Increment all pointers
-                 add edi, eax    ; edi = &(next dct row)
-                 ; Row processing
-                 ; One row at a time
-                 movq mm0, [esi + 8] ; mm0 = cur row
-                 movq mm2, [ebx + 8]     ; mm2 = ref1 row
-                 movq mm3, [edx + 8]     ; mm3 = ref2 row
-                 movq mm1, mm0   ; mm1 = cur row
-                 pavgb mm2, mm3          ; mm2 = (ref1 + ref2 + 1)/2 (== avg)
-                 punpcklbw mm0, mm7      ; mm0 = cur(3-0) <-> 16bit
-                 movq mm3,mm2            ; mm3 = avg
-                 punpckhbw mm1, mm7      ; mm1 = cur(7-4) <-> 16bit
-                 punpcklbw mm2, mm7      ; mm2 = avg(3-0) <-> 16bit
-                 punpckhbw mm3, mm7      ; mm3 = avg(7-4) <-> 16bit
-                 psubw mm0, mm2          ; mm0 = cur(3-0) - avg(3-0)
-                 psubw mm1, mm3          ; mm1 = cur(7-4) - avg(7-4)
-                 movq [edi + 0], mm0 ; dct(3-0) = mm0
-                 movq [edi + 8], mm1 ; dct(7-4) = mm1
-                 ; Increment all pointers
-                 add edi, eax    ; edi = &(next dct row)
-                 ; Row processing
-                 ; One row at a time
-                 movq mm0, [esi + 16] ; mm0 = cur row
-                 movq mm2, [ebx + 16]    ; mm2 = ref1 row
-                 movq mm3, [edx + 16]    ; mm3 = ref2 row
-                 movq mm1, mm0   ; mm1 = cur row
-                 pavgb mm2, mm3          ; mm2 = (ref1 + ref2 + 1)/2 (== avg)
-                 punpcklbw mm0, mm7      ; mm0 = cur(3-0) <-> 16bit
-                 movq mm3,mm2            ; mm3 = avg
-                 punpckhbw mm1, mm7      ; mm1 = cur(7-4) <-> 16bit
-                 punpcklbw mm2, mm7      ; mm2 = avg(3-0) <-> 16bit
-                 punpckhbw mm3, mm7      ; mm3 = avg(7-4) <-> 16bit
-                 psubw mm0, mm2          ; mm0 = cur(3-0) - avg(3-0)
-                 psubw mm1, mm3          ; mm1 = cur(7-4) - avg(7-4)
-                 movq [edi + 0], mm0 ; dct(3-0) = mm0
-                 movq [edi + 8], mm1 ; dct(7-4) = mm1
-                 ; Increment all pointers
-                 add edi, eax    ; edi = &(next dct row)
-                 ; Row processing
-                 ; One row at a time
-                 movq mm0, [esi + 24] ; mm0 = cur row
-                 movq mm2, [ebx + 24]    ; mm2 = ref1 row
-                 movq mm3, [edx + 24]    ; mm3 = ref2 row
-                 movq mm1, mm0   ; mm1 = cur row
-                 pavgb mm2, mm3          ; mm2 = (ref1 + ref2 + 1)/2 (== avg)
-                 punpcklbw mm0, mm7      ; mm0 = cur(3-0) <-> 16bit
-                 movq mm3,mm2            ; mm3 = avg
-                 punpckhbw mm1, mm7      ; mm1 = cur(7-4) <-> 16bit
-                 punpcklbw mm2, mm7      ; mm2 = avg(3-0) <-> 16bit
-                 punpckhbw mm3, mm7      ; mm3 = avg(7-4) <-> 16bit
-                 psubw mm0, mm2          ; mm0 = cur(3-0) - avg(3-0)
-                 psubw mm1, mm3          ; mm1 = cur(7-4) - avg(7-4)
-                 movq [edi + 0], mm0 ; dct(3-0) = mm0
-                 movq [edi + 8], mm1 ; dct(7-4) = mm1
-                 ; Increment all pointers
-                 add edi, eax    ; edi = &(next dct row)
-                 ; Row processing
-                 ; One row at a time
-                 movq mm0, [esi + 32] ; mm0 = cur row
-                 movq mm2, [ebx + 32]    ; mm2 = ref1 row
-                 movq mm3, [edx + 32]    ; mm3 = ref2 row
-                 movq mm1, mm0   ; mm1 = cur row
-                 pavgb mm2, mm3          ; mm2 = (ref1 + ref2 + 1)/2 (== avg)
-                 punpcklbw mm0, mm7      ; mm0 = cur(3-0) <-> 16bit
-                 movq mm3,mm2            ; mm3 = avg
-                 punpckhbw mm1, mm7      ; mm1 = cur(7-4) <-> 16bit
-                 punpcklbw mm2, mm7      ; mm2 = avg(3-0) <-> 16bit
-                 punpckhbw mm3, mm7      ; mm3 = avg(7-4) <-> 16bit
-                 psubw mm0, mm2          ; mm0 = cur(3-0) - avg(3-0)
+ %macro COPY_8_TO_16_SUB2_SSE 1
-                 psubw mm1, mm3          ; mm1 = cur(7-4) - avg(7-4)
+   movq mm0, [eax]      ; cur
+   movq mm2, [eax+edx]
-                 movq [edi + 0], mm0 ; dct(3-0) = mm0
+   movq mm1, mm0
-                 movq [edi + 8], mm1 ; dct(7-4) = mm1
+   movq mm3, mm2
-                 ; Increment all pointers
-                 add edi, eax    ; edi = &(next dct row)
-                 ; Row processing
-                 ; One row at a time
-                 movq mm0, [esi + 40] ; mm0 = cur row
-                 movq mm2, [ebx + 40]    ; mm2 = ref1 row
-                 movq mm3, [edx + 40]    ; mm3 = ref2 row
-                 movq mm1, mm0   ; mm1 = cur row
-                 pavgb mm2, mm3          ; mm2 = (ref1 + ref2 + 1)/2 (== avg)
-                 punpcklbw mm0, mm7      ; mm0 = cur(3-0) <-> 16bit
-                 movq mm3,mm2            ; mm3 = avg
-                 punpckhbw mm1, mm7      ; mm1 = cur(7-4) <-> 16bit
-                 punpcklbw mm2, mm7      ; mm2 = avg(3-0) <-> 16bit
-                 punpckhbw mm3, mm7      ; mm3 = avg(7-4) <-> 16bit
-                 psubw mm0, mm2          ; mm0 = cur(3-0) - avg(3-0)
-                 psubw mm1, mm3          ; mm1 = cur(7-4) - avg(7-4)
-                 movq [edi + 0], mm0 ; dct(3-0) = mm0
-                 movq [edi + 8], mm1 ; dct(7-4) = mm1
-                 ; Increment all pointers
-                 add edi, eax    ; edi = &(next dct row)
-                 ; Row processing
-                 ; One row at a time
-                 movq mm0, [esi + 48] ; mm0 = cur row
-                 movq mm2, [ebx + 48]    ; mm2 = ref1 row
-                 movq mm3, [edx + 48]    ; mm3 = ref2 row
-                 movq mm1, mm0   ; mm1 = cur row
-                 pavgb mm2, mm3          ; mm2 = (ref1 + ref2 + 1)/2 (== avg)
-                 punpcklbw mm0, mm7      ; mm0 = cur(3-0) <-> 16bit
-                 movq mm3,mm2            ; mm3 = avg
-                 punpckhbw mm1, mm7      ; mm1 = cur(7-4) <-> 16bit
-                 punpcklbw mm2, mm7      ; mm2 = avg(3-0) <-> 16bit
-                 punpckhbw mm3, mm7      ; mm3 = avg(7-4) <-> 16bit
-                 psubw mm0, mm2          ; mm0 = cur(3-0) - avg(3-0)
-                 psubw mm1, mm3          ; mm1 = cur(7-4) - avg(7-4)
-                 movq [edi + 0], mm0 ; dct(3-0) = mm0
-                 movq [edi + 8], mm1 ; dct(7-4) = mm1
-                 ; Increment all pointers
-                 add edi, eax    ; edi = &(next dct row)
-                 ; Row processing
-                 ; One row at a time
-                 movq mm0, [esi + 56] ; mm0 = cur row
-                 movq mm2, [ebx + 56]    ; mm2 = ref1 row
-                 movq mm3, [edx + 56]    ; mm3 = ref2 row
-                 movq mm1, mm0   ; mm1 = cur row
-                 pavgb mm2, mm3          ; mm2 = (ref1 + ref2 + 1)/2 (== avg)
-                 punpcklbw mm0, mm7      ; mm0 = cur(3-0) <-> 16bit
-                 movq mm3,mm2            ; mm3 = avg
-                 punpckhbw mm1, mm7      ; mm1 = cur(7-4) <-> 16bit
-                 punpcklbw mm2, mm7      ; mm2 = avg(3-0) <-> 16bit
+   punpcklbw mm0, mm7
-                 punpckhbw mm3, mm7      ; mm3 = avg(7-4) <-> 16bit
+   punpcklbw mm2, mm7
+   movq mm4, [ebx]     ; ref1
+   pavgb mm4, [esi]     ; ref2
+   punpckhbw mm1, mm7
+   punpckhbw mm3, mm7
+   movq mm5, [ebx+edx] ; ref1
+   pavgb mm5, [esi+edx] ; ref2
-                 psubw mm0, mm2          ; mm0 = cur(3-0) - avg(3-0)
+   movq mm6, mm4
-                 psubw mm1, mm3          ; mm1 = cur(7-4) - avg(7-4)
+   punpcklbw mm4, mm7
+   punpckhbw mm6, mm7
+   psubsw mm0, mm4
+   psubsw mm1, mm6
+   lea esi, [esi+2*edx]
+   movq mm6, mm5
+   punpcklbw mm5, mm7
+   punpckhbw mm6, mm7
+   psubsw mm2, mm5
+   lea eax, [eax+2*edx]
+   psubsw mm3, mm6
+   lea ebx, [ebx+2*edx]
+   movq [ecx+%1*32+ 0], mm0 ; dst
+   movq [ecx+%1*32+ 8], mm1
+   movq [ecx+%1*32+16], mm2
+   movq [ecx+%1*32+24], mm3
+ %endmacro
-                 movq [edi + 0], mm0 ; dct(3-0) = mm0
+ ALIGN 16
-                 movq [edi + 8], mm1 ; dct(7-4) = mm1
+ transfer_8to16sub2_xmm:
+   mov ecx, [esp  + 4] ; Dst
+   mov eax, [esp  + 8] ; Cur
+   push ebx
+   mov ebx, [esp+4+12] ; Ref1
+   push esi
+   mov esi, [esp+8+16] ; Ref2
+   mov edx, [esp+8+20] ; Stride
+   pxor mm7, mm7
-                 ; Exit
+   COPY_8_TO_16_SUB2_SSE 0
+   COPY_8_TO_16_SUB2_SSE 1
+   COPY_8_TO_16_SUB2_SSE 2
+   COPY_8_TO_16_SUB2_SSE 3
-                 pop ebx
                  pop esi
-                 pop edi
+   pop ebx
                  ret
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
  ;
  ; void transfer_16to8add_mmx(uint8_t * const dst,
  ;                                               const int16_t * const src,
  ;                                               uint32_t stride);
  ;
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
- align 16
- cglobal transfer_16to8add_mmx
- transfer_16to8add_mmx
-                 push    esi
+ %macro COPY_16_TO_8_ADD 1
-                 push    edi
+   movq mm0, [ecx]
+   movq mm2, [ecx+edx]
-                 mov     edi, [esp + 8 + 4]              ; dst
-                 mov     esi, [esp + 8 + 8]              ; src
-                 mov ecx, [esp + 8 + 12]         ; stride
-                 pxor mm7, mm7
-                 mov eax, 8
- .loop
-                 movq mm0, [edi]
                  movq mm1, mm0
-                 punpcklbw mm0, mm7              ; mm23 = unpack([dst])
+   movq mm3, mm2
+   punpcklbw mm0, mm7
+   punpcklbw mm2, mm7
                  punpckhbw mm1, mm7
+   punpckhbw mm3, mm7
+   paddsw mm0, [eax+%1*32+ 0]
+   paddsw mm1, [eax+%1*32+ 8]
+   paddsw mm2, [eax+%1*32+16]
+   paddsw mm3, [eax+%1*32+24]
+   packuswb mm0, mm1
+   movq [ecx], mm0
+   packuswb mm2, mm3
+   movq [ecx+edx], mm2
+ %endmacro
-                 movq mm2, [esi]                 ; mm01 = [src]
-                 movq mm3, [esi + 8]
-                 paddsw mm0, mm2                 ; mm01 += mm23
-                 paddsw mm1, mm3
-                 packuswb mm0, mm1               ; [dst] = pack(mm01)
-                 movq [edi], mm0
-                 add esi, 16
-                 add edi, ecx
-                 dec eax
-                 jnz .loop
-                 pop edi
+ ALIGN 16
-                 pop esi
+ transfer_16to8add_mmx:
+   mov ecx, [esp+ 4] ; Dst
+   mov eax, [esp+ 8] ; Src
+   mov edx, [esp+12] ; Stride
+   pxor mm7, mm7
+   COPY_16_TO_8_ADD 0
+   lea ecx,[ecx+2*edx]
+   COPY_16_TO_8_ADD 1
+   lea ecx,[ecx+2*edx]
+   COPY_16_TO_8_ADD 2
+   lea ecx,[ecx+2*edx]
+   COPY_16_TO_8_ADD 3
                  ret
+ ;-----------------------------------------------------------------------------
- ;===========================================================================
  ;
  ; void transfer8x8_copy_mmx(uint8_t * const dst,
  ;                                       const uint8_t * const src,
  ;                                       const uint32_t stride);
  ;
  ;
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
- align 16
- cglobal transfer8x8_copy_mmx
- transfer8x8_copy_mmx
-                 push    esi
-                 push    edi
-                 mov     edi, [esp + 8 + 4]              ; dst [out]
+ %macro COPY_8_TO_8 0
-                 mov     esi, [esp + 8 + 8]              ; src [in]
+   movq mm0, [eax]
-                 mov eax, [esp + 8 + 12]         ; stride [in]
+   movq mm1, [eax+edx]
+   movq [ecx], mm0
-                 movq mm0, [esi]
+   lea eax, [eax+2*edx]
-                 movq mm1, [esi+eax]
+   movq [ecx+edx], mm1
-                 movq [edi], mm0
+ %endmacro
-                 movq [edi+eax], mm1
-                 add esi, eax
-                 add edi, eax
-                 add esi, eax
-                 add edi, eax
-                 movq mm0, [esi]
-                 movq mm1, [esi+eax]
-                 movq [edi], mm0
-                 movq [edi+eax], mm1
-                 add esi, eax
-                 add edi, eax
-                 add esi, eax
-                 add edi, eax
-                 movq mm0, [esi]
-                 movq mm1, [esi+eax]
-                 movq [edi], mm0
-                 movq [edi+eax], mm1
-                 add esi, eax
-                 add edi, eax
-                 add esi, eax
-                 add edi, eax
-                 movq mm0, [esi]
-                 movq mm1, [esi+eax]
-                 movq [edi], mm0
-                 movq [edi+eax], mm1
-                 add esi, eax
-                 add edi, eax
-                 add esi, eax
-                 add edi, eax
-                 pop edi
-                 pop esi
+ ALIGN 16
+ transfer8x8_copy_mmx:
+   mov ecx, [esp+ 4] ; Dst
+   mov eax, [esp+ 8] ; Src
+   mov edx, [esp+12] ; Stride
+   COPY_8_TO_8
+   lea ecx,[ecx+2*edx]
+   COPY_8_TO_8
+   lea ecx,[ecx+2*edx]
+   COPY_8_TO_8
+   lea ecx,[ecx+2*edx]
+   COPY_8_TO_8
                  ret

 Legend:



Removed from v.216
 


changed lines


 
Added in v.1192
 Legend:



Removed from v.216
 


changed lines


 
Added in v.1192
-Removed from v.216
+Added in v.1192

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4