Diff of /trunk/xvidcore/src/utils/x86_asm/mem_transfer_mmx.asm

-revision 216, Sun Jun 16 17:25:18 2002 UTC
+revision 654, Sun Nov 17 00:51:11 2002 UTC
 Line 3
  ; *     XVID MPEG-4 VIDEO CODEC
  ; *     mmx 8bit<->16bit transfers
  ; *
- ; *     This program is an implementation of a part of one or more MPEG-4
+ ; *  This file is part of XviD, a free MPEG-4 video encoder/decoder
- ; *     Video tools as specified in ISO/IEC 14496-2 standard.  Those intending
- ; *     to use this software module in hardware or software products are
- ; *     advised that its use may infringe existing patents or copyrights, and
- ; *     any such use would be at such party's own risk.  The original
- ; *     developer of this software module and his/her company, and subsequent
- ; *     editors and their companies, will have no liability for use of this
- ; *     software or modifications or derivatives thereof.
  ; *
- ; *     This program is free software; you can redistribute it and/or modify
+ ; *  XviD is free software; you can redistribute it and/or modify it
- ; *     it under the terms of the GNU General Public License as published by
+ ; *  under the terms of the GNU General Public License as published by
  ; *     the Free Software Foundation; either version 2 of the License, or
  ; *     (at your option) any later version.
  ; *
-Line 24
+Line 17
  ; *
  ; *     You should have received a copy of the GNU General Public License
  ; *     along with this program; if not, write to the Free Software
- ; *     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ ; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ ; *
+ ; *  Under section 8 of the GNU General Public License, the copyright
+ ; *  holders of XVID explicitly forbid distribution in the following
+ ; *  countries:
+ ; *
+ ; *    - Japan
+ ; *    - United States of America
+ ; *
+ ; *  Linking XviD statically or dynamically with other modules is making a
+ ; *  combined work based on XviD.  Thus, the terms and conditions of the
+ ; *  GNU General Public License cover the whole combination.
+ ; *
+ ; *  As a special exception, the copyright holders of XviD give you
+ ; *  permission to link XviD with independent modules that communicate with
+ ; *  XviD solely through the VFW1.1 and DShow interfaces, regardless of the
+ ; *  license terms of these independent modules, and to copy and distribute
+ ; *  the resulting combined work under terms of your choice, provided that
+ ; *  every copy of the combined work is accompanied by a complete copy of
+ ; *  the source code of XviD (the version of XviD used to produce the
+ ; *  combined work), being distributed under the terms of the GNU General
+ ; *  Public License plus this exception.  An independent module is a module
+ ; *  which is not derived from or based on XviD.
+ ; *
+ ; *  Note that people who make modified versions of XviD are not obligated
+ ; *  to grant this special exception for their modified versions; it is
+ ; *  their choice whether to do so.  The GNU General Public License gives
+ ; *  permission to release a modified version without this exception; this
+ ; *  exception also makes it possible to release a modified version which
+ ; *  carries forward this exception.
+ ; *
+ ; * $Id: mem_transfer_mmx.asm,v 1.8 2002-11-17 00:51:11 edgomez Exp $
  ; *
  ; *************************************************************************/
-Line 32
+Line 56
  ; *
  ; *     History:
  ; *
+ ; * 04.06.2002  speed enhancement (unroll+overlap). -Skal-
+ ; *             + added transfer_8to16sub2_mmx/xmm
  ; * 07.01.2002  merge functions from compensate_mmx; rename functions
- ; *     07.11.2001      initial version; (c)2001 peter ross <pross@cs.rmit.edu.au>
+ ; *     07.11.2001      initial version; (c)2001 peter ross <pross@xvid.org>
  ; *
  ; *************************************************************************/
-Line 52
+Line 78
  section .text
+ cglobal transfer_8to16copy_mmx
+ cglobal transfer_16to8copy_mmx
+ cglobal transfer_8to16sub_mmx
+ cglobal transfer_8to16sub2_mmx
+ cglobal transfer_8to16sub2_xmm
+ cglobal transfer_16to8add_mmx
+ cglobal transfer8x8_copy_mmx
  ;===========================================================================
  ;
-Line 61
+Line 94
  ;
  ;===========================================================================
- align 16
+ %macro COPY_8_TO_16 1
- cglobal transfer_8to16copy_mmx
+   movq mm0, [eax]
- transfer_8to16copy_mmx
+   movq mm1, [eax+edx]
+   movq mm2, mm0
-                 push    esi
+   movq mm3, mm1
-                 push    edi
+   punpcklbw mm0, mm7
+   movq [ecx+%1*32], mm0
-                 mov     edi, [esp + 8 + 4]              ; dst
+   punpcklbw mm1, mm7
-                 mov     esi, [esp + 8 + 8]              ; src
+   movq [ecx+%1*32+16], mm1
-                 mov ecx, [esp + 8 + 12]         ; stride
+   punpckhbw mm2, mm7
+   punpckhbw mm3, mm7
-                 pxor mm7, mm7                           ; mm7 = zero
+   lea eax,[eax+2*edx]
+   movq [ecx+%1*32+8], mm2
-                 mov eax, 8
+   movq [ecx+%1*32+24], mm3
+ %endmacro
- .loop
-                 movq mm0, [esi]
-                 movq mm1, mm0
-                 punpcklbw mm0, mm7              ; mm01 = unpack([src])
-                 punpckhbw mm1, mm7
-                 movq [edi], mm0                 ; [dst] = mm01
-                 movq [edi + 8], mm1
-                 add edi, 16
+ align 16
-                 add esi, ecx
+ transfer_8to16copy_mmx:
-                 dec eax
-                 jnz .loop
-                 pop edi
+   mov ecx, [esp+ 4] ; Dst
-                 pop esi
+   mov eax, [esp+ 8] ; Src
+   mov edx, [esp+12] ; Stride
+   pxor mm7,mm7
+   COPY_8_TO_16 0
+   COPY_8_TO_16 1
+   COPY_8_TO_16 2
+   COPY_8_TO_16 3
                  ret
  ;===========================================================================
  ;
  ; void transfer_16to8copy_mmx(uint8_t * const dst,
-Line 105
+Line 132
  ;
  ;===========================================================================
- align 16
+ %macro COPY_16_TO_8 1
- cglobal transfer_16to8copy_mmx
+   movq mm0, [eax+%1*32]
- transfer_16to8copy_mmx
+   movq mm1, [eax+%1*32+8]
+   packuswb mm0, mm1
-                 push    esi
+   movq [ecx], mm0
-                 push    edi
+   movq mm2, [eax+%1*32+16]
+   movq mm3, [eax+%1*32+24]
-                 mov     edi, [esp + 8 + 4]              ; dst
+   packuswb mm2, mm3
-                 mov     esi, [esp + 8 + 8]              ; src
+   movq [ecx+edx], mm2
-                 mov ecx, [esp + 8 + 12]         ; stride
+ %endmacro
-                 mov eax, 8
- .loop
-                 movq mm0, [esi]
-                 packuswb mm0, [esi + 8]         ; mm0 = pack([src])
-                 movq [edi], mm0                         ; [dst] = mm0
-                 add esi, 16
-                 add edi, ecx
-                 dec eax
-                 jnz .loop
-                 pop edi
+ align 16
-                 pop esi
+ transfer_16to8copy_mmx:
+   mov ecx, [esp+ 4] ; Dst
+   mov eax, [esp+ 8] ; Src
+   mov edx, [esp+12] ; Stride
+   COPY_16_TO_8 0
+   lea ecx,[ecx+2*edx]
+   COPY_16_TO_8 1
+   lea ecx,[ecx+2*edx]
+   COPY_16_TO_8 2
+   lea ecx,[ecx+2*edx]
+   COPY_16_TO_8 3
                  ret
  ;===========================================================================
  ;
  ; void transfer_8to16sub_mmx(int16_t * const dct,
-Line 151
+Line 175
  ; * 02.12.2001  loop unrolled, code runs 10% faster now (Isibaar)
  ; * 30.11.2001  16 pixels are processed per iteration (Isibaar)
  ; * 30.11.2001  .text missing
- ; *     06.11.2001      initial version; (c)2001 peter ross <pross@cs.rmit.edu.au>
+ ; *     06.11.2001      initial version; (c)2001 peter ross <pross@xvid.org>
  ; *
  ; *************************************************************************/
- align 16
+ %macro COPY_8_TO_16_SUB 1
- cglobal transfer_8to16sub_mmx
+   movq mm0, [eax]      ; cur
- transfer_8to16sub_mmx
+   movq mm2, [eax+edx]
-                 push    esi
-                 push    edi
-                 push    ebx
-                 mov     edi, [esp + 12 + 4]             ; dct [out]
-                 mov     edx, [esp + 12 + 8]             ; cur [in/out]
-                 mov     esi, [esp + 12 + 12]            ; ref [in]
-                 mov ecx, [esp + 12 + 16]                ; stride [in]
-                 mov eax, edx                            ; cur -> eax
-                 mov ebx, esi                            ; ref -> ebx
-                 add eax, ecx                            ; cur + stride
-                 add ebx, ecx                            ; ref + stride
-                 shl ecx, 1
-                 pxor mm7, mm7                   ; mm7 = zero
-                 movq mm0, [edx]                 ; mm01 = [cur]
                  movq mm1, mm0
-                 punpcklbw mm0, mm7
-                 punpckhbw mm1, mm7
-                 movq mm4, [eax]
-                 movq mm5, mm4
-                 punpcklbw mm4, mm7
-                 punpckhbw mm5, mm7
-                 movq mm2, [esi]                 ; mm23 = [ref]
                  movq mm3, mm2
-                 movq mm6, [ebx]
-                 movq [edx], mm2                 ; [cur] = [ref]
-                 movq [eax], mm6
-                 punpcklbw mm2, mm7
-                 punpckhbw mm3, mm7
-                 psubsw mm0, mm2                 ; mm01 -= mm23
-                 movq mm2, mm6
-                 punpcklbw mm2, mm7
-                 punpckhbw mm6, mm7
-                 psubsw mm1, mm3
-                 psubsw mm4, mm2
-                 psubsw mm5, mm6
-                 movq [edi], mm0                 ; dct[] = mm01
-                 movq [edi + 8], mm1
-                 movq [edi + 16], mm4
-                 movq [edi + 24], mm5
-                 add edx, ecx
-                 add esi, ecx
-                 add eax, ecx
-                 add ebx, ecx
-                 movq mm0, [edx]                 ; mm01 = [cur]
-                 movq mm1, mm0
                  punpcklbw mm0, mm7
-                 punpckhbw mm1, mm7
-                 movq mm4, [eax]
-                 movq mm5, mm4
-                 punpcklbw mm4, mm7
-                 punpckhbw mm5, mm7
-                 movq mm2, [esi]                 ; mm23 = [ref]
-                 movq mm3, mm2
-                 movq mm6, [ebx]
-                 movq [edx], mm2                 ; [cur] = [ref]
-                 movq [eax], mm6
                  punpcklbw mm2, mm7
+   movq mm4, [ebx]      ; ref
+         punpckhbw mm1, mm7
                  punpckhbw mm3, mm7
+   movq mm5, [ebx+edx]  ; ref
-                 psubsw mm0, mm2                 ; mm01 -= mm23
+   movq mm6, mm4
+   movq [eax], mm4
-                 movq mm2, mm6
+   movq [eax+edx], mm5
+   punpcklbw mm4, mm7
-                 punpcklbw mm2, mm7
                  punpckhbw mm6, mm7
+   psubsw mm0, mm4
+   psubsw mm1, mm6
+   movq mm6, mm5
+   punpcklbw mm5, mm7
+   punpckhbw mm6, mm7
+   psubsw mm2, mm5
+   lea eax,[eax+2*edx]
+   psubsw mm3, mm6
+   lea ebx,[ebx+2*edx]
+   movq [ecx+%1*32+ 0], mm0 ; dst
+         movq [ecx+%1*32+ 8], mm1
+         movq [ecx+%1*32+16], mm2
+         movq [ecx+%1*32+24], mm3
+ %endmacro
-                 psubsw mm1, mm3
+ align 16
+ transfer_8to16sub_mmx:
-                 psubsw mm4, mm2
+   mov ecx, [esp  + 4] ; Dst
-                 psubsw mm5, mm6
+   mov eax, [esp  + 8] ; Cur
+   push ebx
+   mov ebx, [esp+4+12] ; Ref
+   mov edx, [esp+4+16] ; Stride
+   pxor mm7, mm7
-                 movq [edi + 32], mm0                    ; dct[] = mm01
+   COPY_8_TO_16_SUB 0
-                 movq [edi + 40], mm1
+   COPY_8_TO_16_SUB 1
-                 movq [edi + 48], mm4
+   COPY_8_TO_16_SUB 2
-                 movq [edi + 56], mm5
+   COPY_8_TO_16_SUB 3
-                 add edx, ecx
-                 add esi, ecx
-                 add eax, ecx
-                 add ebx, ecx
-                 movq mm0, [edx]                 ; mm01 = [cur]
+   pop ebx
-                 movq mm1, mm0
+   ret
-                 punpcklbw mm0, mm7
+ ;===========================================================================
-                 punpckhbw mm1, mm7
+ ;
+ ; void transfer_8to16sub2_mmx(int16_t * const dct,
+ ;                               uint8_t * const cur,
+ ;                               const uint8_t * ref1,
+ ;                               const uint8_t * ref2,
+ ;                               const uint32_t stride)
+ ;
+ ;===========================================================================
-                 movq mm4, [eax]
+ %macro COPY_8_TO_16_SUB2_MMX 1
-                 movq mm5, mm4
+   movq mm0, [eax]      ; cur
+   movq mm2, [eax+edx]
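+     ; mm4 <- (ref1+ref2) / 2   NOTE: no +1 rounding term is added below (paddusw + psrlw 1 truncates), unlike pavgb in transfer_8to16sub2_xmm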
+   movq mm4, [ebx]      ; ref1
+   movq mm1, [esi]      ; ref2
+   movq mm6, mm4
+   movq mm3, mm1
                  punpcklbw mm4, mm7
-                 punpckhbw mm5, mm7
+   punpcklbw mm1, mm7
+   punpckhbw mm6, mm7
-                 movq mm2, [esi]                 ; mm23 = [ref]
-                 movq mm3, mm2
-                 movq mm6, [ebx]
-                 movq [edx], mm2                 ; [cur] = [ref]
-                 movq [eax], mm6
-                 punpcklbw mm2, mm7
                  punpckhbw mm3, mm7
+   paddusw mm4, mm1
-                 psubsw mm0, mm2                 ; mm01 -= mm23
+   paddusw mm6, mm3
+   psrlw mm4,1
-                 movq mm2, mm6
+   psrlw mm6,1
+   packuswb mm4, mm6
-                 punpcklbw mm2, mm7
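+     ; mm5 <- (ref1+ref2) / 2   NOTE: no +1 rounding term is added below (paddusw + psrlw 1 truncates), unlike pavgb in transfer_8to16sub2_xmm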
+   movq mm5, [ebx+edx]  ; ref1
+   movq mm1, [esi+edx]  ; ref2
+   movq mm6, mm5
+   movq mm3, mm1
+   punpcklbw mm5, mm7
+   punpcklbw mm1, mm7
                  punpckhbw mm6, mm7
+   punpckhbw mm3, mm7
+   paddusw mm5, mm1
+   paddusw mm6, mm3
+   lea esi,[esi+2*edx]
+   psrlw mm5,1
+   psrlw mm6,1
+   packuswb mm5, mm6
-                 psubsw mm1, mm3
-                 psubsw mm4, mm2
-                 psubsw mm5, mm6
-                 movq [edi + 64], mm0                    ; dct[] = mm01
-                 movq [edi + 72], mm1
-                 movq [edi + 80], mm4
-                 movq [edi + 88], mm5
-                 add edx, ecx
-                 add esi, ecx
-                 add eax, ecx
-                 add ebx, ecx
-                 movq mm0, [edx]                 ; mm01 = [cur]
                  movq mm1, mm0
-                 punpcklbw mm0, mm7
-                 punpckhbw mm1, mm7
-                 movq mm4, [eax]
-                 movq mm5, mm4
-                 punpcklbw mm4, mm7
-                 punpckhbw mm5, mm7
-                 movq mm2, [esi]                 ; mm23 = [ref]
                  movq mm3, mm2
+   punpcklbw mm0, mm7
-                 movq mm6, [ebx]
-                 movq [edx], mm2                 ; [cur] = [ref]
-                 movq [eax], mm6
                  punpcklbw mm2, mm7
+         punpckhbw mm1, mm7
                  punpckhbw mm3, mm7
-                 psubsw mm0, mm2                 ; mm01 -= mm23
+   movq mm6, mm4
+   punpcklbw mm4, mm7
-                 movq mm2, mm6
-                 punpcklbw mm2, mm7
                  punpckhbw mm6, mm7
+   psubsw mm0, mm4
+   psubsw mm1, mm6
+   movq mm6, mm5
+   punpcklbw mm5, mm7
+   punpckhbw mm6, mm7
+   psubsw mm2, mm5
+   lea eax,[eax+2*edx]
+   psubsw mm3, mm6
+   lea ebx,[ebx+2*edx]
+   movq [ecx+%1*32+ 0], mm0 ; dst
+         movq [ecx+%1*32+ 8], mm1
+         movq [ecx+%1*32+16], mm2
+         movq [ecx+%1*32+24], mm3
+ %endmacro
-                 psubsw mm1, mm3
+ align 16
+ transfer_8to16sub2_mmx:
-                 psubsw mm4, mm2
+   mov ecx, [esp  + 4] ; Dst
-                 psubsw mm5, mm6
+   mov eax, [esp  + 8] ; Cur
+   push ebx
+   mov ebx, [esp+4+12] ; Ref1
+   push esi
+   mov esi, [esp+8+16] ; Ref2
+   mov edx, [esp+8+20] ; Stride
+   pxor mm7, mm7
-                 movq [edi + 96], mm0                    ; dct[] = mm01
+   COPY_8_TO_16_SUB2_MMX 0
-                 movq [edi + 104], mm1
+   COPY_8_TO_16_SUB2_MMX 1
-                 movq [edi + 112], mm4
+   COPY_8_TO_16_SUB2_MMX 2
-                 movq [edi + 120], mm5
+   COPY_8_TO_16_SUB2_MMX 3
-                 pop ebx
-                 pop edi
                  pop esi
+   pop ebx
                  ret
  ;===========================================================================
  ;
  ; void transfer_8to16sub2_xmm(int16_t * const dct,
  ;                                                         uint8_t * const cur,
  ;                                                         const uint8_t * ref1,
  ;                                                         const uint8_t * ref2,
- ;                                                         const uint32_t stride);
+ ;                               const uint32_t stride)
  ;
  ;===========================================================================
- align 16
+ %macro COPY_8_TO_16_SUB2_SSE 1
- cglobal transfer_8to16sub2_xmm
+   movq mm0, [eax]      ; cur
- transfer_8to16sub2_xmm
+   movq mm2, [eax+edx]
+   movq mm1, mm0
-                 push edi
+   movq mm3, mm2
-                 push esi
-                 push ebx
-                 mov edi, [esp + 12 +  4] ; edi = &dct
-                 mov esi, [esp + 12 +  8] ; esi = &cur
-                 mov ebx, [esp + 12 + 12] ; ebx = &ref1
-                 mov edx, [esp + 12 + 16] ; edx = &ref2
-                 mov eax, [esp + 12 + 20] ; eax = stride
-                 pxor mm7, mm7   ; mm7 = 0
-                 shl eax, 1      ; eax = stride<<1
-                 ; Row processing
-                 ; One row at a time
-                 movq mm0, [esi + 0] ; mm0 = cur row
-                 movq mm2, [ebx + 0]     ; mm2 = ref1 row
-                 movq mm3, [edx + 0]     ; mm3 = ref2 row
-                 movq mm1, mm0   ; mm1 = cur row
-                 pavgb mm2, mm3          ; mm2 = (ref1 + ref2 + 1)/2 (== avg)
-                 punpcklbw mm0, mm7      ; mm0 = cur(3-0) <-> 16bit
-                 movq mm3,mm2            ; mm3 = avg
-                 punpckhbw mm1, mm7      ; mm1 = cur(7-4) <-> 16bit
-                 punpcklbw mm2, mm7      ; mm2 = avg(3-0) <-> 16bit
-                 punpckhbw mm3, mm7      ; mm3 = avg(7-4) <-> 16bit
-                 psubw mm0, mm2          ; mm0 = cur(3-0) - avg(3-0)
-                 psubw mm1, mm3          ; mm1 = cur(7-4) - avg(7-4)
-                 movq [edi + 0], mm0 ; dct(3-0) = mm0
-                 movq [edi + 8], mm1 ; dct(7-4) = mm1
-                 ; Increment all pointers
-                 add edi, eax    ; edi = &(next dct row)
-                 ; Row processing
-                 ; One row at a time
-                 movq mm0, [esi + 8] ; mm0 = cur row
-                 movq mm2, [ebx + 8]     ; mm2 = ref1 row
-                 movq mm3, [edx + 8]     ; mm3 = ref2 row
-                 movq mm1, mm0   ; mm1 = cur row
-                 pavgb mm2, mm3          ; mm2 = (ref1 + ref2 + 1)/2 (== avg)
-                 punpcklbw mm0, mm7      ; mm0 = cur(3-0) <-> 16bit
-                 movq mm3,mm2            ; mm3 = avg
-                 punpckhbw mm1, mm7      ; mm1 = cur(7-4) <-> 16bit
-                 punpcklbw mm2, mm7      ; mm2 = avg(3-0) <-> 16bit
-                 punpckhbw mm3, mm7      ; mm3 = avg(7-4) <-> 16bit
-                 psubw mm0, mm2          ; mm0 = cur(3-0) - avg(3-0)
-                 psubw mm1, mm3          ; mm1 = cur(7-4) - avg(7-4)
-                 movq [edi + 0], mm0 ; dct(3-0) = mm0
-                 movq [edi + 8], mm1 ; dct(7-4) = mm1
-                 ; Increment all pointers
-                 add edi, eax    ; edi = &(next dct row)
-                 ; Row processing
-                 ; One row at a time
-                 movq mm0, [esi + 16] ; mm0 = cur row
-                 movq mm2, [ebx + 16]    ; mm2 = ref1 row
-                 movq mm3, [edx + 16]    ; mm3 = ref2 row
-                 movq mm1, mm0   ; mm1 = cur row
-                 pavgb mm2, mm3          ; mm2 = (ref1 + ref2 + 1)/2 (== avg)
-                 punpcklbw mm0, mm7      ; mm0 = cur(3-0) <-> 16bit
-                 movq mm3,mm2            ; mm3 = avg
-                 punpckhbw mm1, mm7      ; mm1 = cur(7-4) <-> 16bit
-                 punpcklbw mm2, mm7      ; mm2 = avg(3-0) <-> 16bit
-                 punpckhbw mm3, mm7      ; mm3 = avg(7-4) <-> 16bit
-                 psubw mm0, mm2          ; mm0 = cur(3-0) - avg(3-0)
-                 psubw mm1, mm3          ; mm1 = cur(7-4) - avg(7-4)
-                 movq [edi + 0], mm0 ; dct(3-0) = mm0
-                 movq [edi + 8], mm1 ; dct(7-4) = mm1
-                 ; Increment all pointers
-                 add edi, eax    ; edi = &(next dct row)
-                 ; Row processing
-                 ; One row at a time
-                 movq mm0, [esi + 24] ; mm0 = cur row
-                 movq mm2, [ebx + 24]    ; mm2 = ref1 row
-                 movq mm3, [edx + 24]    ; mm3 = ref2 row
-                 movq mm1, mm0   ; mm1 = cur row
-                 pavgb mm2, mm3          ; mm2 = (ref1 + ref2 + 1)/2 (== avg)
-                 punpcklbw mm0, mm7      ; mm0 = cur(3-0) <-> 16bit
-                 movq mm3,mm2            ; mm3 = avg
-                 punpckhbw mm1, mm7      ; mm1 = cur(7-4) <-> 16bit
-                 punpcklbw mm2, mm7      ; mm2 = avg(3-0) <-> 16bit
-                 punpckhbw mm3, mm7      ; mm3 = avg(7-4) <-> 16bit
-                 psubw mm0, mm2          ; mm0 = cur(3-0) - avg(3-0)
-                 psubw mm1, mm3          ; mm1 = cur(7-4) - avg(7-4)
-                 movq [edi + 0], mm0 ; dct(3-0) = mm0
-                 movq [edi + 8], mm1 ; dct(7-4) = mm1
-                 ; Increment all pointers
-                 add edi, eax    ; edi = &(next dct row)
-                 ; Row processing
-                 ; One row at a time
-                 movq mm0, [esi + 32] ; mm0 = cur row
-                 movq mm2, [ebx + 32]    ; mm2 = ref1 row
-                 movq mm3, [edx + 32]    ; mm3 = ref2 row
-                 movq mm1, mm0   ; mm1 = cur row
-                 pavgb mm2, mm3          ; mm2 = (ref1 + ref2 + 1)/2 (== avg)
-                 punpcklbw mm0, mm7      ; mm0 = cur(3-0) <-> 16bit
-                 movq mm3,mm2            ; mm3 = avg
-                 punpckhbw mm1, mm7      ; mm1 = cur(7-4) <-> 16bit
-                 punpcklbw mm2, mm7      ; mm2 = avg(3-0) <-> 16bit
-                 punpckhbw mm3, mm7      ; mm3 = avg(7-4) <-> 16bit
-                 psubw mm0, mm2          ; mm0 = cur(3-0) - avg(3-0)
-                 psubw mm1, mm3          ; mm1 = cur(7-4) - avg(7-4)
-                 movq [edi + 0], mm0 ; dct(3-0) = mm0
-                 movq [edi + 8], mm1 ; dct(7-4) = mm1
-                 ; Increment all pointers
-                 add edi, eax    ; edi = &(next dct row)
-                 ; Row processing
-                 ; One row at a time
-                 movq mm0, [esi + 40] ; mm0 = cur row
-                 movq mm2, [ebx + 40]    ; mm2 = ref1 row
-                 movq mm3, [edx + 40]    ; mm3 = ref2 row
-                 movq mm1, mm0   ; mm1 = cur row
-                 pavgb mm2, mm3          ; mm2 = (ref1 + ref2 + 1)/2 (== avg)
-                 punpcklbw mm0, mm7      ; mm0 = cur(3-0) <-> 16bit
-                 movq mm3,mm2            ; mm3 = avg
-                 punpckhbw mm1, mm7      ; mm1 = cur(7-4) <-> 16bit
-                 punpcklbw mm2, mm7      ; mm2 = avg(3-0) <-> 16bit
-                 punpckhbw mm3, mm7      ; mm3 = avg(7-4) <-> 16bit
-                 psubw mm0, mm2          ; mm0 = cur(3-0) - avg(3-0)
-                 psubw mm1, mm3          ; mm1 = cur(7-4) - avg(7-4)
-                 movq [edi + 0], mm0 ; dct(3-0) = mm0
-                 movq [edi + 8], mm1 ; dct(7-4) = mm1
-                 ; Increment all pointers
-                 add edi, eax    ; edi = &(next dct row)
-                 ; Row processing
-                 ; One row at a time
-                 movq mm0, [esi + 48] ; mm0 = cur row
-                 movq mm2, [ebx + 48]    ; mm2 = ref1 row
-                 movq mm3, [edx + 48]    ; mm3 = ref2 row
-                 movq mm1, mm0   ; mm1 = cur row
-                 pavgb mm2, mm3          ; mm2 = (ref1 + ref2 + 1)/2 (== avg)
-                 punpcklbw mm0, mm7      ; mm0 = cur(3-0) <-> 16bit
-                 movq mm3,mm2            ; mm3 = avg
-                 punpckhbw mm1, mm7      ; mm1 = cur(7-4) <-> 16bit
-                 punpcklbw mm2, mm7      ; mm2 = avg(3-0) <-> 16bit
-                 punpckhbw mm3, mm7      ; mm3 = avg(7-4) <-> 16bit
-                 psubw mm0, mm2          ; mm0 = cur(3-0) - avg(3-0)
-                 psubw mm1, mm3          ; mm1 = cur(7-4) - avg(7-4)
-                 movq [edi + 0], mm0 ; dct(3-0) = mm0
-                 movq [edi + 8], mm1 ; dct(7-4) = mm1
-                 ; Increment all pointers
-                 add edi, eax    ; edi = &(next dct row)
-                 ; Row processing
-                 ; One row at a time
-                 movq mm0, [esi + 56] ; mm0 = cur row
-                 movq mm2, [ebx + 56]    ; mm2 = ref1 row
-                 movq mm3, [edx + 56]    ; mm3 = ref2 row
-                 movq mm1, mm0   ; mm1 = cur row
-                 pavgb mm2, mm3          ; mm2 = (ref1 + ref2 + 1)/2 (== avg)
-                 punpcklbw mm0, mm7      ; mm0 = cur(3-0) <-> 16bit
-                 movq mm3,mm2            ; mm3 = avg
-                 punpckhbw mm1, mm7      ; mm1 = cur(7-4) <-> 16bit
-                 punpcklbw mm2, mm7      ; mm2 = avg(3-0) <-> 16bit
+   punpcklbw mm0, mm7
-                 punpckhbw mm3, mm7      ; mm3 = avg(7-4) <-> 16bit
+   punpcklbw mm2, mm7
+   movq mm4, [ebx]      ; ref1
+   pavgb mm4, [esi]     ; ref2
+         punpckhbw mm1, mm7
+         punpckhbw mm3, mm7
+   movq mm5, [ebx+edx]  ; ref1
+   pavgb mm5, [esi+edx] ; ref2
-                 psubw mm0, mm2          ; mm0 = cur(3-0) - avg(3-0)
+   movq mm6, mm4
-                 psubw mm1, mm3          ; mm1 = cur(7-4) - avg(7-4)
+   punpcklbw mm4, mm7
+   punpckhbw mm6, mm7
+   psubsw mm0, mm4
+   psubsw mm1, mm6
+   lea esi,[esi+2*edx]
+   movq mm6, mm5
+   punpcklbw mm5, mm7
+   punpckhbw mm6, mm7
+   psubsw mm2, mm5
+   lea eax,[eax+2*edx]
+   psubsw mm3, mm6
+   lea ebx,[ebx+2*edx]
+   movq [ecx+%1*32+ 0], mm0 ; dst
+         movq [ecx+%1*32+ 8], mm1
+         movq [ecx+%1*32+16], mm2
+         movq [ecx+%1*32+24], mm3
+ %endmacro
-                 movq [edi + 0], mm0 ; dct(3-0) = mm0
+ align 16
-                 movq [edi + 8], mm1 ; dct(7-4) = mm1
+ transfer_8to16sub2_xmm:
+   mov ecx, [esp  + 4] ; Dst
+   mov eax, [esp  + 8] ; Cur
+   push ebx
+   mov ebx, [esp+4+12] ; Ref1
+   push esi
+   mov esi, [esp+8+16] ; Ref2
+   mov edx, [esp+8+20] ; Stride
+   pxor mm7, mm7
-                 ; Exit
+   COPY_8_TO_16_SUB2_SSE 0
+   COPY_8_TO_16_SUB2_SSE 1
+   COPY_8_TO_16_SUB2_SSE 2
+   COPY_8_TO_16_SUB2_SSE 3
-                 pop ebx
                  pop esi
-                 pop edi
+   pop ebx
                  ret
  ;===========================================================================
-Line 598
+Line 395
  ;
  ;===========================================================================
- align 16
+ %macro COPY_16_TO_8_ADD 1
- cglobal transfer_16to8add_mmx
+   movq mm0, [ecx]
- transfer_16to8add_mmx
+   movq mm2, [ecx+edx]
-                 push    esi
-                 push    edi
-                 mov     edi, [esp + 8 + 4]              ; dst
-                 mov     esi, [esp + 8 + 8]              ; src
-                 mov ecx, [esp + 8 + 12]         ; stride
-                 pxor mm7, mm7
-                 mov eax, 8
- .loop
-                 movq mm0, [edi]
                  movq mm1, mm0
-                 punpcklbw mm0, mm7              ; mm23 = unpack([dst])
+   movq mm3, mm2
+   punpcklbw mm0, mm7
+   punpcklbw mm2, mm7
                  punpckhbw mm1, mm7
+   punpckhbw mm3, mm7
+   paddsw mm0, [eax+%1*32+ 0]
+   paddsw mm1, [eax+%1*32+ 8]
+   paddsw mm2, [eax+%1*32+16]
+   paddsw mm3, [eax+%1*32+24]
+   packuswb mm0, mm1
+   movq [ecx], mm0
+   packuswb mm2, mm3
+   movq [ecx+edx], mm2
+ %endmacro
-                 movq mm2, [esi]                 ; mm01 = [src]
-                 movq mm3, [esi + 8]
-                 paddsw mm0, mm2                 ; mm01 += mm23
-                 paddsw mm1, mm3
-                 packuswb mm0, mm1               ; [dst] = pack(mm01)
-                 movq [edi], mm0
-                 add esi, 16
-                 add edi, ecx
-                 dec eax
-                 jnz .loop
-                 pop edi
+ align 16
-                 pop esi
+ transfer_16to8add_mmx:
+   mov ecx, [esp+ 4] ; Dst
+   mov eax, [esp+ 8] ; Src
+   mov edx, [esp+12] ; Stride
+   pxor mm7, mm7
+   COPY_16_TO_8_ADD 0
+   lea ecx,[ecx+2*edx]
+   COPY_16_TO_8_ADD 1
+   lea ecx,[ecx+2*edx]
+   COPY_16_TO_8_ADD 2
+   lea ecx,[ecx+2*edx]
+   COPY_16_TO_8_ADD 3
                  ret
  ;===========================================================================
  ;
  ; void transfer8x8_copy_mmx(uint8_t * const dst,
-Line 648
+Line 440
  ;
  ;===========================================================================
- align 16
+ %macro COPY_8_TO_8 0
- cglobal transfer8x8_copy_mmx
+   movq mm0, [eax]
- transfer8x8_copy_mmx
+   movq mm1, [eax+edx]
-                 push    esi
+   movq [ecx], mm0
-                 push    edi
+   lea eax,[eax+2*edx]
+   movq [ecx+edx], mm1
-                 mov     edi, [esp + 8 + 4]              ; dst [out]
+ %endmacro
-                 mov     esi, [esp + 8 + 8]              ; src [in]
-                 mov eax, [esp + 8 + 12]         ; stride [in]
-                 movq mm0, [esi]
-                 movq mm1, [esi+eax]
-                 movq [edi], mm0
-                 movq [edi+eax], mm1
-                 add esi, eax
-                 add edi, eax
-                 add esi, eax
-                 add edi, eax
-                 movq mm0, [esi]
-                 movq mm1, [esi+eax]
-                 movq [edi], mm0
-                 movq [edi+eax], mm1
-                 add esi, eax
-                 add edi, eax
-                 add esi, eax
-                 add edi, eax
-                 movq mm0, [esi]
-                 movq mm1, [esi+eax]
-                 movq [edi], mm0
-                 movq [edi+eax], mm1
-                 add esi, eax
-                 add edi, eax
-                 add esi, eax
-                 add edi, eax
-                 movq mm0, [esi]
-                 movq mm1, [esi+eax]
-                 movq [edi], mm0
-                 movq [edi+eax], mm1
-                 add esi, eax
-                 add edi, eax
-                 add esi, eax
-                 add edi, eax
-                 pop edi
-                 pop esi
+ align 16
+ transfer8x8_copy_mmx:
+   mov ecx, [esp+ 4] ; Dst
+   mov eax, [esp+ 8] ; Src
+   mov edx, [esp+12] ; Stride
+   COPY_8_TO_8
+   lea ecx,[ecx+2*edx]
+   COPY_8_TO_8
+   lea ecx,[ecx+2*edx]
+   COPY_8_TO_8
+   lea ecx,[ecx+2*edx]
+   COPY_8_TO_8
                  ret

 Legend:
-Removed from v.216
+Added in v.654
 (changed lines appear as a removed line followed by its added replacement)

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4