[svn] / trunk / xvidcore / src / motion / x86_asm / sad_sse2.asm Repository:
ViewVC logotype

View of /trunk/xvidcore/src/motion/x86_asm/sad_sse2.asm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1839 - (download) (annotate)
Tue Dec 2 13:44:55 2008 UTC (15 years, 3 months ago) by Isibaar
File size: 4361 byte(s)
WIN64 XMM6/XMM7 bench and asm optimization patch by Andrew Dunstan
;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - SSE2 optimized SAD operators -
; *
; *  Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
; *
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: sad_sse2.asm,v 1.18 2008-12-02 13:44:55 Isibaar Exp $
; *
; ***************************************************************************/

%include "nasm.inc"

;=============================================================================
; Read only data
;=============================================================================

DATA

ALIGN SECTION_ALIGN
zero    times 4   dd 0

;=============================================================================
; Code
;=============================================================================

SECTION .rotext align=SECTION_ALIGN

cglobal  sad16_sse2
cglobal  dev16_sse2

cglobal  sad16_sse3
cglobal  dev16_sse3

;-----------------------------------------------------------------------------
; uint32_t sad16_sse2 (const uint8_t * const cur, <- assumed aligned!
;                      const uint8_t * const ref,
;	                   const uint32_t stride,
;                      const uint32_t /*ignored*/);
;-----------------------------------------------------------------------------


%macro SAD_16x16_SSE2 1
  %1  xmm0, [TMP1]
  %1  xmm1, [TMP1+TMP0]
  lea TMP1,[TMP1+2*TMP0]
  movdqa  xmm2, [_EAX]
  movdqa  xmm3, [_EAX+TMP0]
  lea _EAX,[_EAX+2*TMP0]
  psadbw  xmm0, xmm2
  paddusw xmm4,xmm0
  psadbw  xmm1, xmm3
  paddusw xmm4,xmm1
%endmacro

%macro SAD16_SSE2_SSE3 1
  mov _EAX, prm1 ; cur (assumed aligned)
  mov TMP1, prm2 ; ref
  mov TMP0, prm3 ; stride

  pxor xmm4, xmm4 ; accum

  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1

  pshufd  xmm5, xmm4, 00000010b
  paddusw xmm4, xmm5
  pextrw  eax, xmm4, 0

  ret
%endmacro

ALIGN SECTION_ALIGN
sad16_sse2:
  SAD16_SSE2_SSE3 movdqu
ENDFUNC


ALIGN SECTION_ALIGN
sad16_sse3:
  SAD16_SSE2_SSE3 lddqu
ENDFUNC


;-----------------------------------------------------------------------------
; uint32_t dev16_sse2(const uint8_t * const cur, const uint32_t stride);
;-----------------------------------------------------------------------------

%macro MEAN_16x16_SSE2 1  ; _EAX: src, TMP0:stride, mm7: zero or mean => mm6: result
  %1 xmm0, [_EAX]
  %1 xmm1, [_EAX+TMP0]
  lea _EAX, [_EAX+2*TMP0]    ; + 2*stride
  psadbw xmm0, xmm5
  paddusw xmm4, xmm0
  psadbw xmm1, xmm5
  paddusw xmm4, xmm1
%endmacro


%macro MEAN16_SSE2_SSE3 1
  mov _EAX, prm1   ; src
  mov TMP0, prm2   ; stride

  pxor xmm4, xmm4     ; accum
  pxor xmm5, xmm5     ; zero

  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1

  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1

  mov _EAX, prm1       ; src again

  pshufd   xmm5, xmm4, 10b
  paddusw  xmm5, xmm4
  pxor     xmm4, xmm4     ; zero accum
  psrlw    xmm5, 8        ; => Mean
  pshuflw  xmm5, xmm5, 0  ; replicate Mean
  packuswb xmm5, xmm5
  pshufd   xmm5, xmm5, 00000000b

  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1

  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1

  pshufd   xmm5, xmm4, 10b
  paddusw  xmm5, xmm4
  pextrw eax, xmm5, 0

  ret
%endmacro

ALIGN SECTION_ALIGN
dev16_sse2:
  MEAN16_SSE2_SSE3 movdqu
ENDFUNC

ALIGN SECTION_ALIGN
dev16_sse3:
  MEAN16_SSE2_SSE3 lddqu
ENDFUNC

 
%ifidn __OUTPUT_FORMAT__,elf
section ".note.GNU-stack" noalloc noexec nowrite progbits
%endif


No admin address has been configured
ViewVC Help
Powered by ViewVC 1.0.4