;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - K7 optimized SAD operators -
; *
; *  Copyright(C) 2001 Peter Ross
; *               2002 Pascal Massimino
; *               2004 Andre Werthmann
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
; *
; * $Id: sad_mmx.asm,v 1.1 2005-01-05 23:02:15 edgomez Exp $
; *
; ***************************************************************************/

BITS 64

;-----------------------------------------------------------------------------
; cglobal <name>
;
; Exports <name> as a global symbol.
;   PREFIX     - when defined, the exported symbol gets a leading underscore
;                and <name> is redefined to expand to the prefixed form.
;   MARK_FUNCS - when defined, the symbol is exported with the ":function"
;                type and a size expression "<name>.endfunc - <name>", so
;                every exported routine must end with a local ".endfunc"
;                label.
;
; NOTE(review): in the PREFIX+MARK_FUNCS branch <name> is redefined to a text
; that contains the ":function ..." suffix; that combination looks unusable
; as a label definition - confirm whether it is ever built that way.
;-----------------------------------------------------------------------------

%macro cglobal 1
    %ifdef PREFIX
        %ifdef MARK_FUNCS
            global _%1:function %1.endfunc-%1
            %define %1 _%1:function %1.endfunc-%1
        %else
            global _%1
            %define %1 _%1
        %endif
    %else
        %ifdef MARK_FUNCS
            global %1:function %1.endfunc-%1
        %else
            global %1
        %endif
    %endif
%endmacro

;=============================================================================
; Read only data
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

;=============================================================================
; Code
;=============================================================================

SECTION .text align=16

cglobal sse8_16bit_x86_64
cglobal sse8_8bit_x86_64

;-----------------------------------------------------------------------------
;
; uint32_t sse8_16bit_x86_64(const int16_t *b1,
;                            const int16_t *b2,
;                            const uint32_t stride);
;
; Sum of squared differences over an 8x8 block of 16-bit samples.
;
;   rdi = b1, rsi = b2, edx = stride
;   stride is the distance in BYTES between two consecutive rows (it is
;   added directly to both pointers); each row reads 16 bytes per block.
;
; Returns the 32-bit sum in eax.
;
; Notes:
;   - Differences are formed with wrapping 16-bit subtraction (psubw) and
;     accumulated in 32-bit lanes (paddd).
;   - Modifies rdi, rsi, rdx and mm0-mm3.
;   - No EMMS is executed here; the MMX state is left as is on return.
;
;-----------------------------------------------------------------------------

; Accumulate into mm2 the squared differences of one row of eight 16-bit
; samples.  %1 and %2 are the registers holding the two row addresses.
%macro ROW_SSE_16Bit_MMX 2
    movq    mm0, [%1]           ; samples 0..3 of the first row
    movq    mm1, [%1+8]         ; samples 4..7 of the first row
    psubw   mm0, [%2]           ; difference, samples 0..3
    psubw   mm1, [%2+8]         ; difference, samples 4..7
    pmaddwd mm0, mm0            ; square and add adjacent pairs -> 2 dwords
    pmaddwd mm1, mm1
    paddd   mm2, mm0            ; add to the accumulator
    paddd   mm2, mm1
%endmacro

sse8_16bit_x86_64:

    ;; stride is a 32-bit argument: the upper half of rdx is not guaranteed
    ;; to be zero on entry, so zero-extend it before using rdx in addresses.
    mov     edx, edx

    ;; Reset the sse accumulator
    pxor    mm2, mm2

    ;; Process the eight rows
%rep 8
    ROW_SSE_16Bit_MMX rsi, rdi
    lea     rsi, [rsi+rdx]      ; next row of b2
    lea     rdi, [rdi+rdx]      ; next row of b1
%endrep

    ;; Fold the two dword partial sums of the accumulator together
    movq    mm3, mm2
    psrlq   mm2, 32
    paddd   mm2, mm3
    movd    eax, mm2

    ;; All done
    ret
.endfunc:

;-----------------------------------------------------------------------------
;
; uint32_t sse8_8bit_x86_64(const uint8_t *b1,
;                           const uint8_t *b2,
;                           const uint32_t stride);
;
; Sum of squared differences over an 8x8 block of 8-bit samples.
;
;   rdi = b1, rsi = b2, edx = stride
;   stride is the distance in bytes between two consecutive rows; each row
;   reads 8 bytes per block.
;
; Returns the 32-bit sum in eax.
;
; Notes:
;   - Bytes are widened by interleaving with zero, i.e. they are treated as
;     unsigned values in 0..255.
;   - Modifies rdi, rsi, rdx and mm0-mm3, mm6, mm7.
;   - No EMMS is executed here; the MMX state is left as is on return.
;
;-----------------------------------------------------------------------------

; Accumulate into mm6 the squared differences of one row of eight bytes.
; %1 and %2 are the registers holding the two row addresses; mm7 must be 0.
%macro ROW_SSE_8bit_MMX 2
    movq      mm0, [%1]         ; load a row
    movq      mm2, [%2]         ; load a row

    movq      mm1, mm0          ; copy row
    movq      mm3, mm2          ; copy row

    punpcklbw mm0, mm7          ; turn the 4 low elements into 16bit
    punpckhbw mm1, mm7          ; turn the 4 high elements into 16bit

    punpcklbw mm2, mm7          ; turn the 4 low elements into 16bit
    punpckhbw mm3, mm7          ; turn the 4 high elements into 16bit

    psubw     mm0, mm2          ; low part of the difference
    psubw     mm1, mm3          ; high part of the difference

    pmaddwd   mm0, mm0          ; compute the square sum
    pmaddwd   mm1, mm1          ; compute the square sum

    paddd     mm6, mm0          ; add to the accumulator
    paddd     mm6, mm1          ; add to the accumulator
%endmacro

sse8_8bit_x86_64:

    ;; stride is a 32-bit argument: the upper half of rdx is not guaranteed
    ;; to be zero on entry, so zero-extend it before using rdx in addresses.
    mov     edx, edx

    ;; Reset the sse accumulator
    pxor    mm6, mm6

    ;; Used to interleave 8bit data with 0x00 values
    pxor    mm7, mm7

    ;; Process the eight rows
%rep 8
    ROW_SSE_8bit_MMX rsi, rdi
    lea     rsi, [rsi+rdx]      ; next row of b2
    lea     rdi, [rdi+rdx]      ; next row of b1
%endrep

    ;; Fold the two dword partial sums of the accumulator together
    movq    mm7, mm6
    psrlq   mm6, 32
    paddd   mm6, mm7
    movd    eax, mm6

    ;; All done
    ret
.endfunc: