--- trunk/xvidcore/src/motion/x86_asm/sad_mmx.asm	2002/07/07 09:45:40	262
+++ trunk/xvidcore/src/motion/x86_asm/sad_mmx.asm	2002/09/06 16:59:47	430
@@ -1,37 +1,39 @@
-;/**************************************************************************
+;/*****************************************************************************
 ; *
-; * XVID MPEG-4 VIDEO CODEC
-; * mmx sum of absolute difference
+; * XVID MPEG-4 VIDEO CODEC
+; * mmx sum of absolute difference
 ; *
-; * This program is free software; you can redistribute it and/or modify
-; * it under the terms of the GNU General Public License as published by
-; * the Free Software Foundation; either version 2 of the License, or
-; * (at your option) any later version.
+; * Copyright(C) 2002 Peter Ross
 ; *
-; * This program is distributed in the hope that it will be useful,
-; * but WITHOUT ANY WARRANTY; without even the implied warranty of
-; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; * GNU General Public License for more details.
+; * This program is an implementation of a part of one or more MPEG-4
+; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
+; * to use this software module in hardware or software products are
+; * advised that its use may infringe existing patents or copyrights, and
+; * any such use would be at such party's own risk. The original
+; * developer of this software module and his/her company, and subsequent
+; * editors and their companies, will have no liability for use of this
+; * software or modifications or derivatives thereof.
 ; *
-; * You should have received a copy of the GNU General Public License
-; * along with this program; if not, write to the Free Software
-; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+; * This program is free software; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 2 of the License, or
+; * (at your option) any later version.
 ; *
-; *************************************************************************/
-
-;/**************************************************************************
-; *
-; * History:
+; * This program is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
 ; *
-; * 23.07.2002	sad[16,8]bi_xmm;
-; * 04.06.2002	cleanup -Skal-
-; * 12.11.2001	inital version; (c)2001 peter ross
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ; *
-; *************************************************************************/
+; ****************************************************************************/
 
 bits 32
 %macro cglobal 1
+
 	%ifdef PREFIX
 		global _%1
 		%define %1 _%1
 	%else
@@ -57,6 +59,8 @@
 ; uint32_t sad16_mmx(const uint8_t * const cur,
 ;                    const uint8_t * const ref,
 ;                    const uint32_t stride,
+;                    const uint32_t best_sad);
+;
 ; (early termination ignore; slows this down)
 ;
 ;===========================================================================
@@ -206,6 +210,162 @@
 ;
 ; uint32_t sad16bi_mmx(const uint8_t * const cur,
 ;                      const uint8_t * const ref1,
+;                      const uint8_t * const ref2,
+;                      const uint32_t stride);
+;
+;===========================================================================
+%macro SADBI_16x16_MMX 2	; SADBI_16x16_MMX( int_ptr_offset, bool_increment_ptr );
+
+	movq mm0, [edx+%1]
+	movq mm2, [ebx+%1]
+	movq mm1, mm0
+	movq mm3, mm2
+
+%if %2 != 0
+	add edx, ecx
+%endif
+
+	punpcklbw mm0, mm7
+	punpckhbw mm1, mm7
+punpcklbw mm2, mm7
+punpckhbw mm3, mm7
+
+%if %2 != 0
+	add ebx, ecx
+%endif
+
+paddusw mm0, mm2	; mm01 = ref1 + ref2
+paddusw mm1, mm3
+paddusw mm0, [mmx_one]	; mm01 += 1
+paddusw mm1, [mmx_one]
+psrlw mm0, 1	; mm01 >>= 1
+psrlw mm1, 1
+
+	movq mm2, [eax+%1]
+	movq mm3, mm2
+	punpcklbw mm2, mm7	; mm23 = src
+	punpckhbw mm3, mm7
+
+%if %2 != 0
+	add eax, ecx
+%endif
+
+	movq mm4, mm0
+	movq mm5, mm1
+	psubusw mm0, mm2
+	psubusw mm1, mm3
+	psubusw mm2, mm4
+	psubusw mm3, mm5
+	por mm0, mm2	; mm01 = ABS(mm01 - mm23)
+	por mm1, mm3
+
+	paddusw mm6,mm0	; mm6 += mm01
+	paddusw mm6,mm1
+
+%endmacro
+
+align 16
+sad16bi_mmx:
+	push ebx
+	mov eax, [esp+4+ 4]	; Src
+	mov edx, [esp+4+ 8]	; Ref1
+	mov ebx, [esp+4+12]	; Ref2
+	mov ecx, [esp+4+16]	; Stride
+
+	pxor mm6, mm6	; accum2
+pxor mm7, mm7
+.Loop
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+
+	pmaddwd mm6, [mmx_one]	; collapse
+	movq mm7, mm6
+	psrlq mm7, 32
+	paddd mm6, mm7
+
+	movd eax, mm6
+	pop ebx
+	ret
+
+;===========================================================================
+;
+; uint32_t sad8bi_mmx(const uint8_t * const cur,
+;                     const uint8_t * const ref1,
+;                     const uint8_t * const ref2,
+;                     const uint32_t stride);
+;
+;===========================================================================
+align 16
+sad8bi_mmx:
+	push ebx
+	mov eax, [esp+4+ 4]	; Src
+	mov edx, [esp+4+ 8]	; Ref1
+	mov ebx, [esp+4+12]	; Ref2
+	mov ecx, [esp+4+16]	; Stride
+
+	pxor mm6, mm6	; accum2
+pxor mm7, mm7
+.Loop
+	SADBI_16x16_MMX 0, 1
+	SADBI_16x16_MMX 0, 1
+	SADBI_16x16_MMX 0, 1
+	SADBI_16x16_MMX 0, 1
+	SADBI_16x16_MMX 0, 1
+	SADBI_16x16_MMX 0, 1
+	SADBI_16x16_MMX 0, 1
+	SADBI_16x16_MMX 0, 1
+
+	pmaddwd mm6, [mmx_one]	; collapse
+	movq mm7, mm6
+	psrlq mm7, 32
+	paddd mm6, mm7
+
+	movd eax, mm6
+	pop ebx
+	ret
+
+
+
+
+;===========================================================================
+;
+; uint32_t dev16_mmx(const uint8_t * const cur,
+;                    const uint32_t stride);
+;
+;===========================================================================
+
+%macro MEAN_16x16_MMX 0
 	movq mm0, [eax]
 	movq mm2, [eax+8]
 	lea eax,[eax+ecx]
@@ -325,4 +485,5 @@
 	paddd mm6, mm5
 
 	movd eax, mm6
-	ret
\ No newline at end of file
+	ret
+
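For reference, the arithmetic implemented by SADBI_16x16_MMX and unrolled in sad16bi_mmx corresponds to the plain-C sketch below: each pair of reference pixels is averaged with upward rounding, (ref1 + ref2 + 1) >> 1, and the absolute difference against the current-block pixel is accumulated. The MMX code forms the absolute value with two unsigned-saturating subtractions (psubusw) OR'ed together, which is equivalent for the unpacked 16-bit values, and the final pmaddwd/psrlq/paddd sequence collapses the four word accumulators into one dword. The function name sad16bi_ref and the test driver are illustrative only, not part of the xvid C sources; sad8bi_mmx performs the same computation over an 8x8 block.

/* Plain-C model of sad16bi_mmx (names are illustrative, not xvid API). */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint32_t sad16bi_ref(const uint8_t *cur, const uint8_t *ref1,
                            const uint8_t *ref2, uint32_t stride)
{
	uint32_t sad = 0;

	for (int y = 0; y < 16; y++) {
		for (int x = 0; x < 16; x++) {
			int avg = (ref1[x] + ref2[x] + 1) >> 1;  /* rounded average of the two references */
			int d   = cur[x] - avg;
			sad    += (uint32_t)(d < 0 ? -d : d);    /* |cur - avg| */
		}
		cur  += stride;
		ref1 += stride;
		ref2 += stride;
	}
	return sad;
}

int main(void)
{
	uint8_t cur[16 * 16], ref1[16 * 16], ref2[16 * 16];

	for (int i = 0; i < 16 * 16; i++) {              /* arbitrary test data */
		cur[i]  = (uint8_t)(rand() & 0xff);
		ref1[i] = (uint8_t)(rand() & 0xff);
		ref2[i] = (uint8_t)(rand() & 0xff);
	}
	printf("sad16bi = %u\n", sad16bi_ref(cur, ref1, ref2, 16));
	return 0;
}

Since each 16-bit lane of mm6 accumulates at most 64 differences of at most 255, the saturating adds never clip, so the MMX routine should match this reference exactly.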