--- branches/dev-api-4/xvidcore/src/utils/x86_asm/interlacing_mmx.asm 2003/10/27 01:03:43 1191 +++ branches/dev-api-4/xvidcore/src/utils/x86_asm/interlacing_mmx.asm 2003/10/28 22:23:03 1192 @@ -1,65 +1,62 @@ -;/************************************************************************** +;/**************************************************************************** ; * -; * XVID MPEG-4 VIDEO CODEC -; * mmx interlacing decision +; * XVID MPEG-4 VIDEO CODEC +; * - Interlacing Field test - ; * -; * This program is an implementation of a part of one or more MPEG-4 -; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending -; * to use this software module in hardware or software products are -; * advised that its use may infringe existing patents or copyrights, and -; * any such use would be at such party's own risk. The original -; * developer of this software module and his/her company, and subsequent -; * editors and their companies, will have no liability for use of this -; * software or modifications or derivatives thereof. +; * Copyright(C) 2002 Daniel Smith ; * -; * This program is free software; you can redistribute it and/or modify -; * it under the terms of the GNU General Public License as published by -; * the Free Software Foundation; either version 2 of the License, or -; * (at your option) any later version. +; * This program is free software ; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation ; either version 2 of the License, or +; * (at your option) any later version. ; * -; * This program is distributed in the hope that it will be useful, -; * but WITHOUT ANY WARRANTY; without even the implied warranty of -; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -; * GNU General Public License for more details. +; * This program is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY ; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. ; * -; * You should have received a copy of the GNU General Public License -; * along with this program; if not, write to the Free Software -; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +; * You should have received a copy of the GNU General Public License +; * along with this program ; if not, write to the Free Software +; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ; * -; *************************************************************************/ - -;/************************************************************************** -; * -; * History: -; * -; * 04.09.2002 initial version; (c)2002 daniel smith +; * $Id: interlacing_mmx.asm,v 1.2.2.1 2003-10-28 22:23:03 edgomez Exp $ ; * -; *************************************************************************/ +; ***************************************************************************/ +BITS 32 -bits 32 - -%macro cglobal 1 +%macro cglobal 1 %ifdef PREFIX - global _%1 + global _%1 %define %1 _%1 %else global %1 %endif %endmacro +;============================================================================= +; Read only data +;============================================================================= + +SECTION .rodata + +; advances to next block on right +ALIGN 16 +nexts: + dd 0, 0, 8, 120, 8 -section .text - -cglobal MBFieldTest_mmx +; multiply word sums into dwords +ALIGN 16 +ones: + times 4 dw 1 + +;============================================================================= +; Code +;============================================================================= -; advances to next block on right -align 16 -nexts dd 0, 0, 8, 120, 8 +SECTION .text -; multiply word sums into dwords -align 16 -ones times 4 dw 1 +cglobal MBFieldTest_mmx ; neater %define line0 esi @@ -99,112 +96,109 @@ ; gets diff between three lines low(%2),mid(%3),hi(%4): frame = mid-low, field = hi-low %macro ABS8 4 - movq %4, [%1] ; m02 = hi - movq mm3, %2 ; mm3 = low copy + movq %4, [%1] ; m02 = hi + movq mm3, %2 ; mm3 = low copy - pxor mm4, mm4 ; mm4 = 0 - pxor mm5, mm5 ; mm5 = 0 + pxor mm4, mm4 ; mm4 = 0 + pxor mm5, mm5 ; mm5 = 0 - psubw %2, %3 ; diff(med,low) for frame - psubw mm3, %4 ; diff(hi,low) for field + psubw %2, %3 ; diff(med,low) for frame + psubw mm3, %4 ; diff(hi,low) for field - pcmpgtw mm4, %2 ; if (diff<0), mm4 will be all 1's, else all 0's - pcmpgtw mm5, mm3 - pxor %2, mm4 ; this will get abs(), but off by 1 if (diff<0) - pxor mm3, mm5 - psubw %2, mm4 ; correct abs being off by 1 when (diff<0) - psubw mm3, mm5 + pcmpgtw mm4, %2 ; if (diff<0), mm4 will be all 1's, else all 0's + pcmpgtw mm5, mm3 + pxor %2, mm4 ; this will get abs(), but off by 1 if (diff<0) + pxor mm3, mm5 + psubw %2, mm4 ; correct abs being off by 1 when (diff<0) + psubw mm3, mm5 - paddw mm6, %2 ; add to totals - paddw mm7, mm3 + paddw mm6, %2 ; add to totals + paddw mm7, mm3 %endmacro -section .text - -;=========================================================================== +;----------------------------------------------------------------------------- ; ; uint32_t MBFieldTest_mmx(int16_t * const data); ; -;=========================================================================== +;----------------------------------------------------------------------------- -align 16 +ALIGN 16 MBFieldTest_mmx: - push esi - push edi + push esi + push edi - mov esi, [esp+8+4] ; esi = top left block - mov edi, esi - add edi, 256 ; edi = bottom left block - - pxor mm6, mm6 ; frame total - pxor mm7, mm7 ; field total - - mov eax, 4 ; we do left 8 bytes of data[0*64], then right 8 bytes - ; then left 8 bytes of data[1*64], then last 8 bytes - -_loop: - movq m00, [line0] ; line0 - movq m01, [line1] ; line1 - - ABS8 line2, m00, m01, m02 ; frame += (line2-line1), field += (line2-line0) - ABS8 line3, m01, m02, m03 - ABS8 line4, m02, m03, m04 - ABS8 line5, m03, m04, m05 - ABS8 line6, m04, m05, m06 - ABS8 line7, m05, m06, m07 - ABS8 line8, m06, m07, m08 - - movq m09, [line9] ; line9-line7, no frame comp for line9-line8! - pxor mm4, mm4 - psubw m07, m09 - pcmpgtw mm4, mm1 - pxor m07, mm4 - psubw m07, mm4 - paddw mm7, m07 ; add to field total - - ABS8 line10, m08, m09, m10 ; frame += (line10-line9), field += (line10-line8) - ABS8 line11, m09, m10, m11 - ABS8 line12, m10, m11, m12 - ABS8 line13, m11, m12, m13 - ABS8 line14, m12, m13, m14 - ABS8 line15, m13, m14, m15 - - pxor mm4, mm4 ; line15-line14, we're done with field comps! - psubw m14, m15 - pcmpgtw mm4, m14 - pxor m14, mm4 - psubw m14, mm4 - paddw mm6, m14 ; add to frame total - - mov ecx, [nexts+eax*4] ; move esi/edi 8 pixels to the right - add esi, ecx - add edi, ecx - - dec eax - jnz near _loop - -_decide: - movq mm0, [ones] ; add packed words into single dwords - pmaddwd mm6, mm0 - pmaddwd mm7, mm0 - - movq mm0, mm6 ; ecx will be frame total, edx field - movq mm1, mm7 - psrlq mm0, 32 - psrlq mm1, 32 - paddd mm0, mm6 - paddd mm1, mm7 - movd ecx, mm0 - movd edx, mm1 - - add edx, 350 ; add bias against field decision - cmp ecx, edx - jb _end ; if frame=field, use field dct (return 1) - -_end: - pop edi - pop esi + mov esi, [esp+8+4] ; esi = top left block + mov edi, esi + add edi, 256 ; edi = bottom left block + + pxor mm6, mm6 ; frame total + pxor mm7, mm7 ; field total + + mov eax, 4 ; we do left 8 bytes of data[0*64], then right 8 bytes + ; then left 8 bytes of data[1*64], then last 8 bytes +.loop: + movq m00, [line0] ; line0 + movq m01, [line1] ; line1 + + ABS8 line2, m00, m01, m02 ; frame += (line2-line1), field += (line2-line0) + ABS8 line3, m01, m02, m03 + ABS8 line4, m02, m03, m04 + ABS8 line5, m03, m04, m05 + ABS8 line6, m04, m05, m06 + ABS8 line7, m05, m06, m07 + ABS8 line8, m06, m07, m08 + + movq m09, [line9] ; line9-line7, no frame comp for line9-line8! + pxor mm4, mm4 + psubw m07, m09 + pcmpgtw mm4, mm1 + pxor m07, mm4 + psubw m07, mm4 + paddw mm7, m07 ; add to field total + + ABS8 line10, m08, m09, m10 ; frame += (line10-line9), field += (line10-line8) + ABS8 line11, m09, m10, m11 + ABS8 line12, m10, m11, m12 + ABS8 line13, m11, m12, m13 + ABS8 line14, m12, m13, m14 + ABS8 line15, m13, m14, m15 + + pxor mm4, mm4 ; line15-line14, we're done with field comps! + psubw m14, m15 + pcmpgtw mm4, m14 + pxor m14, mm4 + psubw m14, mm4 + paddw mm6, m14 ; add to frame total + + mov ecx, [nexts+eax*4] ; move esi/edi 8 pixels to the right + add esi, ecx + add edi, ecx + + dec eax + jnz near .loop + +.decide: + movq mm0, [ones] ; add packed words into single dwords + pmaddwd mm6, mm0 + pmaddwd mm7, mm0 + + movq mm0, mm6 ; ecx will be frame total, edx field + movq mm1, mm7 + psrlq mm0, 32 + psrlq mm1, 32 + paddd mm0, mm6 + paddd mm1, mm7 + movd ecx, mm0 + movd edx, mm1 + + add edx, 350 ; add bias against field decision + cmp ecx, edx + jb .end ; if frame=field, use field dct (return 1) + +.end: + pop edi + pop esi - ret + ret