--- branches/dev-api-4/xvidcore/src/quant/x86_asm/quantize_h263_mmx.asm 2003/10/07 13:02:35 1174 +++ branches/dev-api-4/xvidcore/src/quant/x86_asm/quantize_h263_mmx.asm 2003/10/09 18:50:22 1176 @@ -1,46 +1,28 @@ -;/************************************************************************** +;/***************************************************************************** ; * -; * XVID MPEG-4 VIDEO CODEC -; * mmx quantization/dequantization +; * XVID MPEG-4 VIDEO CODEC +; * - MPEG4 Quantization H263 implementation / MMX optimized - ; * -; * This program is an implementation of a part of one or more MPEG-4 -; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending -; * to use this software module in hardware or software products are -; * advised that its use may infringe existing patents or copyrights, and -; * any such use would be at such party's own risk. The original -; * developer of this software module and his/her company, and subsequent -; * editors and their companies, will have no liability for use of this -; * software or modifications or derivatives thereof. +; * Copyright(C) 2001-2003 Peter Ross +; * 2002-2003 Pascal Massimino ; * -; * This program is free software; you can redistribute it and/or modify -; * it under the terms of the GNU General Public License as published by -; * the Free Software Foundation; either version 2 of the License, or -; * (at your option) any later version. +; * This program is free software ; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation ; either version 2 of the License, or +; * (at your option) any later version. ; * -; * This program is distributed in the hope that it will be useful, -; * but WITHOUT ANY WARRANTY; without even the implied warranty of -; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -; * GNU General Public License for more details. 
+; * This program is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY ; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. ; * -; * You should have received a copy of the GNU General Public License -; * along with this program; if not, write to the Free Software -; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +; * You should have received a copy of the GNU General Public License +; * along with this program ; if not, write to the Free Software +; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ; * -; *************************************************************************/ - -;/************************************************************************** -; * -; * History: -; * -; * 09.08.2002 sse2 dequant funcs revamped -; * 14.06.2002 mmx+xmm dequant_* funcs revamped -Skal- -; * 24.02.2002 sse2 quant_intra / dequant_intra (have to use movdqu ???) -; * 17.04.2002 sse2 quant_inter / dequant_inter -; * 26.12.2001 minor bug fixes, dequant saturate, further optimization -; * 19.11.2001 quant_inter_mmx now returns sum of abs. coefficient values -; * 04.11.2001 nasm version; (c)2001 peter ross +; * $Id: quantize_h263_mmx.asm,v 1.1.2.2 2003-10-09 18:50:22 edgomez Exp $ ; * -; *************************************************************************/ +; ****************************************************************************/ ; enable dequant saturate [-2048,2047], test purposes only. 
%define SATURATE @@ -50,11 +32,8 @@ bits 32 -section .data - - %macro cglobal 1 - %ifdef PREFIX + %ifdef PREFIX global _%1 %define %1 _%1 %else @@ -62,9 +41,19 @@ %endif %endmacro -align 16 +;*************************************************************************** +; Local data +;*************************************************************************** -plus_one times 8 dw 1 +%ifdef FORMAT_COFF +section .data data +%else +section .data data align=16 +%endif + +align 16 +plus_one: + times 8 dw 1 ;=========================================================================== ; @@ -72,45 +61,13 @@ ; ;=========================================================================== -%macro MMX_SUB 1 -times 4 dw %1 / 2 -%endmacro - align 16 -mmx_sub - MMX_SUB 1 - MMX_SUB 2 - MMX_SUB 3 - MMX_SUB 4 - MMX_SUB 5 - MMX_SUB 6 - MMX_SUB 7 - MMX_SUB 8 - MMX_SUB 9 - MMX_SUB 10 - MMX_SUB 11 - MMX_SUB 12 - MMX_SUB 13 - MMX_SUB 14 - MMX_SUB 15 - MMX_SUB 16 - MMX_SUB 17 - MMX_SUB 18 - MMX_SUB 19 - MMX_SUB 20 - MMX_SUB 21 - MMX_SUB 22 - MMX_SUB 23 - MMX_SUB 24 - MMX_SUB 25 - MMX_SUB 26 - MMX_SUB 27 - MMX_SUB 28 - MMX_SUB 29 - MMX_SUB 30 - MMX_SUB 31 - - +mmx_sub: +%assign quant 1 +%rep 31 + times 4 dw quant / 2 + %assign quant quant+1 +%endrep ;=========================================================================== ; @@ -122,45 +79,13 @@ ; ;=========================================================================== -%macro MMX_DIV 1 -times 4 dw (1 << 16) / (%1 * 2) + 1 -%endmacro - align 16 -mmx_div - MMX_DIV 1 - MMX_DIV 2 - MMX_DIV 3 - MMX_DIV 4 - MMX_DIV 5 - MMX_DIV 6 - MMX_DIV 7 - MMX_DIV 8 - MMX_DIV 9 - MMX_DIV 10 - MMX_DIV 11 - MMX_DIV 12 - MMX_DIV 13 - MMX_DIV 14 - MMX_DIV 15 - MMX_DIV 16 - MMX_DIV 17 - MMX_DIV 18 - MMX_DIV 19 - MMX_DIV 20 - MMX_DIV 21 - MMX_DIV 22 - MMX_DIV 23 - MMX_DIV 24 - MMX_DIV 25 - MMX_DIV 26 - MMX_DIV 27 - MMX_DIV 28 - MMX_DIV 29 - MMX_DIV 30 - MMX_DIV 31 - - +mmx_div: +%assign quant 1 +%rep 31 + times 4 dw (1<<16) / (quant*2) + 1 + %assign quant 
quant+1 +%endrep ;=========================================================================== ; @@ -168,48 +93,17 @@ ; ;=========================================================================== -%macro MMX_ADD 1 -%if %1 % 2 != 0 -times 4 dw %1 -%else -times 4 dw %1 - 1 -%endif -%endmacro - align 16 -mmx_add - MMX_ADD 1 - MMX_ADD 2 - MMX_ADD 3 - MMX_ADD 4 - MMX_ADD 5 - MMX_ADD 6 - MMX_ADD 7 - MMX_ADD 8 - MMX_ADD 9 - MMX_ADD 10 - MMX_ADD 11 - MMX_ADD 12 - MMX_ADD 13 - MMX_ADD 14 - MMX_ADD 15 - MMX_ADD 16 - MMX_ADD 17 - MMX_ADD 18 - MMX_ADD 19 - MMX_ADD 20 - MMX_ADD 21 - MMX_ADD 22 - MMX_ADD 23 - MMX_ADD 24 - MMX_ADD 25 - MMX_ADD 26 - MMX_ADD 27 - MMX_ADD 28 - MMX_ADD 29 - MMX_ADD 30 - MMX_ADD 31 - +mmx_add: +%assign quant 1 +%rep 31 + %if quant % 2 != 0 + times 4 dw quant + %else + times 4 dw quant - 1 + %endif + %assign quant quant+1 +%endrep ;=========================================================================== ; @@ -217,44 +111,13 @@ ; ;=========================================================================== -%macro MMX_MUL 1 -times 4 dw %1 * 2 -%endmacro - align 16 -mmx_mul - MMX_MUL 1 - MMX_MUL 2 - MMX_MUL 3 - MMX_MUL 4 - MMX_MUL 5 - MMX_MUL 6 - MMX_MUL 7 - MMX_MUL 8 - MMX_MUL 9 - MMX_MUL 10 - MMX_MUL 11 - MMX_MUL 12 - MMX_MUL 13 - MMX_MUL 14 - MMX_MUL 15 - MMX_MUL 16 - MMX_MUL 17 - MMX_MUL 18 - MMX_MUL 19 - MMX_MUL 20 - MMX_MUL 21 - MMX_MUL 22 - MMX_MUL 23 - MMX_MUL 24 - MMX_MUL 25 - MMX_MUL 26 - MMX_MUL 27 - MMX_MUL 28 - MMX_MUL 29 - MMX_MUL 30 - MMX_MUL 31 - +mmx_mul: +%assign quant 1 +%rep 31 + times 4 dw quant*2 + %assign quant quant+1 +%endrep ;=========================================================================== ; @@ -263,25 +126,34 @@ ;=========================================================================== align 16 -sse2_2047 times 8 dw 2047 +sse2_2047: + times 8 dw 2047 align 16 -mmx_2047 times 4 dw 2047 +mmx_2047: + times 4 dw 2047 align 8 -mmx_32768_minus_2048 times 4 dw (32768-2048) -mmx_32767_minus_2047 times 4 dw (32767-2047) 
+mmx_32768_minus_2048: + times 4 dw (32768-2048) + +mmx_32767_minus_2047: + times 4 dw (32767-2047) + +;*************************************************************************** +; Code +;*************************************************************************** section .text ;=========================================================================== ; -; void quant_intra_mmx(int16_t * coeff, -; const int16_t const * data, -; const uint32_t quant, -; const uint32_t dcscalar); +; uint32_t quant_h263_intra_mmx(int16_t * coeff, +; const int16_t const * data, +; const uint32_t quant, +; const uint32_t dcscalar); ; ;=========================================================================== @@ -289,104 +161,106 @@ cglobal quant_h263_intra_mmx quant_h263_intra_mmx: - push ecx - push esi - push edi - - mov edi, [esp + 12 + 4] ; coeff - mov esi, [esp + 12 + 8] ; data - mov eax, [esp + 12 + 12] ; quant - - xor ecx, ecx - cmp al, 1 - jz .q1loop + push ecx + push esi + push edi + + mov edi, [esp + 12 + 4] ; coeff + mov esi, [esp + 12 + 8] ; data + mov eax, [esp + 12 + 12] ; quant + + xor ecx, ecx + cmp al, 1 + jz .q1loop + + movq mm7, [mmx_div + eax * 8 - 8] - movq mm7, [mmx_div + eax * 8 - 8] align ALIGN .loop - movq mm0, [esi + 8*ecx] ; mm0 = [1st] - movq mm3, [esi + 8*ecx + 8] ; - pxor mm1, mm1 ; mm1 = 0 - pxor mm4, mm4 ; - pcmpgtw mm1, mm0 ; mm1 = (0 > mm0) - pcmpgtw mm4, mm3 ; - pxor mm0, mm1 ; mm0 = |mm0| - pxor mm3, mm4 ; - psubw mm0, mm1 ; displace - psubw mm3, mm4 ; - pmulhw mm0, mm7 ; mm0 = (mm0 / 2Q) >> 16 - pmulhw mm3, mm7 ; - pxor mm0, mm1 ; mm0 *= sign(mm0) - pxor mm3, mm4 ; - psubw mm0, mm1 ; undisplace - psubw mm3, mm4 ; - movq [edi + 8*ecx], mm0 - movq [edi + 8*ecx + 8], mm3 - - add ecx,2 - cmp ecx,16 - jnz .loop + movq mm0, [esi + 8*ecx] ; mm0 = [1st] + movq mm3, [esi + 8*ecx + 8] + pxor mm1, mm1 ; mm1 = 0 + pxor mm4, mm4 ; + pcmpgtw mm1, mm0 ; mm1 = (0 > mm0) + pcmpgtw mm4, mm3 ; + pxor mm0, mm1 ; mm0 = |mm0| + pxor mm3, mm4 ; + psubw mm0, mm1 ; 
displace + psubw mm3, mm4 ; + pmulhw mm0, mm7 ; mm0 = (mm0 / 2Q) >> 16 + pmulhw mm3, mm7 ; + pxor mm0, mm1 ; mm0 *= sign(mm0) + pxor mm3, mm4 ; + psubw mm0, mm1 ; undisplace + psubw mm3, mm4 ; + movq [edi + 8*ecx], mm0 + movq [edi + 8*ecx + 8], mm3 + + add ecx, 2 + cmp ecx, 16 + jnz .loop .done + ; calculate data[0] // (int32_t)dcscalar) + mov ecx, [esp + 12 + 16] ; dcscalar + mov edx, ecx + movsx eax, word [esi] ; data[0] + shr edx, 1 ; edx = dcscalar /2 + cmp eax, 0 + jg .gtzero - mov ecx, [esp + 12 + 16] ; dcscalar - mov edx, ecx - movsx eax, word [esi] ; data[0] - shr edx, 1 ; edx = dcscalar /2 - cmp eax, 0 - jg .gtzero + sub eax, edx + jmp short .mul - sub eax, edx - jmp short .mul .gtzero - add eax, edx + add eax, edx .mul - cdq ; expand eax -> edx:eax - idiv ecx ; eax = edx:eax / dcscalar - - mov [edi], ax ; coeff[0] = ax + cdq ; expand eax -> edx:eax + idiv ecx ; eax = edx:eax / dcscalar + mov [edi], ax ; coeff[0] = ax + + xor eax, eax ; return(0); + pop edi + pop esi + pop ecx - pop edi - pop esi - pop ecx - - ret + ret align ALIGN .q1loop - movq mm0, [esi + 8*ecx] ; mm0 = [1st] - movq mm3, [esi + 8*ecx + 8] ; - pxor mm1, mm1 ; mm1 = 0 - pxor mm4, mm4 ; - pcmpgtw mm1, mm0 ; mm1 = (0 > mm0) - pcmpgtw mm4, mm3 ; - pxor mm0, mm1 ; mm0 = |mm0| - pxor mm3, mm4 ; - psubw mm0, mm1 ; displace - psubw mm3, mm4 ; - psrlw mm0, 1 ; mm0 >>= 1 (/2) - psrlw mm3, 1 ; - pxor mm0, mm1 ; mm0 *= sign(mm0) - pxor mm3, mm4 ; - psubw mm0, mm1 ; undisplace - psubw mm3, mm4 ; - movq [edi + 8*ecx], mm0 - movq [edi + 8*ecx + 8], mm3 - - add ecx,2 - cmp ecx,16 - jnz .q1loop - jmp short .done + movq mm0, [esi + 8*ecx] ; mm0 = [1st] + movq mm3, [esi + 8*ecx + 8] + pxor mm1, mm1 ; mm1 = 0 + pxor mm4, mm4 ; + pcmpgtw mm1, mm0 ; mm1 = (0 > mm0) + pcmpgtw mm4, mm3 ; + pxor mm0, mm1 ; mm0 = |mm0| + pxor mm3, mm4 ; + psubw mm0, mm1 ; displace + psubw mm3, mm4 ; + psrlw mm0, 1 ; mm0 >>= 1 (/2) + psrlw mm3, 1 ; + pxor mm0, mm1 ; mm0 *= sign(mm0) + pxor mm3, mm4 + psubw mm0, mm1 ; undisplace + 
psubw mm3, mm4 ; + movq [edi + 8*ecx], mm0 + movq [edi + 8*ecx + 8], mm3 + + add ecx, 2 + cmp ecx, 16 + jnz .q1loop + jmp short .done ;=========================================================================== ; -; void quant_intra_sse2(int16_t * coeff, -; const int16_t const * data, -; const uint32_t quant, -; const uint32_t dcscalar); +; uint32_t quant_h263_intra_sse2(int16_t * coeff, +; const int16_t const * data, +; const uint32_t quant, +; const uint32_t dcscalar); ; ;=========================================================================== @@ -394,103 +268,106 @@ cglobal quant_h263_intra_sse2 quant_h263_intra_sse2: - push esi - push edi + push esi + push edi - mov edi, [esp + 8 + 4] ; coeff - mov esi, [esp + 8 + 8] ; data - mov eax, [esp + 8 + 12] ; quant - - xor ecx, ecx - cmp al, 1 - jz near .qas2_q1loop + mov edi, [esp + 8 + 4] ; coeff + mov esi, [esp + 8 + 8] ; data + mov eax, [esp + 8 + 12] ; quant + + xor ecx, ecx + cmp al, 1 + jz near .qas2_q1loop .qas2_not1 - movq mm7, [mmx_div + eax*8 - 8] - movq2dq xmm7, mm7 - movlhps xmm7, xmm7 + movq mm7, [mmx_div + eax*8 - 8] + movq2dq xmm7, mm7 + movlhps xmm7, xmm7 align 16 .qas2_loop - movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st] - movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd] - pxor xmm1, xmm1 - pxor xmm4, xmm4 - pcmpgtw xmm1, xmm0 - pcmpgtw xmm4, xmm3 - pxor xmm0, xmm1 - pxor xmm3, xmm4 - psubw xmm0, xmm1 - psubw xmm3, xmm4 - pmulhw xmm0, xmm7 - pmulhw xmm3, xmm7 - pxor xmm0, xmm1 - pxor xmm3, xmm4 - psubw xmm0, xmm1 - psubw xmm3, xmm4 - movdqa [edi + ecx*8], xmm0 - movdqa [edi + ecx*8 + 16], xmm3 + movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st] + movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd] + pxor xmm1, xmm1 + pxor xmm4, xmm4 + pcmpgtw xmm1, xmm0 + pcmpgtw xmm4, xmm3 + pxor xmm0, xmm1 + pxor xmm3, xmm4 + psubw xmm0, xmm1 + psubw xmm3, xmm4 + pmulhw xmm0, xmm7 + pmulhw xmm3, xmm7 + pxor xmm0, xmm1 + pxor xmm3, xmm4 + psubw xmm0, xmm1 + psubw xmm3, xmm4 + movdqa [edi + ecx*8], xmm0 + movdqa [edi + ecx*8 + 16], 
xmm3 - add ecx, 4 - cmp ecx, 16 - jnz .qas2_loop + add ecx, 4 + cmp ecx, 16 + jnz .qas2_loop .qas2_done - mov ecx, [esp + 8 + 16] ; dcscalar - mov edx, ecx - movsx eax, word [esi] - shr edx, 1 - cmp eax, 0 - jg .qas2_gtzero + mov ecx, [esp + 8 + 16] ; dcscalar + mov edx, ecx + movsx eax, word [esi] + shr edx, 1 + cmp eax, 0 + jg .qas2_gtzero + + sub eax, edx + jmp short .qas2_mul - sub eax, edx - jmp short .qas2_mul .qas2_gtzero - add eax, edx + add eax, edx + .qas2_mul - cdq - idiv ecx + cdq + idiv ecx - mov [edi], ax + mov [edi], ax - pop edi - pop esi + xor eax, eax ; return(0); + pop edi + pop esi - ret + ret align 16 .qas2_q1loop - movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st] - movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd] - pxor xmm1, xmm1 - pxor xmm4, xmm4 - pcmpgtw xmm1, xmm0 - pcmpgtw xmm4, xmm3 - pxor xmm0, xmm1 - pxor xmm3, xmm4 - psubw xmm0, xmm1 - psubw xmm3, xmm4 - psrlw xmm0, 1 - psrlw xmm3, 1 - pxor xmm0, xmm1 - pxor xmm3, xmm4 - psubw xmm0, xmm1 - psubw xmm3, xmm4 - movdqa [edi + ecx*8], xmm0 - movdqa [edi + ecx*8 + 16], xmm3 - - add ecx, 4 - cmp ecx, 16 - jnz .qas2_q1loop - jmp near .qas2_done + movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st] + movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd] + pxor xmm1, xmm1 + pxor xmm4, xmm4 + pcmpgtw xmm1, xmm0 + pcmpgtw xmm4, xmm3 + pxor xmm0, xmm1 + pxor xmm3, xmm4 + psubw xmm0, xmm1 + psubw xmm3, xmm4 + psrlw xmm0, 1 + psrlw xmm3, 1 + pxor xmm0, xmm1 + pxor xmm3, xmm4 + psubw xmm0, xmm1 + psubw xmm3, xmm4 + movdqa [edi + ecx*8], xmm0 + movdqa [edi + ecx*8 + 16], xmm3 + + add ecx, 4 + cmp ecx, 16 + jnz .qas2_q1loop + jmp near .qas2_done ;=========================================================================== ; -; uint32_t quant_inter_mmx(int16_t * coeff, -; const int16_t const * data, -; const uint32_t quant); +; uint32_t quant_h263_inter_mmx(int16_t * coeff, +; const int16_t const * data, +; const uint32_t quant); ; ;=========================================================================== @@ -498,104 +375,104 
@@ cglobal quant_h263_inter_mmx quant_h263_inter_mmx: - push ecx - push esi - push edi + push ecx + push esi + push edi - mov edi, [esp + 12 + 4] ; coeff - mov esi, [esp + 12 + 8] ; data - mov eax, [esp + 12 + 12] ; quant + mov edi, [esp + 12 + 4] ; coeff + mov esi, [esp + 12 + 8] ; data + mov eax, [esp + 12 + 12] ; quant - xor ecx, ecx + xor ecx, ecx - pxor mm5, mm5 ; sum - movq mm6, [mmx_sub + eax * 8 - 8] ; sub + pxor mm5, mm5 ; sum + movq mm6, [mmx_sub + eax * 8 - 8]; sub - cmp al, 1 - jz .q1loop + cmp al, 1 + jz .q1loop - movq mm7, [mmx_div + eax * 8 - 8] ; divider + movq mm7, [mmx_div + eax * 8 - 8] ; divider align ALIGN .loop - movq mm0, [esi + 8*ecx] ; mm0 = [1st] - movq mm3, [esi + 8*ecx + 8] ; - pxor mm1, mm1 ; mm1 = 0 - pxor mm4, mm4 ; - pcmpgtw mm1, mm0 ; mm1 = (0 > mm0) - pcmpgtw mm4, mm3 ; - pxor mm0, mm1 ; mm0 = |mm0| - pxor mm3, mm4 ; - psubw mm0, mm1 ; displace - psubw mm3, mm4 ; - psubusw mm0, mm6 ; mm0 -= sub (unsigned, dont go < 0) - psubusw mm3, mm6 ; - pmulhw mm0, mm7 ; mm0 = (mm0 / 2Q) >> 16 - pmulhw mm3, mm7 ; - paddw mm5, mm0 ; sum += mm0 - pxor mm0, mm1 ; mm0 *= sign(mm0) - paddw mm5, mm3 ; - pxor mm3, mm4 ; - psubw mm0, mm1 ; undisplace - psubw mm3, mm4 - movq [edi + 8*ecx], mm0 - movq [edi + 8*ecx + 8], mm3 - - add ecx, 2 - cmp ecx, 16 - jnz .loop + movq mm0, [esi + 8*ecx] ; mm0 = [1st] + movq mm3, [esi + 8*ecx + 8] + pxor mm1, mm1 ; mm1 = 0 + pxor mm4, mm4 ; + pcmpgtw mm1, mm0 ; mm1 = (0 > mm0) + pcmpgtw mm4, mm3 ; + pxor mm0, mm1 ; mm0 = |mm0| + pxor mm3, mm4 ; + psubw mm0, mm1 ; displace + psubw mm3, mm4 ; + psubusw mm0, mm6 ; mm0 -= sub (unsigned, dont go < 0) + psubusw mm3, mm6 ; + pmulhw mm0, mm7 ; mm0 = (mm0 / 2Q) >> 16 + pmulhw mm3, mm7 ; + paddw mm5, mm0 ; sum += mm0 + pxor mm0, mm1 ; mm0 *= sign(mm0) + paddw mm5, mm3 ; + pxor mm3, mm4 ; + psubw mm0, mm1 ; undisplace + psubw mm3, mm4 + movq [edi + 8*ecx], mm0 + movq [edi + 8*ecx + 8], mm3 + + add ecx, 2 + cmp ecx, 16 + jnz .loop .done - pmaddwd mm5, [plus_one] - movq mm0, mm5 - 
psrlq mm5, 32 - paddd mm0, mm5 - movd eax, mm0 ; return sum - - pop edi - pop esi - pop ecx + pmaddwd mm5, [plus_one] + movq mm0, mm5 + psrlq mm5, 32 + paddd mm0, mm5 + + movd eax, mm0 ; return sum + pop edi + pop esi + pop ecx - ret + ret align ALIGN .q1loop - movq mm0, [esi + 8*ecx] ; mm0 = [1st] - movq mm3, [esi + 8*ecx+ 8] ; - pxor mm1, mm1 ; mm1 = 0 - pxor mm4, mm4 ; - pcmpgtw mm1, mm0 ; mm1 = (0 > mm0) - pcmpgtw mm4, mm3 ; - pxor mm0, mm1 ; mm0 = |mm0| - pxor mm3, mm4 ; - psubw mm0, mm1 ; displace - psubw mm3, mm4 ; - psubusw mm0, mm6 ; mm0 -= sub (unsigned, dont go < 0) - psubusw mm3, mm6 ; - psrlw mm0, 1 ; mm0 >>= 1 (/2) - psrlw mm3, 1 ; - paddw mm5, mm0 ; sum += mm0 - pxor mm0, mm1 ; mm0 *= sign(mm0) - paddw mm5, mm3 ; - pxor mm3, mm4 ; - psubw mm0, mm1 ; undisplace - psubw mm3, mm4 - movq [edi + 8*ecx], mm0 - movq [edi + 8*ecx + 8], mm3 + movq mm0, [esi + 8*ecx] ; mm0 = [1st] + movq mm3, [esi + 8*ecx+ 8] ; + pxor mm1, mm1 ; mm1 = 0 + pxor mm4, mm4 ; + pcmpgtw mm1, mm0 ; mm1 = (0 > mm0) + pcmpgtw mm4, mm3 ; + pxor mm0, mm1 ; mm0 = |mm0| + pxor mm3, mm4 ; + psubw mm0, mm1 ; displace + psubw mm3, mm4 ; + psubusw mm0, mm6 ; mm0 -= sub (unsigned, dont go < 0) + psubusw mm3, mm6 ; + psrlw mm0, 1 ; mm0 >>= 1 (/2) + psrlw mm3, 1 ; + paddw mm5, mm0 ; sum += mm0 + pxor mm0, mm1 ; mm0 *= sign(mm0) + paddw mm5, mm3 ; + pxor mm3, mm4 ; + psubw mm0, mm1 ; undisplace + psubw mm3, mm4 + movq [edi + 8*ecx], mm0 + movq [edi + 8*ecx + 8], mm3 - add ecx,2 - cmp ecx,16 - jnz .q1loop + add ecx, 2 + cmp ecx, 16 + jnz .q1loop - jmp .done + jmp .done ;=========================================================================== ; -; uint32_t quant_inter_sse2(int16_t * coeff, -; const int16_t const * data, -; const uint32_t quant); +; uint32_t quant_h263_inter_sse2(int16_t * coeff, +; const int16_t const * data, +; const uint32_t quant); ; ;=========================================================================== @@ -603,112 +480,113 @@ cglobal quant_h263_inter_sse2 
quant_h263_inter_sse2: - push esi - push edi + push esi + push edi - mov edi, [esp + 8 + 4] ; coeff - mov esi, [esp + 8 + 8] ; data - mov eax, [esp + 8 + 12] ; quant + mov edi, [esp + 8 + 4] ; coeff + mov esi, [esp + 8 + 8] ; data + mov eax, [esp + 8 + 12] ; quant - xor ecx, ecx + xor ecx, ecx - pxor xmm5, xmm5 ; sum + pxor xmm5, xmm5 ; sum - movq mm0, [mmx_sub + eax*8 - 8] ; sub - movq2dq xmm6, mm0 ; load into low 8 bytes - movlhps xmm6, xmm6 ; duplicate into high 8 bytes + movq mm0, [mmx_sub + eax*8 - 8] ; sub + movq2dq xmm6, mm0 ; load into low 8 bytes + movlhps xmm6, xmm6 ; duplicate into high 8 bytes - cmp al, 1 - jz near .qes2_q1loop + cmp al, 1 + jz near .qes2_q1loop .qes2_not1 - movq mm0, [mmx_div + eax*8 - 8] ; divider - movq2dq xmm7, mm0 - movlhps xmm7, xmm7 + movq mm0, [mmx_div + eax*8 - 8] ; divider + movq2dq xmm7, mm0 + movlhps xmm7, xmm7 align 16 .qes2_loop - movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st] - movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd] - pxor xmm1, xmm1 - pxor xmm4, xmm4 - pcmpgtw xmm1, xmm0 - pcmpgtw xmm4, xmm3 - pxor xmm0, xmm1 - pxor xmm3, xmm4 - psubw xmm0, xmm1 - psubw xmm3, xmm4 - psubusw xmm0, xmm6 - psubusw xmm3, xmm6 - pmulhw xmm0, xmm7 - pmulhw xmm3, xmm7 - paddw xmm5, xmm0 - pxor xmm0, xmm1 - paddw xmm5, xmm3 - pxor xmm3, xmm4 - psubw xmm0, xmm1 - psubw xmm3, xmm4 - movdqa [edi + ecx*8], xmm0 - movdqa [edi + ecx*8 + 16], xmm3 - - add ecx, 4 - cmp ecx, 16 - jnz .qes2_loop + movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st] + movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd] + pxor xmm1, xmm1 + pxor xmm4, xmm4 + pcmpgtw xmm1, xmm0 + pcmpgtw xmm4, xmm3 + pxor xmm0, xmm1 + pxor xmm3, xmm4 + psubw xmm0, xmm1 + psubw xmm3, xmm4 + psubusw xmm0, xmm6 + psubusw xmm3, xmm6 + pmulhw xmm0, xmm7 + pmulhw xmm3, xmm7 + paddw xmm5, xmm0 + pxor xmm0, xmm1 + paddw xmm5, xmm3 + pxor xmm3, xmm4 + psubw xmm0, xmm1 + psubw xmm3, xmm4 + movdqa [edi + ecx*8], xmm0 + movdqa [edi + ecx*8 + 16], xmm3 + + add ecx, 4 + cmp ecx, 16 + jnz .qes2_loop .qes2_done - movdqu 
xmm6, [plus_one] - pmaddwd xmm5, xmm6 - movhlps xmm6, xmm5 - paddd xmm5, xmm6 - movdq2q mm0, xmm5 - - movq mm5, mm0 - psrlq mm5, 32 - paddd mm0, mm5 - movd eax, mm0 ; return sum + movdqu xmm6, [plus_one] + pmaddwd xmm5, xmm6 + movhlps xmm6, xmm5 + paddd xmm5, xmm6 + movdq2q mm0, xmm5 + + movq mm5, mm0 + psrlq mm5, 32 + paddd mm0, mm5 + + movd eax, mm0 ; return sum - pop edi - pop esi + pop edi + pop esi - ret + ret align 16 .qes2_q1loop - movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st] - movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd] - pxor xmm1, xmm1 - pxor xmm4, xmm4 - pcmpgtw xmm1, xmm0 - pcmpgtw xmm4, xmm3 - pxor xmm0, xmm1 - pxor xmm3, xmm4 - psubw xmm0, xmm1 - psubw xmm3, xmm4 - psubusw xmm0, xmm6 - psubusw xmm3, xmm6 - psrlw xmm0, 1 - psrlw xmm3, 1 - paddw xmm5, xmm0 - pxor xmm0, xmm1 - paddw xmm5, xmm3 - pxor xmm3, xmm4 - psubw xmm0, xmm1 - psubw xmm3, xmm4 - movdqa [edi + ecx*8], xmm0 - movdqa [edi + ecx*8 + 16], xmm3 + movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st] + movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd] + pxor xmm1, xmm1 + pxor xmm4, xmm4 + pcmpgtw xmm1, xmm0 + pcmpgtw xmm4, xmm3 + pxor xmm0, xmm1 + pxor xmm3, xmm4 + psubw xmm0, xmm1 + psubw xmm3, xmm4 + psubusw xmm0, xmm6 + psubusw xmm3, xmm6 + psrlw xmm0, 1 + psrlw xmm3, 1 + paddw xmm5, xmm0 + pxor xmm0, xmm1 + paddw xmm5, xmm3 + pxor xmm3, xmm4 + psubw xmm0, xmm1 + psubw xmm3, xmm4 + movdqa [edi + ecx*8], xmm0 + movdqa [edi + ecx*8 + 16], xmm3 - add ecx,4 - cmp ecx,16 - jnz .qes2_q1loop - jmp .qes2_done + add ecx, 4 + cmp ecx, 16 + jnz .qes2_q1loop + jmp .qes2_done ;=========================================================================== ; -; void dequant_intra_mmx(int16_t *data, -; const int16_t const *coeff, -; const uint32_t quant, -; const uint32_t dcscalar); +; uint32_t dequant_h263_intra_mmx(int16_t *data, +; const int16_t const *coeff, +; const uint32_t quant, +; const uint32_t dcscalar); ; ;=========================================================================== @@ -719,73 +597,73 @@ 
cglobal dequant_h263_intra_mmx dequant_h263_intra_mmx: - mov edx, [esp+ 4] ; data - mov ecx, [esp+ 8] ; coeff - mov eax, [esp+12] ; quant - movq mm6, [mmx_add + eax*8 - 8] ; quant or quant-1 - movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant - mov eax, -16 + mov edx, [esp+ 4] ; data + mov ecx, [esp+ 8] ; coeff + mov eax, [esp+12] ; quant + movq mm6, [mmx_add + eax*8 - 8] ; quant or quant-1 + movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant + mov eax, -16 align ALIGN .loop - movq mm0, [ecx+8*eax+8*16] ; c = coeff[i] - movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1] - pxor mm1, mm1 - pxor mm4, mm4 - pcmpgtw mm1, mm0 ; sign(c) - pcmpgtw mm4, mm3 ; sign(c') - pxor mm2, mm2 - pxor mm5, mm5 - pcmpeqw mm2, mm0 ; c is zero - pcmpeqw mm5, mm3 ; c' is zero - pandn mm2, mm6 ; offset = isZero ? 0 : quant_add - pandn mm5, mm6 - pxor mm0, mm1 ; negate if negative - pxor mm3, mm4 ; negate if negative - psubw mm0, mm1 - psubw mm3, mm4 - pmullw mm0, mm7 ; *= 2Q - pmullw mm3, mm7 ; *= 2Q - paddw mm0, mm2 ; + offset - paddw mm3, mm5 ; + offset - paddw mm0, mm1 ; negate back - paddw mm3, mm4 ; negate back - - ; saturates to +2047 - movq mm2, [mmx_32767_minus_2047] - add eax, 2 - paddsw mm0, mm2 - paddsw mm3, mm2 - psubsw mm0, mm2 - psubsw mm3, mm2 - - pxor mm0, mm1 - pxor mm3, mm4 - movq [edx + 8*eax + 8*16 - 2*8], mm0 - movq [edx + 8*eax + 8*16+8 - 2*8], mm3 - jnz near .loop + movq mm0, [ecx+8*eax+8*16] ; c = coeff[i] + movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1] + pxor mm1, mm1 + pxor mm4, mm4 + pcmpgtw mm1, mm0 ; sign(c) + pcmpgtw mm4, mm3 ; sign(c') + pxor mm2, mm2 + pxor mm5, mm5 + pcmpeqw mm2, mm0 ; c is zero + pcmpeqw mm5, mm3 ; c' is zero + pandn mm2, mm6 ; offset = isZero ? 
0 : quant_add + pandn mm5, mm6 + pxor mm0, mm1 ; negate if negative + pxor mm3, mm4 ; negate if negative + psubw mm0, mm1 + psubw mm3, mm4 + pmullw mm0, mm7 ; *= 2Q + pmullw mm3, mm7 ; *= 2Q + paddw mm0, mm2 ; + offset + paddw mm3, mm5 ; + offset + paddw mm0, mm1 ; negate back + paddw mm3, mm4 ; negate back - ; deal with DC + ; saturates to +2047 + movq mm2, [mmx_32767_minus_2047] + add eax, 2 + paddsw mm0, mm2 + paddsw mm3, mm2 + psubsw mm0, mm2 + psubsw mm3, mm2 + + pxor mm0, mm1 + pxor mm3, mm4 + movq [edx + 8*eax + 8*16 - 2*8], mm0 + movq [edx + 8*eax + 8*16+8 - 2*8], mm3 + jnz near .loop - movd mm0, [ecx] - pmullw mm0, [esp+16] ; dcscalar - movq mm2, [mmx_32767_minus_2047] - paddsw mm0, mm2 - psubsw mm0, mm2 - movq mm3, [mmx_32768_minus_2048] - psubsw mm0, mm3 - paddsw mm0, mm3 - movd eax, mm0 - mov [edx], ax - - ret + ; deal with DC + movd mm0, [ecx] + pmullw mm0, [esp+16] ; dcscalar + movq mm2, [mmx_32767_minus_2047] + paddsw mm0, mm2 + psubsw mm0, mm2 + movq mm3, [mmx_32768_minus_2048] + psubsw mm0, mm3 + paddsw mm0, mm3 + movd eax, mm0 + mov [edx], ax + + xor eax, eax ; return(0); + ret ;=========================================================================== ; -; void dequant_intra_xmm(int16_t *data, -; const int16_t const *coeff, -; const uint32_t quant, -; const uint32_t dcscalar); +; uint32_t dequant_h263_intra_xmm(int16_t *data, +; const int16_t const *coeff, +; const uint32_t quant, +; const uint32_t dcscalar); ; ;=========================================================================== @@ -796,147 +674,146 @@ cglobal dequant_h263_intra_xmm dequant_h263_intra_xmm: - mov edx, [esp+ 4] ; data - mov ecx, [esp+ 8] ; coeff - mov eax, [esp+12] ; quant - movq mm6, [mmx_add + eax*8 - 8] ; quant or quant-1 - movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant - mov eax, -16 + mov edx, [esp+ 4] ; data + mov ecx, [esp+ 8] ; coeff + mov eax, [esp+12] ; quant + movq mm6, [mmx_add + eax*8 - 8] ; quant or quant-1 + movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant + mov eax, 
-16 align ALIGN .loop - movq mm0, [ecx+8*eax+8*16] ; c = coeff[i] - movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1] - pxor mm1, mm1 - pxor mm4, mm4 - pcmpgtw mm1, mm0 ; sign(c) - pcmpgtw mm4, mm3 ; sign(c') - pxor mm2, mm2 - pxor mm5, mm5 - pcmpeqw mm2, mm0 ; c is zero - pcmpeqw mm5, mm3 ; c' is zero - pandn mm2, mm6 ; offset = isZero ? 0 : quant_add - pandn mm5, mm6 - pxor mm0, mm1 ; negate if negative - pxor mm3, mm4 ; negate if negative - psubw mm0, mm1 - psubw mm3, mm4 - pmullw mm0, mm7 ; *= 2Q - pmullw mm3, mm7 ; *= 2Q - paddw mm0, mm2 ; + offset - paddw mm3, mm5 ; + offset - paddw mm0, mm1 ; negate back - paddw mm3, mm4 ; negate back - - ; saturates to +2047 - movq mm2, [mmx_2047] - pminsw mm0, mm2 - add eax, 2 - pminsw mm3, mm2 - - pxor mm0, mm1 - pxor mm3, mm4 - movq [edx + 8*eax + 8*16 - 2*8], mm0 - movq [edx + 8*eax + 8*16+8 - 2*8], mm3 - jnz near .loop + movq mm0, [ecx+8*eax+8*16] ; c = coeff[i] + movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1] + pxor mm1, mm1 + pxor mm4, mm4 + pcmpgtw mm1, mm0 ; sign(c) + pcmpgtw mm4, mm3 ; sign(c') + pxor mm2, mm2 + pxor mm5, mm5 + pcmpeqw mm2, mm0 ; c is zero + pcmpeqw mm5, mm3 ; c' is zero + pandn mm2, mm6 ; offset = isZero ? 
0 : quant_add + pandn mm5, mm6 + pxor mm0, mm1 ; negate if negative + pxor mm3, mm4 ; negate if negative + psubw mm0, mm1 + psubw mm3, mm4 + pmullw mm0, mm7 ; *= 2Q + pmullw mm3, mm7 ; *= 2Q + paddw mm0, mm2 ; + offset + paddw mm3, mm5 ; + offset + paddw mm0, mm1 ; negate back + paddw mm3, mm4 ; negate back + + ; saturates to +2047 + movq mm2, [mmx_2047] + pminsw mm0, mm2 + add eax, 2 + pminsw mm3, mm2 + + pxor mm0, mm1 + pxor mm3, mm4 + movq [edx + 8*eax + 8*16 - 2*8], mm0 + movq [edx + 8*eax + 8*16+8 - 2*8], mm3 + jnz near .loop ; deal with DC + movd mm0, [ecx] + pmullw mm0, [esp+16] ; dcscalar + movq mm2, [mmx_32767_minus_2047] + paddsw mm0, mm2 + psubsw mm0, mm2 + movq mm2, [mmx_32768_minus_2048] + psubsw mm0, mm2 + paddsw mm0, mm2 + movd eax, mm0 + mov [edx], ax - movd mm0, [ecx] - pmullw mm0, [esp+16] ; dcscalar - movq mm2, [mmx_32767_minus_2047] - paddsw mm0, mm2 - psubsw mm0, mm2 - movq mm2, [mmx_32768_minus_2048] - psubsw mm0, mm2 - paddsw mm0, mm2 - movd eax, mm0 - mov [edx], ax - - ret + xor eax, eax + ret ;=========================================================================== ; -; void dequant_intra_sse2(int16_t *data, -; const int16_t const *coeff, -; const uint32_t quant, -; const uint32_t dcscalar); +; uint32_t dequant_h263_intra_sse2(int16_t *data, +; const int16_t const *coeff, +; const uint32_t quant, +; const uint32_t dcscalar); ; ;=========================================================================== + align ALIGN cglobal dequant_h263_intra_sse2 dequant_h263_intra_sse2: - mov edx, [esp+ 4] ; data - mov ecx, [esp+ 8] ; coeff - mov eax, [esp+12] ; quant - movq mm6, [mmx_add + eax * 8 - 8] - movq mm7, [mmx_mul + eax * 8 - 8] + mov edx, [esp+ 4] ; data + mov ecx, [esp+ 8] ; coeff + mov eax, [esp+12] ; quant + movq mm6, [mmx_add + eax * 8 - 8] + movq mm7, [mmx_mul + eax * 8 - 8] movq2dq xmm6, mm6 movq2dq xmm7, mm7 movlhps xmm6, xmm6 movlhps xmm7, xmm7 - mov eax, -16 + mov eax, -16 align ALIGN .loop - movdqa xmm0, [ecx + 8*16 + 8*eax] ; c = 
coeff[i] - movdqa xmm3, [ecx + 8*16 + 8*eax+ 16] - pxor xmm1, xmm1 - pxor xmm4, xmm4 - pcmpgtw xmm1, xmm0 ; sign(c) - pcmpgtw xmm4, xmm3 - pxor xmm2, xmm2 - pxor xmm5, xmm5 - pcmpeqw xmm2, xmm0 ; c is zero - pcmpeqw xmm5, xmm3 - pandn xmm2, xmm6 ; offset = isZero ? 0 : quant_add - pandn xmm5, xmm6 - pxor xmm0, xmm1 ; negate if negative - pxor xmm3, xmm4 - psubw xmm0, xmm1 - psubw xmm3, xmm4 - pmullw xmm0, xmm7 ; *= 2Q - pmullw xmm3, xmm7 - paddw xmm0, xmm2 ; + offset - paddw xmm3, xmm5 - paddw xmm0, xmm1 ; negate back - paddw xmm3, xmm4 + movdqa xmm0, [ecx + 8*16 + 8*eax] ; c = coeff[i] + movdqa xmm3, [ecx + 8*16 + 8*eax+ 16] + pxor xmm1, xmm1 + pxor xmm4, xmm4 + pcmpgtw xmm1, xmm0 ; sign(c) + pcmpgtw xmm4, xmm3 + pxor xmm2, xmm2 + pxor xmm5, xmm5 + pcmpeqw xmm2, xmm0 ; c is zero + pcmpeqw xmm5, xmm3 + pandn xmm2, xmm6 ; offset = isZero ? 0 : quant_add + pandn xmm5, xmm6 + pxor xmm0, xmm1 ; negate if negative + pxor xmm3, xmm4 + psubw xmm0, xmm1 + psubw xmm3, xmm4 + pmullw xmm0, xmm7 ; *= 2Q + pmullw xmm3, xmm7 + paddw xmm0, xmm2 ; + offset + paddw xmm3, xmm5 + paddw xmm0, xmm1 ; negate back + paddw xmm3, xmm4 ; saturates to +2047 - movdqa xmm2, [sse2_2047] - pminsw xmm0, xmm2 - add eax, 4 - pminsw xmm3, xmm2 - - pxor xmm0, xmm1 - pxor xmm3, xmm4 - movdqa [edx + 8*16 - 8*4 + 8*eax], xmm0 - movdqa [edx + 8*16 - 8*4 + 8*eax + 16], xmm3 - jnz near .loop + movdqa xmm2, [sse2_2047] + pminsw xmm0, xmm2 + add eax, 4 + pminsw xmm3, xmm2 + + pxor xmm0, xmm1 + pxor xmm3, xmm4 + movdqa [edx + 8*16 - 8*4 + 8*eax], xmm0 + movdqa [edx + 8*16 - 8*4 + 8*eax + 16], xmm3 + jnz near .loop ; deal with DC - movd mm0, [ecx] - pmullw mm0, [esp+16] ; dcscalar - movq mm2, [mmx_32767_minus_2047] - paddsw mm0, mm2 - psubsw mm0, mm2 - movq mm2, [mmx_32768_minus_2048] - psubsw mm0, mm2 - paddsw mm0, mm2 - movd eax, mm0 - mov [edx], ax + movd mm0, [ecx] + pmullw mm0, [esp+16] ; dcscalar + movq mm2, [mmx_32767_minus_2047] + paddsw mm0, mm2 + psubsw mm0, mm2 + movq mm2, [mmx_32768_minus_2048] + 
psubsw mm0, mm2 + paddsw mm0, mm2 + movd eax, mm0 + mov [edx], ax + xor eax, eax ret - - - ;=========================================================================== ; -; void dequant_inter_mmx(int16_t * data, -; const int16_t * const coeff, -; const uint32_t quant); +; uint32t dequant_h263_inter_mmx(int16_t * data, +; const int16_t * const coeff, +; const uint32_t quant); ; ;=========================================================================== @@ -944,59 +821,59 @@ cglobal dequant_h263_inter_mmx dequant_h263_inter_mmx: - mov edx, [esp+ 4] ; data - mov ecx, [esp+ 8] ; coeff - mov eax, [esp+12] ; quant - movq mm6, [mmx_add + eax*8 - 8] ; quant or quant-1 - movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant - mov eax, -16 + mov edx, [esp+ 4] ; data + mov ecx, [esp+ 8] ; coeff + mov eax, [esp+12] ; quant + movq mm6, [mmx_add + eax*8 - 8] ; quant or quant-1 + movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant + mov eax, -16 align ALIGN .loop - movq mm0, [ecx+8*eax+8*16] ; c = coeff[i] - movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1] - pxor mm1, mm1 - pxor mm4, mm4 - pcmpgtw mm1, mm0 ; sign(c) - pcmpgtw mm4, mm3 ; sign(c') - pxor mm2, mm2 - pxor mm5, mm5 - pcmpeqw mm2, mm0 ; c is zero - pcmpeqw mm5, mm3 ; c' is zero - pandn mm2, mm6 ; offset = isZero ? 
0 : quant_add - pandn mm5, mm6 - pxor mm0, mm1 ; negate if negative - pxor mm3, mm4 ; negate if negative - psubw mm0, mm1 - psubw mm3, mm4 - pmullw mm0, mm7 ; *= 2Q - pmullw mm3, mm7 ; *= 2Q - paddw mm0, mm2 ; + offset - paddw mm3, mm5 ; + offset - paddw mm0, mm1 ; negate back - paddw mm3, mm4 ; negate back - - ; saturates to +2047 - movq mm2, [mmx_32767_minus_2047] - add eax, 2 - paddsw mm0, mm2 - paddsw mm3, mm2 - psubsw mm0, mm2 - psubsw mm3, mm2 - - pxor mm0, mm1 - pxor mm3, mm4 - movq [edx + 8*eax + 8*16 - 2*8], mm0 - movq [edx + 8*eax + 8*16+8 - 2*8], mm3 - jnz near .loop - - ret + movq mm0, [ecx+8*eax+8*16] ; c = coeff[i] + movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1] + pxor mm1, mm1 + pxor mm4, mm4 + pcmpgtw mm1, mm0 ; sign(c) + pcmpgtw mm4, mm3 ; sign(c') + pxor mm2, mm2 + pxor mm5, mm5 + pcmpeqw mm2, mm0 ; c is zero + pcmpeqw mm5, mm3 ; c' is zero + pandn mm2, mm6 ; offset = isZero ? 0 : quant_add + pandn mm5, mm6 + pxor mm0, mm1 ; negate if negative + pxor mm3, mm4 ; negate if negative + psubw mm0, mm1 + psubw mm3, mm4 + pmullw mm0, mm7 ; *= 2Q + pmullw mm3, mm7 ; *= 2Q + paddw mm0, mm2 ; + offset + paddw mm3, mm5 ; + offset + paddw mm0, mm1 ; negate back + paddw mm3, mm4 ; negate back + ; saturates to +2047 + movq mm2, [mmx_32767_minus_2047] + add eax, 2 + paddsw mm0, mm2 + paddsw mm3, mm2 + psubsw mm0, mm2 + psubsw mm3, mm2 + + pxor mm0, mm1 + pxor mm3, mm4 + movq [edx + 8*eax + 8*16 - 2*8], mm0 + movq [edx + 8*eax + 8*16+8 - 2*8], mm3 + jnz near .loop + + xor eax, eax + ret ;=========================================================================== ; -; void dequant_inter_xmm(int16_t * data, -; const int16_t * const coeff, -; const uint32_t quant); +; uint32_t dequant_h263_inter_xmm(int16_t * data, +; const int16_t * const coeff, +; const uint32_t quant); ; ;=========================================================================== @@ -1007,110 +884,112 @@ cglobal dequant_h263_inter_xmm dequant_h263_inter_xmm: - mov edx, [esp+ 4] ; data - mov 
ecx, [esp+ 8] ; coeff - mov eax, [esp+12] ; quant - movq mm6, [mmx_add + eax*8 - 8] ; quant or quant-1 - movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant - mov eax, -16 + mov edx, [esp+ 4] ; data + mov ecx, [esp+ 8] ; coeff + mov eax, [esp+12] ; quant + movq mm6, [mmx_add + eax*8 - 8] ; quant or quant-1 + movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant + mov eax, -16 align ALIGN .loop - movq mm0, [ecx+8*eax+8*16] ; c = coeff[i] - movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1] - pxor mm1, mm1 - pxor mm4, mm4 - pcmpgtw mm1, mm0 ; sign(c) - pcmpgtw mm4, mm3 ; sign(c') - pxor mm2, mm2 - pxor mm5, mm5 - pcmpeqw mm2, mm0 ; c is zero - pcmpeqw mm5, mm3 ; c' is zero - pandn mm2, mm6 ; offset = isZero ? 0 : quant_add - pandn mm5, mm6 - pxor mm0, mm1 ; negate if negative - pxor mm3, mm4 ; negate if negative - psubw mm0, mm1 - psubw mm3, mm4 - pmullw mm0, mm7 ; *= 2Q - pmullw mm3, mm7 ; *= 2Q - paddw mm0, mm2 ; + offset - paddw mm3, mm5 ; + offset - paddw mm0, mm1 ; start restoring sign - paddw mm3, mm4 ; start restoring sign - - ; saturates to +2047 - movq mm2, [mmx_2047] - pminsw mm0, mm2 - add eax, 2 - pminsw mm3, mm2 - - pxor mm0, mm1 ; finish restoring sign - pxor mm3, mm4 ; finish restoring sign - movq [edx + 8*eax + 8*16 - 2*8], mm0 - movq [edx + 8*eax + 8*16+8 - 2*8], mm3 - jnz near .loop - - ret + movq mm0, [ecx+8*eax+8*16] ; c = coeff[i] + movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1] + pxor mm1, mm1 + pxor mm4, mm4 + pcmpgtw mm1, mm0 ; sign(c) + pcmpgtw mm4, mm3 ; sign(c') + pxor mm2, mm2 + pxor mm5, mm5 + pcmpeqw mm2, mm0 ; c is zero + pcmpeqw mm5, mm3 ; c' is zero + pandn mm2, mm6 ; offset = isZero ? 
0 : quant_add + pandn mm5, mm6 + pxor mm0, mm1 ; negate if negative + pxor mm3, mm4 ; negate if negative + psubw mm0, mm1 + psubw mm3, mm4 + pmullw mm0, mm7 ; *= 2Q + pmullw mm3, mm7 ; *= 2Q + paddw mm0, mm2 ; + offset + paddw mm3, mm5 ; + offset + paddw mm0, mm1 ; start restoring sign + paddw mm3, mm4 ; start restoring sign + ; saturates to +2047 + movq mm2, [mmx_2047] + pminsw mm0, mm2 + add eax, 2 + pminsw mm3, mm2 + + pxor mm0, mm1 ; finish restoring sign + pxor mm3, mm4 ; finish restoring sign + movq [edx + 8*eax + 8*16 - 2*8], mm0 + movq [edx + 8*eax + 8*16+8 - 2*8], mm3 + jnz near .loop + + xor eax, eax + ret ;=========================================================================== ; -; void dequant_inter_sse2(int16_t * data, -; const int16_t * const coeff, -; const uint32_t quant); +; uint32_t dequant_h263_inter_sse2(int16_t * data, +; const int16_t * const coeff, +; const uint32_t quant); ; ;=========================================================================== + align ALIGN cglobal dequant_h263_inter_sse2 dequant_h263_inter_sse2: - mov edx, [esp + 4] ; data - mov ecx, [esp + 8] ; coeff - mov eax, [esp + 12] ; quant - movq mm6, [mmx_add + eax * 8 - 8] - movq mm7, [mmx_mul + eax * 8 - 8] + mov edx, [esp + 4] ; data + mov ecx, [esp + 8] ; coeff + mov eax, [esp + 12] ; quant + movq mm6, [mmx_add + eax * 8 - 8] + movq mm7, [mmx_mul + eax * 8 - 8] movq2dq xmm6, mm6 - movq2dq xmm7, mm7 - movlhps xmm6, xmm6 + movq2dq xmm7, mm7 + movlhps xmm6, xmm6 movlhps xmm7, xmm7 - mov eax, -16 + mov eax, -16 align ALIGN .loop - movdqa xmm0, [ecx + 8*16 + 8*eax] ; c = coeff[i] - movdqa xmm3, [ecx + 8*16 + 8*eax + 16] + movdqa xmm0, [ecx + 8*16 + 8*eax] ; c = coeff[i] + movdqa xmm3, [ecx + 8*16 + 8*eax + 16] - pxor xmm1, xmm1 - pxor xmm4, xmm4 + pxor xmm1, xmm1 + pxor xmm4, xmm4 pcmpgtw xmm1, xmm0 ; sign(c) pcmpgtw xmm4, xmm3 - pxor xmm2, xmm2 - pxor xmm5, xmm5 + pxor xmm2, xmm2 + pxor xmm5, xmm5 pcmpeqw xmm2, xmm0 ; c is zero pcmpeqw xmm5, xmm3 - pandn xmm2, xmm6 - 
pandn xmm5, xmm6 - pxor xmm0, xmm1 ; negate if negative - pxor xmm3, xmm4 - psubw xmm0, xmm1 - psubw xmm3, xmm4 - pmullw xmm0, xmm7 ; *= 2Q - pmullw xmm3, xmm7 - paddw xmm0, xmm2 ; + offset - paddw xmm3, xmm5 + pandn xmm2, xmm6 + pandn xmm5, xmm6 + pxor xmm0, xmm1 ; negate if negative + pxor xmm3, xmm4 + psubw xmm0, xmm1 + psubw xmm3, xmm4 + pmullw xmm0, xmm7 ; *= 2Q + pmullw xmm3, xmm7 + paddw xmm0, xmm2 ; + offset + paddw xmm3, xmm5 - paddw xmm0, xmm1 ; start restoring sign - paddw xmm3, xmm4 + paddw xmm0, xmm1 ; start restoring sign + paddw xmm3, xmm4 ; saturates to +2047 - movdqa xmm2, [sse2_2047] - pminsw xmm0, xmm2 - add eax, 4 - pminsw xmm3, xmm2 - - pxor xmm0, xmm1 ; finish restoring sign - pxor xmm3, xmm4 - movdqa [edx + 8*16 - 8*4 + 8*eax], xmm0 - movdqa [edx + 8*16 - 8*4 + 8*eax + 16], xmm3 - jnz near .loop + movdqa xmm2, [sse2_2047] + pminsw xmm0, xmm2 + add eax, 4 + pminsw xmm3, xmm2 + + pxor xmm0, xmm1 ; finish restoring sign + pxor xmm3, xmm4 + movdqa [edx + 8*16 - 8*4 + 8*eax], xmm0 + movdqa [edx + 8*16 - 8*4 + 8*eax + 16], xmm3 + jnz near .loop + xor eax, eax ret