1 |
;/************************************************************************** |
;/************************************************************************** |
2 |
; * |
; * |
3 |
; * XVID MPEG-4 VIDEO CODEC |
; * XVID MPEG-4 VIDEO CODEC |
4 |
; * mmx quantization/dequantization |
; * - mmx quantization/dequantization - |
5 |
; * |
; * |
6 |
; * This program is an implementation of a part of one or more MPEG-4 |
; * Copyright(C) 2001-2003 XviD Team <xvid-devel@xvid.org> |
|
; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending |
|
|
; * to use this software module in hardware or software products are |
|
|
; * advised that its use may infringe existing patents or copyrights, and |
|
|
; * any such use would be at such party's own risk. The original |
|
|
; * developer of this software module and his/her company, and subsequent |
|
|
; * editors and their companies, will have no liability for use of this |
|
|
; * software or modifications or derivatives thereof. |
|
7 |
; * |
; * |
8 |
; * This program is free software; you can redistribute it and/or modify |
; * This program is free software; you can redistribute it and/or modify |
9 |
; * it under the terms of the GNU General Public License as published by |
; * it under the terms of the GNU General Public License as published by |
17 |
; * |
; * |
18 |
; * You should have received a copy of the GNU General Public License |
; * You should have received a copy of the GNU General Public License |
19 |
; * along with this program; if not, write to the Free Software |
; * along with this program; if not, write to the Free Software |
20 |
; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 |
|
; * |
22 |
|
; * $Id: quantize_3dne.asm,v 1.2.2.1 2003-07-16 22:59:20 edgomez Exp $ |
23 |
; * |
; * |
24 |
; *************************************************************************/ |
; *************************************************************************/ |
25 |
; these 3dne functions are compatible with iSSE, but are optimized specifically for |
; these 3dne functions are compatible with iSSE, but are optimized specifically for |
178 |
;Optimized by Jaan, 30 Nov 2002 |
;Optimized by Jaan, 30 Nov 2002 |
179 |
|
|
180 |
%macro quant_intra1 1 |
%macro quant_intra1 1 |
181 |
|
|
182 |
psubw mm1,mm0 ;A3 |
psubw mm1,mm0 ;A3 |
183 |
psubw mm3,mm2 ;B3 |
psubw mm3,mm2 ;B3 |
184 |
%if (%1) |
%if (%1) |
247 |
%endm |
%endm |
248 |
|
|
249 |
|
|
250 |
%macro quant_intra 1 ;rules for athlon: 1) schedule latencies, 2) add/mul and load/store in 2:1 proportion, |
%macro quant_intra 1 |
251 |
|
; Rules for athlon: |
252 |
|
; 1) schedule latencies |
253 |
|
; 2) add/mul and load/store in 2:1 proportion |
254 |
; 3) avoid splitting >3byte instructions over 8byte boundaries |
; 3) avoid splitting >3byte instructions over 8byte boundaries |
255 |
|
|
256 |
psubw mm1,mm0 ;A3 |
psubw mm1,mm0 ;A3 |
257 |
psubw mm3,mm2 ;B3 |
psubw mm3,mm2 ;B3 |
258 |
%if (%1) |
%if (%1) |
346 |
mov ebx,mmzero |
mov ebx,mmzero |
347 |
push edi |
push edi |
348 |
jz near .q1loop |
jz near .q1loop |
349 |
|
|
350 |
quant_intra 0 |
quant_intra 0 |
351 |
mov ebp, [esp + 16 + 16] ; dcscalar |
mov ebp, [esp + 16 + 16] ; dcscalar |
352 |
movsx eax, word [byte ecx] ;x |
; NB -- there are 3 pushes in the function preamble and one more |
353 |
|
; in "quant_intra 0", thus an added offset of 16 bytes |
354 |
|
movsx eax, word [byte ecx] ; DC |
355 |
|
|
356 |
quant_intra 1 |
quant_intra 1 |
357 |
mov edi,eax |
mov edi,eax |
358 |
sar edi,31 ;sign(x) |
sar edi, 31 ; sign(DC) |
359 |
shr ebp,byte 1 ; ebp = dcscalar /2 |
shr ebp,byte 1 ; ebp = dcscalar /2 |
360 |
|
|
361 |
quant_intra 2 |
quant_intra 2 |
362 |
sub eax,edi ; x (+1) |
sub eax, edi ; DC (+1) |
363 |
xor ebp,edi ;sign(x) dcscalar /2 (-1) |
xor ebp, edi ; sign(DC) dcscalar /2 (-1) |
364 |
mov edi,[esp + 16 + 16] |
mov edi, [esp + 16 + 16] ; dcscalar |
365 |
lea eax,[byte eax+ebp] ;x + sign(x) dcscalar /2 |
lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar/2 |
366 |
mov ebp,[byte esp] |
mov ebp,[byte esp] |
367 |
|
|
368 |
quant_intra 3 |
quant_intra 3 |
369 |
psubw mm5, mm4 ;C8 |
psubw mm5, mm4 ;C8 |
370 |
mov esi,[esp+12] |
mov esi, [esp + 12] ; pop back the register value |
371 |
mov edi,[esp+4] |
mov edi, [esp + 4] ; pop back the register value |
|
mov ebx,[esp+8] |
|
|
add esp,byte 16 |
|
372 |
sar eax,16 |
sar eax,16 |
373 |
|
lea ebx, [byte eax + 1] ; workaround for eax < 0 |
374 |
|
cmovs eax, ebx ; conditionally move the corrected value |
375 |
mov [edx], ax ; coeff[0] = ax |
mov [edx], ax ; coeff[0] = ax |
376 |
|
mov ebx, [esp + 8] ; pop back the register value |
377 |
|
add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16 |
378 |
psubw mm7, mm6 ;D8 |
psubw mm7, mm6 ;D8 |
379 |
movq [edx + 3 * 32 + 16], mm5 ;C9 |
movq [edx + 3 * 32 + 16], mm5 ;C9 |
380 |
movq [edx + 3 * 32 + 24], mm7 ;D9 |
movq [edx + 3 * 32 + 24], mm7 ;D9 |
381 |
|
|
382 |
ret |
ret |
383 |
|
|
384 |
align 16 |
align 16 |
385 |
|
|
386 |
.q1loop |
.q1loop |
387 |
quant_intra1 0 |
quant_intra1 0 |
388 |
mov ebp, [esp + 16 + 16] ; dcscalar |
mov ebp, [esp + 16 + 16] ; dcscalar |
389 |
movsx eax, word [byte ecx] ;x |
movsx eax, word [byte ecx] ; DC |
390 |
|
|
391 |
quant_intra1 1 |
quant_intra1 1 |
392 |
mov edi,eax |
mov edi,eax |
393 |
sar edi,31 ;sign(x) |
sar edi, 31 ; sign(DC) |
394 |
shr ebp,byte 1 ; ebp = dcscalar /2 |
shr ebp,byte 1 ; ebp = dcscalar /2 |
395 |
|
|
396 |
quant_intra1 2 |
quant_intra1 2 |
397 |
sub eax,edi ; x (+1) |
sub eax, edi ; DC (+1) |
398 |
xor ebp,edi ;sign(x) dcscalar /2 (-1) |
xor ebp, edi ; sign(DC) dcscalar /2 (-1) |
399 |
mov edi,[esp + 16 + 16] |
mov edi, [esp + 16 + 16] ; dcscalar |
400 |
lea eax,[byte eax+ebp] ;x + sign(x) dcscalar /2 |
lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar /2 |
401 |
mov ebp,[byte esp] |
mov ebp,[byte esp] |
402 |
|
|
403 |
quant_intra1 3 |
quant_intra1 3 |
404 |
psubw mm5, mm4 ;C8 |
psubw mm5, mm4 ;C8 |
405 |
mov esi,[dword esp+12] |
mov esi, [dword esp + 12] ; pop back the register value |
406 |
mov edi,[esp+4] |
mov edi, [esp + 4] ; pop back the register value |
|
mov ebx,[esp+8] |
|
|
add esp,byte 16 |
|
407 |
sar eax,16 |
sar eax,16 |
408 |
|
lea ebx, [byte eax + 1] ; workaround for eax < 0 |
409 |
|
cmovs eax, ebx ; conditionally move the corrected value |
410 |
mov [edx], ax ; coeff[0] = ax |
mov [edx], ax ; coeff[0] = ax |
411 |
|
mov ebx, [esp + 8] ; pop back the register value |
412 |
|
add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16 |
413 |
psubw mm7, mm6 ;D8 |
psubw mm7, mm6 ;D8 |
414 |
movq [edx + 3 * 32 + 16], mm5 ;C9 |
movq [edx + 3 * 32 + 16], mm5 ;C9 |
415 |
movq [edx + 3 * 32 + 24], mm7 ;D9 |
movq [edx + 3 * 32 + 24], mm7 ;D9 |
416 |
|
|
417 |
ret |
ret |
418 |
|
|
419 |
|
|
503 |
align ALIGN |
align ALIGN |
504 |
cglobal quant_inter_3dne |
cglobal quant_inter_3dne |
505 |
quant_inter_3dne |
quant_inter_3dne |
|
|
|
506 |
mov edx, [esp + 4] ; coeff |
mov edx, [esp + 4] ; coeff |
507 |
mov ecx, [esp + 8] ; data |
mov ecx, [esp + 8] ; data |
508 |
mov eax, [esp + 12] ; quant |
mov eax, [esp + 12] ; quant |
530 |
quantinter 2 |
quantinter 2 |
531 |
quantinter 3 |
quantinter 3 |
532 |
quantinter 4 |
quantinter 4 |
533 |
|
|
534 |
psraw mm3,15 ;B6 |
psraw mm3,15 ;B6 |
535 |
psubw mm2, mm6 ;C10 |
psubw mm2, mm6 ;C10 |
536 |
pmulhw mm4, mm7 ; B7 |
pmulhw mm4, mm7 ; B7 |
545 |
punpckhdq mm5, mm5 |
punpckhdq mm5, mm5 |
546 |
paddd mm0, mm5 |
paddd mm0, mm5 |
547 |
movd eax, mm0 ; return sum |
movd eax, mm0 ; return sum |
548 |
|
|
549 |
ret |
ret |
550 |
|
|
551 |
align ALIGN |
align ALIGN |
552 |
.q1loop |
.q1loop |
553 |
movq mm6,[byte ebx] |
movq mm6,[byte ebx] |
554 |
|
|
555 |
quantinter1 0 |
quantinter1 0 |
556 |
quantinter1 1 |
quantinter1 1 |
557 |
quantinter1 2 |
quantinter1 2 |
586 |
;This is Athlon-optimized code (ca 106 clk per call) |
;This is Athlon-optimized code (ca 106 clk per call) |
587 |
|
|
588 |
%macro dequant 1 |
%macro dequant 1 |
589 |
movq mm1, [ecx+%1*24] ;A2 ; c = coeff[i] |
movq mm1, [ecx+%1*24] ; c = coeff[i] ;A2 |
590 |
psubw mm0,mm1 ;-c ;A3 (1st dep) |
psubw mm0,mm1 ;-c ;A3 (1st dep) |
591 |
%if (%1) |
%if (%1) |
592 |
paddw mm4,mm6 ; C11 mm6 free (4th+) |
paddw mm4,mm6 ; C11 mm6 free (4th+) |
594 |
pmaxsw mm0,mm1 ;|c| ;A4 (2nd) |
pmaxsw mm0,mm1 ;|c| ;A4 (2nd) |
595 |
%if (%1) |
%if (%1) |
596 |
mov ebp,ebp |
mov ebp,ebp |
597 |
pminsw mm4,[ebx] ; C12 saturates to +2047 (5th+) later |
pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) later |
598 |
%endif |
%endif |
599 |
movq mm6,[esi] ;0 ;A5 mm6 in use |
movq mm6,[esi] ;0 ;A5 mm6 in use |
600 |
pandn mm7,[eax] ; B9 offset = isZero ? 0 : quant_add (2nd) |
pandn mm7,[eax] ; B9 offset = isZero ? 0 : quant_add (2nd) |
676 |
pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) |
pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) |
677 |
psraw mm3,15 ; sign(c) ;B7 (2nd) |
psraw mm3,15 ; sign(c) ;B7 (2nd) |
678 |
mov edx, [esp+ 4+16] ; data |
mov edx, [esp+ 4+16] ; data |
679 |
|
|
680 |
align 8 |
align 8 |
681 |
dequant 0 |
dequant 0 |
682 |
|
|
683 |
cmp ebp,-2048 |
cmp ebp,-2048 |
684 |
mov esp,esp |
mov esp,esp |
685 |
|
|
686 |
dequant 1 |
dequant 1 |
687 |
|
|
688 |
cmovl ebp,[int_2048] |
cmovl ebp,[int_2048] |
689 |
nop |
nop |
690 |
|
|
691 |
dequant 2 |
dequant 2 |
692 |
|
|
693 |
cmp ebp,2047 |
cmp ebp,2047 |
694 |
mov esp,esp |
mov esp,esp |
695 |
|
|
696 |
dequant 3 |
dequant 3 |
697 |
|
|
698 |
cmovg ebp,[int2047] |
cmovg ebp,[int2047] |
699 |
nop |
nop |
700 |
|
|
701 |
dequant 4 |
dequant 4 |
702 |
|
|
703 |
paddw mm4,mm6 ; C11 mm6 free (4th+) |
paddw mm4,mm6 ; C11 mm6 free (4th+) |
735 |
align ALIGN |
align ALIGN |
736 |
cglobal dequant_inter_3dne |
cglobal dequant_inter_3dne |
737 |
dequant_inter_3dne: |
dequant_inter_3dne: |
|
|
|
738 |
mov ecx, [esp+ 8] ; coeff |
mov ecx, [esp+ 8] ; coeff |
739 |
mov eax, [esp+12] ; quant |
mov eax, [esp+12] ; quant |
740 |
pxor mm0,mm0 |
pxor mm0,mm0 |
754 |
pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) |
pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) |
755 |
psraw mm3,15 ; sign(c) ;B7 (2nd) |
psraw mm3,15 ; sign(c) ;B7 (2nd) |
756 |
mov edx, [dword esp+ 4+12] ; data |
mov edx, [dword esp+ 4+12] ; data |
757 |
|
|
758 |
align 8 |
align 8 |
759 |
|
|
760 |
dequant 0 |
dequant 0 |
761 |
dequant 1 |
dequant 1 |
762 |
dequant 2 |
dequant 2 |
777 |
add esp,byte 12 |
add esp,byte 12 |
778 |
pxor mm3, mm2 ; B13 (6th+) |
pxor mm3, mm2 ; B13 (6th+) |
779 |
movq [edx+4*24+8], mm3 ; B14 (7th) |
movq [edx+4*24+8], mm3 ; B14 (7th) |
780 |
|
|
781 |
ret |
ret |