1 |
;/************************************************************************** |
;/************************************************************************** |
2 |
; * |
; * |
3 |
; * XVID MPEG-4 VIDEO CODEC |
; * XVID MPEG-4 VIDEO CODEC |
4 |
; * - mmx quantization/dequantization - |
; * - 3dne Quantization/Dequantization - |
5 |
; * |
; * |
6 |
; * Copyright(C) 2001-2003 XviD Team <xvid-devel@xvid.org> |
; * Copyright (C) 2002-2003 Peter Ross <pross@xvid.org> |
7 |
|
; * 2002 Jaan Kalda |
8 |
; * |
; * |
9 |
; * This program is free software ; you can redistribute it and/or modify |
; * This program is free software ; you can redistribute it and/or modify |
10 |
; * it under the terms of the GNU General Public License as published by |
; * it under the terms of the GNU General Public License as published by |
20 |
; * along with this program ; if not, write to the Free Software |
; * along with this program ; if not, write to the Free Software |
21 |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
22 |
; * |
; * |
23 |
; * $Id: quantize_mpeg_xmm.asm,v 1.1.2.1 2003-10-07 13:02:35 edgomez Exp $ |
; * $Id: quantize_mpeg_xmm.asm,v 1.1.2.2 2003-10-09 18:50:22 edgomez Exp $ |
24 |
; * |
; * |
25 |
; *************************************************************************/ |
; *************************************************************************/ |
26 |
;/************************************************************************** |
|
|
; * quant4 bugs have been fixed: (a) overflow bug for matrix elements |
|
|
; * equal to 1 or 2 is fixed by substituting pmulhw with pmulhuw (iSSE) |
|
|
; * and using multiplier 0ffffh instead of 10001h (for matrix element = 1; |
|
|
; * in that case, 1 is added before multiplying, that additional 1 comes |
|
|
; * from intra_matrix1; (b) rounding error for large coefficients and matrix |
|
|
; * elements is fixed by two-step approach: first approximation (rounded |
|
|
; * down) is found as usual; the result is multiplied by the matrix element |
|
|
; * and mismatch is used to calculate the correction. |
|
|
; *************************************************************************/ |
|
27 |
; _3dne functions are compatible with iSSE, but are optimized specifically |
; _3dne functions are compatible with iSSE, but are optimized specifically |
28 |
; for K7 pipelines |
; for K7 pipelines |
|
; |
|
|
;--------------------------------------------------------------------------- |
|
|
; 09.12.2002 Athlon optimizations contributed by Jaan Kalda |
|
|
;--------------------------------------------------------------------------- |
|
|
|
|
29 |
|
|
30 |
; data/text alignment |
; data/text alignment |
31 |
%define ALIGN 8 |
%define ALIGN 8 |
33 |
|
|
34 |
bits 32 |
bits 32 |
35 |
|
|
|
%ifdef FORMAT_COFF |
|
|
SECTION .data data |
|
|
%else |
|
|
SECTION .data data align=8 |
|
|
%endif |
|
|
|
|
36 |
%macro cglobal 1 |
%macro cglobal 1 |
37 |
%ifdef PREFIX |
%ifdef PREFIX |
38 |
global _%1 |
global _%1 |
50 |
extern %1 |
extern %1 |
51 |
%endif |
%endif |
52 |
%endmacro |
%endmacro |
|
align 8 |
|
|
mmzero dd 0,0 |
|
53 |
|
|
54 |
mmx_one times 4 dw 1 |
;*************************************************************************** |
55 |
|
; Local data |
56 |
|
;*************************************************************************** |
57 |
|
|
58 |
|
%ifdef FORMAT_COFF |
59 |
|
SECTION .data data |
60 |
|
%else |
61 |
|
SECTION .data data align=8 |
62 |
|
%endif |
63 |
|
|
64 |
|
align 8 |
65 |
|
mmzero: |
66 |
|
dd 0,0 |
67 |
|
mmx_one: |
68 |
|
times 4 dw 1 |
69 |
|
|
70 |
;=========================================================================== |
;=========================================================================== |
71 |
; |
; |
74 |
;=========================================================================== |
;=========================================================================== |
75 |
|
|
76 |
align ALIGN |
align ALIGN |
77 |
mmx_divs ;i>2 |
mmx_divs: ;i>2 |
78 |
%assign i 1 |
%assign i 1 |
79 |
%rep 31 |
%rep 31 |
80 |
times 4 dw ((1 << 15) / i + 1) |
times 4 dw ((1 << 15) / i + 1) |
82 |
%endrep |
%endrep |
83 |
|
|
84 |
align ALIGN |
align ALIGN |
85 |
mmx_div ;i>2 |
mmx_div: ;i>2 |
86 |
%assign i 1 |
%assign i 1 |
87 |
%rep 31 |
%rep 31 |
88 |
times 4 dw ((1 << 16) / i + 1) |
times 4 dw ((1 << 16) / i + 1) |
117 |
cextern inter_matrix_fixl |
cextern inter_matrix_fixl |
118 |
|
|
119 |
|
|
|
%define VM18P 3 |
|
|
%define VM18Q 4 |
|
120 |
%define nop4 db 08Dh,074h,026h,0 |
%define nop4 db 08Dh,074h,026h,0 |
121 |
%define nop3 add esp,byte 0 |
%define nop3 add esp,byte 0 |
122 |
%define nop2 mov esp,esp |
%define nop2 mov esp,esp |
129 |
; |
; |
130 |
;=========================================================================== |
;=========================================================================== |
131 |
|
|
132 |
|
%define VM18P 3 |
133 |
|
%define VM18Q 4 |
134 |
|
|
135 |
quantd |
align 16 |
136 |
|
quantd: |
137 |
%assign i 1 |
%assign i 1 |
138 |
%rep 31 |
%rep 31 |
139 |
times 4 dw (((VM18P*i) + (VM18Q/2)) / VM18Q) |
times 4 dw (((VM18P*i) + (VM18Q/2)) / VM18Q) |
146 |
; |
; |
147 |
;=========================================================================== |
;=========================================================================== |
148 |
|
|
149 |
|
align 16 |
150 |
mmx_mul_quant |
mmx_mul_quant: |
151 |
%assign i 1 |
%assign i 1 |
152 |
%rep 31 |
%rep 31 |
153 |
times 4 dw i |
times 4 dw i |
161 |
;=========================================================================== |
;=========================================================================== |
162 |
|
|
163 |
align 16 |
align 16 |
164 |
|
mmx_32767_minus_2047: |
165 |
|
times 4 dw (32767-2047) |
166 |
|
mmx_32768_minus_2048: |
167 |
|
times 4 dw (32768-2048) |
168 |
|
mmx_2047: |
169 |
|
times 4 dw 2047 |
170 |
|
mmx_minus_2048: |
171 |
|
times 4 dw (-2048) |
172 |
|
zero: |
173 |
|
times 4 dw 0 |
174 |
|
|
175 |
mmx_32767_minus_2047 times 4 dw (32767-2047) |
int_div: |
|
mmx_32768_minus_2048 times 4 dw (32768-2048) |
|
|
mmx_2047 times 4 dw 2047 |
|
|
mmx_minus_2048 times 4 dw (-2048) |
|
|
zero times 4 dw 0 |
|
|
|
|
|
int_div |
|
176 |
dd 0 |
dd 0 |
177 |
%assign i 1 |
%assign i 1 |
178 |
%rep 255 |
%rep 255 |
180 |
%assign i i+1 |
%assign i i+1 |
181 |
%endrep |
%endrep |
182 |
|
|
183 |
|
;*************************************************************************** |
184 |
|
; Code |
185 |
|
;*************************************************************************** |
186 |
|
|
187 |
section .text |
section .text |
188 |
|
|
189 |
;=========================================================================== |
;=========================================================================== |
190 |
; |
; |
191 |
; void quant4_intra_xmm(int16_t * coeff, |
; uint32_t quant_mpeg_intra_xmm(int16_t * coeff, |
192 |
; const int16_t const * data, |
; const int16_t const * data, |
193 |
; const uint32_t quant, |
; const uint32_t quant, |
194 |
; const uint32_t dcscalar); |
; const uint32_t dcscalar); |
215 |
jg near .lloop |
jg near .lloop |
216 |
nop6 |
nop6 |
217 |
|
|
|
|
|
218 |
align ALIGN |
align ALIGN |
219 |
.loop |
.loop |
220 |
movq mm1, [eax + 8*esi+112] ; mm0 = [1st] |
movq mm1, [eax + 8*esi+112] ; mm0 = [1st] |
289 |
add esp, byte 12 |
add esp, byte 12 |
290 |
mov [edx], cx ; coeff[0] = ax |
mov [edx], cx ; coeff[0] = ax |
291 |
|
|
292 |
|
xor eax, eax |
293 |
ret |
ret |
294 |
|
|
295 |
align ALIGN |
align ALIGN |
397 |
|
|
398 |
;=========================================================================== |
;=========================================================================== |
399 |
; |
; |
400 |
; uint32_t quant4_inter_xmm(int16_t * coeff, |
; uint32_t quant_mpeg_inter_xmm(int16_t * coeff, |
401 |
; const int16_t const * data, |
; const int16_t const * data, |
402 |
; const uint32_t quant); |
; const uint32_t quant); |
403 |
; |
; |
597 |
|
|
598 |
;=========================================================================== |
;=========================================================================== |
599 |
; |
; |
600 |
; void dequant4_intra_mmx(int16_t *data, |
; uint32_t dequant_mpeg_intra_3dne(int16_t *data, |
601 |
; const int16_t const *coeff, |
; const int16_t const *coeff, |
602 |
; const uint32_t quant, |
; const uint32_t quant, |
603 |
; const uint32_t dcscalar); |
; const uint32_t dcscalar); |
614 |
; and quant in [1..31]. |
; and quant in [1..31]. |
615 |
; |
; |
616 |
;******************************************************************** |
;******************************************************************** |
617 |
|
|
618 |
%macro DEQUANT4INTRAMMX 1 |
%macro DEQUANT4INTRAMMX 1 |
619 |
movq mm1, [byte ecx+ 16 * %1] ; mm0 = c = coeff[i] |
movq mm1, [byte ecx+ 16 * %1] ; mm0 = c = coeff[i] |
620 |
movq mm4, [ecx+ 16 * %1 +8]; mm3 = c' = coeff[i+1] |
movq mm4, [ecx+ 16 * %1 +8]; mm3 = c' = coeff[i+1] |
675 |
imul ebx,[esp+16+8+32] ; dcscalar |
imul ebx,[esp+16+8+32] ; dcscalar |
676 |
movq mm2,mm7 |
movq mm2,mm7 |
677 |
|
|
|
|
|
678 |
align 4 |
align 4 |
679 |
|
|
680 |
DEQUANT4INTRAMMX 0 |
DEQUANT4INTRAMMX 0 |
708 |
|
|
709 |
add esp, byte 32+8 |
add esp, byte 32+8 |
710 |
|
|
711 |
|
xor eax, eax |
712 |
ret |
ret |
713 |
|
|
714 |
;=========================================================================== |
;=========================================================================== |
715 |
; |
; |
716 |
; void dequant4_inter_3dne(int16_t * data, |
; uint32_t dequant_mpeg_inter_3dne(int16_t * data, |
717 |
; const int16_t * const coeff, |
; const int16_t * const coeff, |
718 |
; const uint32_t quant); |
; const uint32_t quant); |
719 |
; |
; |
819 |
add esp,byte 4 |
add esp,byte 4 |
820 |
xor word [edx + 2*63], ax |
xor word [edx + 2*63], ax |
821 |
|
|
822 |
|
xor eax, eax |
823 |
ret |
ret |