1 |
;/****************************************************************************** |
;/************************************************************************** |
2 |
; * * |
; * |
3 |
; * This file is part of XviD, a free MPEG-4 video encoder/decoder * |
; * XVID MPEG-4 VIDEO CODEC |
4 |
; * * |
; * - mmx quantization/dequantization - |
5 |
; * XviD is an implementation of a part of one or more MPEG-4 Video tools * |
; * |
6 |
; * as specified in ISO/IEC 14496-2 standard. Those intending to use this * |
; * Copyright(C) 2001-2003 XviD Team <xvid-devel@xvid.org> |
7 |
; * software module in hardware or software products are advised that its * |
; * |
8 |
; * use may infringe existing patents or copyrights, and any such use * |
; * This program is free software ; you can redistribute it and/or modify |
9 |
; * would be at such party's own risk. The original developer of this * |
; * it under the terms of the GNU General Public License as published by |
10 |
; * software module and his/her company, and subsequent editors and their * |
; * the Free Software Foundation ; either version 2 of the License, or |
11 |
; * companies, will have no liability for use of this software or * |
; * (at your option) any later version. |
12 |
; * modifications or derivatives thereof. * |
; * |
13 |
; * * |
; * This program is distributed in the hope that it will be useful, |
14 |
; * XviD is free software; you can redistribute it and/or modify it * |
; * but WITHOUT ANY WARRANTY ; without even the implied warranty of |
15 |
; * under the terms of the GNU General Public License as published by * |
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 |
; * the Free Software Foundation; either version 2 of the License, or * |
; * GNU General Public License for more details. |
17 |
; * (at your option) any later version. * |
; * |
18 |
; * * |
; * You should have received a copy of the GNU General Public License |
19 |
; * XviD is distributed in the hope that it will be useful, but * |
; * along with this program ; if not, write to the Free Software |
20 |
; * WITHOUT ANY WARRANTY; without even the implied warranty of * |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 |
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * |
; * |
22 |
; * GNU General Public License for more details. * |
; * $Id: quantize4_xmm.asm,v 1.2.2.1 2003-07-16 22:59:15 edgomez Exp $ |
23 |
; * * |
; * |
24 |
; * You should have received a copy of the GNU General Public License * |
; *************************************************************************/ |
25 |
; * along with this program; if not, write to the Free Software * |
;/************************************************************************** |
26 |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * |
; * quant4 bugs have been fixed: (a) overflow bug for matrix elements |
27 |
; * * |
; * equal to 1 or 2 is fixed by substituting pmulhw with pmulhuw (iSSE) |
28 |
; ******************************************************************************/ |
; * and using multiplier 0ffffh instead of 10001h (for matrix element = 1; |
29 |
; |
; * in that case, 1 is added before multiplying, that additional 1 comes |
30 |
;/****************************************************************************** |
; * from intra_matrix1; (b) rounding error for large coefficients and matrix |
31 |
; * quant4 bugs have been fixed: (a) overflow bug for matrix elements * |
; * elements is fixed by two-step approach: first approximation (rounded |
32 |
; * equal to 1 or 2 is fixed by substituting pmulhw with pmulhuw (iSSE) * |
; * down) is found as usual; the result is multiplied by the matrix element |
33 |
; * and using multiplier 0ffffh instead of 10001h (for matrix element = 1; * |
; * and mismatch is used to calculate the correction. |
34 |
; * in that case, 1 is added before multiplying, that additional 1 comes * |
; *************************************************************************/ |
35 |
; * from intra_matrix1; (b) rounding error for large coefficients and matrix * |
; _3dne functions are compatible with iSSE, but are optimized specifically |
36 |
; * elements is fixed by two-step approach: first approximation (rounded * |
; for K7 pipelines |
|
; * down) is found as usual; the result is multiplied by the matrix element * |
|
|
; * and mismatch is used to calculate the correction. * |
|
|
; ******************************************************************************/ |
|
37 |
; |
; |
38 |
; _3dne functions are compatible with iSSE, but are optimized specifically for |
;--------------------------------------------------------------------------- |
|
; K7 pipelines |
|
|
; |
|
|
;------------------------------------------------------------------------------ |
|
39 |
; 09.12.2002 Athlon optimizations contributed by Jaan Kalda |
; 09.12.2002 Athlon optimizations contributed by Jaan Kalda |
40 |
;------------------------------------------------------------------------------ |
;--------------------------------------------------------------------------- |
41 |
|
|
42 |
|
|
43 |
; data/text alignment |
; data/text alignment |
126 |
|
|
127 |
%define VM18P 3 |
%define VM18P 3 |
128 |
%define VM18Q 4 |
%define VM18Q 4 |
129 |
%define nop4 DB 08Dh,074h,026h,0 |
%define nop4 db 08Dh,074h,026h,0 |
130 |
%define nop3 add esp,byte 0 |
%define nop3 add esp,byte 0 |
131 |
%define nop2 mov esp,esp |
%define nop2 mov esp,esp |
132 |
%define nop7 db 08dh,02ch,02dh,0,0,0,0 |
%define nop7 db 08dh,02ch,02dh,0,0,0,0 |
133 |
%define nop6 add ebp,dword 0 |
%define nop6 add ebp,dword 0 |
134 |
|
|
135 |
;=========================================================================== |
;=========================================================================== |
136 |
; |
; |
137 |
; quantd table |
; quantd table |
186 |
|
|
187 |
;=========================================================================== |
;=========================================================================== |
188 |
; |
; |
189 |
; * void quant4_intra_xmm(int16_t * coeff, |
; void quant4_intra_xmm(int16_t * coeff, |
190 |
; const int16_t const * data, |
; const int16_t const * data, |
191 |
; const uint32_t quant, |
; const uint32_t quant, |
192 |
; const uint32_t dcscalar); |
; const uint32_t dcscalar); |
196 |
align ALIGN |
align ALIGN |
197 |
cglobal quant4_intra_xmm |
cglobal quant4_intra_xmm |
198 |
quant4_intra_xmm |
quant4_intra_xmm |
|
|
|
199 |
mov eax, [esp + 8] ; data |
mov eax, [esp + 8] ; data |
200 |
mov ecx, [esp + 12] ; quant |
mov ecx, [esp + 12] ; quant |
201 |
mov edx, [esp + 4] ; coeff |
mov edx, [esp + 4] ; coeff |
225 |
nop2 |
nop2 |
226 |
psraw mm1,15 ;sign src |
psraw mm1,15 ;sign src |
227 |
psraw mm4,15 |
psraw mm4,15 |
228 |
psllw mm0, 4 ; level << 4 |
psllw mm0, 4 ;level << 4 ; |
229 |
psllw mm3, 4 ; |
psllw mm3, 4 |
230 |
paddw mm0, [intra_matrix1 + 8*esi+112] |
paddw mm0, [intra_matrix1 + 8*esi+112] |
231 |
paddw mm3, [intra_matrix1 + 8*esi+120] |
paddw mm3, [intra_matrix1 + 8*esi+120] |
232 |
movq mm5,[intra_matrix_fixl + 8*esi+112] |
movq mm5,[intra_matrix_fixl + 8*esi+112] |
262 |
movq [edx + 8*esi +120], mm7 |
movq [edx + 8*esi +120], mm7 |
263 |
add esi,byte 2 |
add esi,byte 2 |
264 |
jng near .loop |
jng near .loop |
|
.done |
|
|
; calculate data[0] // (int32_t)dcscalar) |
|
|
|
|
265 |
|
|
266 |
|
.done |
267 |
|
; calculate data[0] // (int32_t)dcscalar) |
268 |
mov esi, [esp + 12 + 16] ; dcscalar |
mov esi, [esp + 12 + 16] ; dcscalar |
269 |
movsx ecx, word [eax] |
movsx ecx, word [eax] |
270 |
mov edi,ecx |
mov edi,ecx |
278 |
mov esi,[int_div+4*esi] |
mov esi,[int_div+4*esi] |
279 |
imul ecx,esi |
imul ecx,esi |
280 |
sar ecx,17 |
sar ecx,17 |
281 |
|
lea ebx, [byte ecx + 1] |
282 |
|
cmovs ecx, ebx |
283 |
; idiv cx ; ecx = edi:ecx / dcscalar |
; idiv cx ; ecx = edi:ecx / dcscalar |
284 |
|
|
285 |
mov ebx,[esp] |
mov ebx,[esp] |
404 |
align ALIGN |
align ALIGN |
405 |
cglobal quant4_inter_xmm |
cglobal quant4_inter_xmm |
406 |
quant4_inter_xmm |
quant4_inter_xmm |
|
|
|
407 |
mov eax, [esp + 8] ; data |
mov eax, [esp + 8] ; data |
408 |
mov ecx, [esp + 12] ; quant |
mov ecx, [esp + 12] ; quant |
409 |
mov edx, [esp + 4] ; coeff |
mov edx, [esp + 4] ; coeff |
427 |
jg near .lloop |
jg near .lloop |
428 |
nop |
nop |
429 |
|
|
|
|
|
430 |
align ALIGN |
align ALIGN |
431 |
.loop |
.loop |
432 |
movq mm1, [eax + 8*esi+112] ; mm0 = [1st] |
movq mm1, [eax + 8*esi+112] ; mm0 = [1st] |
475 |
movq [edx + 8*esi+112-16], mm5 |
movq [edx + 8*esi+112-16], mm5 |
476 |
movq [edx + 8*esi +120-16], mm7 |
movq [edx + 8*esi +120-16], mm7 |
477 |
jng near .loop |
jng near .loop |
|
.done |
|
|
; calculate data[0] // (int32_t)dcscalar) |
|
478 |
|
|
479 |
|
.done |
480 |
|
; calculate data[0] // (int32_t)dcscalar) |
481 |
paddw mm2,[ebx] |
paddw mm2,[ebx] |
482 |
mov ebx,[esp+24] |
mov ebx,[esp+24] |
483 |
mov edi,[esp+4+24] |
mov edi,[esp+4+24] |
488 |
paddd mm0,mm2 |
paddd mm0,mm2 |
489 |
punpckhdq mm0,mm0 ;get result to low |
punpckhdq mm0,mm0 ;get result to low |
490 |
movd eax, mm0 |
movd eax, mm0 |
491 |
|
|
492 |
ret |
ret |
493 |
|
|
494 |
align ALIGN |
align ALIGN |
654 |
align 16 |
align 16 |
655 |
cglobal dequant4_intra_3dne |
cglobal dequant4_intra_3dne |
656 |
dequant4_intra_3dne: |
dequant4_intra_3dne: |
|
|
|
657 |
mov eax, [esp+12] ; quant |
mov eax, [esp+12] ; quant |
658 |
mov ecx, [esp+8] ; coeff |
mov ecx, [esp+8] ; coeff |
659 |
movq mm7, [mmx_mul_quant + eax*8 - 8] |
movq mm7, [mmx_mul_quant + eax*8 - 8] |
674 |
|
|
675 |
|
|
676 |
align 4 |
align 4 |
677 |
|
|
678 |
DEQUANT4INTRAMMX 0 |
DEQUANT4INTRAMMX 0 |
679 |
|
|
680 |
mov esi,-2048 |
mov esi,-2048 |
681 |
nop |
nop |
682 |
cmp ebx,esi |
cmp ebx,esi |
683 |
|
|
684 |
DEQUANT4INTRAMMX 1 |
DEQUANT4INTRAMMX 1 |
685 |
|
|
686 |
cmovl ebx,esi |
cmovl ebx,esi |
687 |
neg esi |
neg esi |
688 |
sub esi,byte 1 ;2047 |
sub esi,byte 1 ;2047 |
689 |
|
|
690 |
DEQUANT4INTRAMMX 2 |
DEQUANT4INTRAMMX 2 |
691 |
|
|
692 |
cmp ebx,esi |
cmp ebx,esi |
693 |
cmovg ebx,esi |
cmovg ebx,esi |
694 |
lea ebp,[byte ebp] |
lea ebp,[byte ebp] |
695 |
|
|
696 |
DEQUANT4INTRAMMX 3 |
DEQUANT4INTRAMMX 3 |
697 |
|
|
698 |
mov esi,[esp+32] |
mov esi,[esp+32] |
699 |
mov [byte edx], bx |
mov [byte edx], bx |
700 |
mov ebx,[esp+32+4] |
mov ebx,[esp+32+4] |
701 |
|
|
702 |
DEQUANT4INTRAMMX 4 |
DEQUANT4INTRAMMX 4 |
703 |
DEQUANT4INTRAMMX 5 |
DEQUANT4INTRAMMX 5 |
704 |
DEQUANT4INTRAMMX 6 |
DEQUANT4INTRAMMX 6 |
705 |
DEQUANT4INTRAMMX 7 |
DEQUANT4INTRAMMX 7 |
706 |
|
|
707 |
add esp,byte 32+8 |
add esp,byte 32+8 |
708 |
|
|
709 |
ret |
ret |
724 |
align 16 |
align 16 |
725 |
cglobal dequant4_inter_3dne |
cglobal dequant4_inter_3dne |
726 |
dequant4_inter_3dne: |
dequant4_inter_3dne: |
|
|
|
727 |
mov edx, [esp+ 4] ; data |
mov edx, [esp+ 4] ; data |
728 |
mov ecx, [esp+ 8] ; coeff |
mov ecx, [esp+ 8] ; coeff |
729 |
mov eax, [esp+12] ; quant |
mov eax, [esp+12] ; quant |
817 |
xor word [edx + 2*63], ax |
xor word [edx + 2*63], ax |
818 |
|
|
819 |
ret |
ret |
|
|
|