20 |
; * along with this program ; if not, write to the Free Software |
; * along with this program ; if not, write to the Free Software |
21 |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
22 |
; * |
; * |
23 |
; * $Id: quantize_mpeg_xmm.asm,v 1.2 2004-03-22 22:36:24 edgomez Exp $ |
; * $Id: quantize_mpeg_xmm.asm,v 1.6 2006-07-10 08:09:59 syskin Exp $ |
24 |
; * |
; * |
25 |
; ***************************************************************************/ |
; ***************************************************************************/ |
26 |
|
|
33 |
|
|
34 |
; cglobal <name>: declare <name> as a global symbol.
;   PREFIX defined     -> the exported symbol is _<name>, and <name> is
;                         defined as an alias for the underscored form.
;   MARK_FUNCS defined -> the global declaration also carries
;                         ":function <name>.endfunc-<name>".
; NOTE(review): the directives below are interleaved with bare number rows
; ("35 |", ...) and most of them are listed twice, which looks like two
; revisions of the macro shown side by side; the MARK_FUNCS branches are
; listed only once - confirm against the real file.
; NOTE(review): with PREFIX and MARK_FUNCS both defined, the alias text is
; "_<name>:function <name>.endfunc-<name>", i.e. it includes the type/size
; decoration - verify that this is intended.
%macro cglobal 1
%macro cglobal 1
35 |
%ifdef PREFIX
%ifdef PREFIX
36 |

%ifdef MARK_FUNCS
37 |

global _%1:function %1.endfunc-%1
38 |

%define %1 _%1:function %1.endfunc-%1
39 |

%else
40 |
global _%1
global _%1
41 |
%define %1 _%1
%define %1 _%1
42 |

%endif
43 |

%else
44 |

%ifdef MARK_FUNCS
45 |

global %1:function %1.endfunc-%1
46 |
%else
%else
47 |
global %1
global %1
48 |
%endif
%endif
49 |

%endif
50 |
%endmacro
%endmacro
51 |
|
|
52 |
%macro cextern 1 |
%macro cextern 1 |
63 |
;============================================================================= |
;============================================================================= |
64 |
|
|
65 |
%ifdef FORMAT_COFF |
%ifdef FORMAT_COFF |
66 |
SECTION .rodata data |
SECTION .rodata |
67 |
%else |
%else |
68 |
SECTION .rodata data align=16 |
SECTION .rodata align=16 |
69 |
%endif |
%endif |
70 |
|
|
71 |
ALIGN 8 |
ALIGN 8 |
165 |
|
|
166 |
SECTION .text |
SECTION .text |
167 |
|
|
|
cglobal quant_mpeg_intra_xmm |
|
168 |
cglobal quant_mpeg_inter_xmm |
cglobal quant_mpeg_inter_xmm |
169 |
cglobal dequant_mpeg_intra_3dne |
cglobal dequant_mpeg_intra_3dne |
170 |
cglobal dequant_mpeg_inter_3dne |
cglobal dequant_mpeg_inter_3dne |
171 |
|
|
172 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
173 |
; |
; |
|
; uint32_t quant_mpeg_intra_xmm(int16_t * coeff, |
|
|
; const int16_t const * data, |
|
|
; const uint32_t quant, |
|
|
; const uint32_t dcscalar, |
|
|
; const uint16_t *mpeg_matrices); |
|
|
; |
|
|
;----------------------------------------------------------------------------- |
|
|
|
|
|
ALIGN 16 |
|
|
; Entry and dispatch. Arguments are read from the stack; esi, edi and ebx are
; saved here and restored at .done. Register roles set up below:
;   eax = data, edx = coeff, ecx = quant, edi = mpeg_quant_matrices,
;   esi = loop index, mm0 = mm3 = 0.
; Dispatch: quant == 1 -> .q1loop, quant > 19 -> .lloop, otherwise execution
; continues into .loop.
quant_mpeg_intra_xmm:


mov eax, [esp + 8] ; data


mov ecx, [esp + 12] ; quant


mov edx, [esp + 4] ; coeff


push esi ; esi, edi, ebx are reloaded from the stack at .done


push edi


push ebx


nop ; NOTE(review): presumably padding for code alignment - confirm


mov edi, [esp + 12 + 20] ; mpeg_quant_matrices (+12 skips the three pushes)


mov esi, -14 ; loop index: 8*esi+112 is 0 on the first pass


pxor mm0, mm0 ; mm0 = 0; the loops compute 0 - src in mm0


pxor mm3, mm3 ; mm3 = 0; same role for the second qword


cmp ecx, byte 1


je near .q1loop ; quant == 1


cmp ecx, byte 19


jg near .lloop ; quant > 19 (signed compare)


nop6 ; macro defined outside this view; presumably 6 bytes of padding - confirm
|
|
|
|
|
ALIGN 16 |
|
|
; Loop reached by fall-through when quant is neither 1 nor greater than 19
; (see the compares before this label). Each pass processes two qwords
; (8 words): it reads them from [eax], combines them with the tables at
; [edi], [edi+128], [edi+256], [edi+384] and with the quantd / mmx_divs
; entries selected by ecx, and stores the results to [edx].
; esi steps -14, -12, ..., 0, so 8*esi+112 covers byte offsets 0..112
; (8 passes, 128 bytes).
; mm0 and mm3 must be zero on entry to every pass; they are cleared again
; near the end of the pass.
; NOTE(review): the same instruction sequence in .q1loop is annotated as a
; reciprocal-multiply division with a correction step; the tables themselves
; are defined outside this view - confirm their contents there.
.loop


movq mm1, [eax + 8*esi+112] ; mm1 = first 4 input words of this pass


psubw mm0, mm1 ; mm0 = 0 - mm1


movq mm4, [eax + 8*esi + 120] ; mm4 = next 4 input words


psubw mm3, mm4 ; mm3 = 0 - mm4


pmaxsw mm0, mm1 ; mm0 = max(-src, src) = |src|


pmaxsw mm3,mm4 ; mm3 = |src| for the second qword


nop2 ; macro defined outside this view; presumably padding - confirm


psraw mm1, 15 ; mm1 = sign mask of src (0 or -1 per word)


psraw mm4, 15 ; mm4 = sign mask for the second qword


psllw mm0, 4 ; level << 4


psllw mm3, 4


paddw mm0, [edi + 128 + 8*esi+112] ; add the per-coefficient table at edi+128


paddw mm3, [edi + 128 + 8*esi+120]


movq mm5, [edi + 384 + 8*esi+112] ; mm5 = per-coefficient table at edi+384


movq mm7, [edi + 384 + 8*esi+120]


pmulhuw mm5, mm0 ; mm5 = high 16 bits of the unsigned product mm5 * mm0


pmulhuw mm7, mm3


mov esp, esp ; register moved onto itself; presumably padding - confirm


movq mm2, [edi + 8*esi+112] ; mm2 = per-coefficient table at edi+0


movq mm6, [edi + 8*esi+120]


pmullw mm2, mm5 ; mm2 = low 16 bits of mm2 * mm5


pmullw mm6, mm7


psubw mm0, mm2 ; mm0 = mm0 - mm2 (difference left after the first multiply)


psubw mm3, mm6


nop4 ; macro defined outside this view; presumably padding - confirm


movq mm2, [quantd + ecx * 8 - 8] ; mm2 = quantd entry number quant-1 (8 bytes each)


movq mm6, [mmx_divs + ecx * 8 - 8] ; mm6 = mmx_divs entry number quant-1


paddw mm5, mm2 ; add the quantd entry to both results


paddw mm7, mm2


mov esp, esp ; register moved onto itself; presumably padding - confirm


pmulhuw mm0, [edi + 256 + 8*esi+112] ; high 16 bits of mm0 * table at edi+256


pmulhuw mm3, [edi + 256 + 8*esi+120]


paddw mm5, mm0 ; add that term to the running result


paddw mm7, mm3


pxor mm0, mm0 ; re-zero mm0 for the next pass


pxor mm3, mm3 ; re-zero mm3 for the next pass


pmulhuw mm5, mm6 ; mm5 = high 16 bits of mm5 * mmx_divs entry


pmulhuw mm7, mm6 ; (level + quantd) / quant (0<quant<32)


pxor mm5, mm1 ; (v xor s) - s with s = sign mask: reapply the sign of src


pxor mm7, mm4 ;


psubw mm5, mm1 ; undisplace


psubw mm7, mm4 ;


movq [edx + 8*esi+112], mm5 ; store 4 output words


movq [edx + 8*esi +120], mm7 ; store the next 4 output words


add esi, byte 2 ; advance by two qwords


jng near .loop ; repeat while esi <= 0, then continue into .done
|
|
|
|
|
; Common tail of all three loops: recomputes coeff[0] from data[0] and
; dcscalar, restores ebx/edi/esi, and returns 0 in eax.
; The store to [edx] at the end overwrites the word the loops wrote at
; byte offset 0 of coeff.
.done


; coeff[0] = (data[0] +/- dcscalar/2) / dcscalar, done as a multiply by
; int_div[dcscalar] followed by an arithmetic shift right by 17.
; NOTE(review): int_div is defined outside this view; the shift suggests its
; entries are reciprocals scaled by 2^17 - confirm.


mov esi, [esp + 12 + 16] ; dcscalar (+12 skips the three saved registers)


movsx ecx, word [eax] ; ecx = data[0], sign-extended (eax still holds data)


mov edi, ecx


mov edx, [esp + 12 + 16] ; edx = dcscalar


shr edx, 1 ; edx = dcscalar /2


sar edi, 31 ; edi = sign mask of data[0] (0 or -1); cdq is vectorpath


xor edx, edi ; edx = dcscalar/2, or -(dcscalar/2) - 1 when data[0] is negative


sub ecx, edi ; ecx = data[0], or data[0] + 1 when data[0] is negative


add ecx, edx ; ecx = data[0] + dcscalar/2, or data[0] - dcscalar/2 when negative


mov edx, [dword esp + 12 + 4] ; edx = coeff (reloaded: edx was scratch above)


mov esi, [int_div+4*esi] ; esi = int_div[dcscalar] (dword entries)


imul ecx, esi


sar ecx, 17 ; sets the sign flag tested by cmovs below


lea ebx, [byte ecx + 1] ; ebx = ecx + 1; lea does not modify the flags


cmovs ecx, ebx ; negative result: use ecx + 1 instead


; idiv cx ; ecx = edi:ecx / dcscalar





mov ebx, [esp] ; restore ebx (pushed last at entry)


mov edi, [esp+4] ; restore edi


mov esi, [esp+8] ; restore esi


add esp, byte 12 ; release the three saved-register slots


mov [edx], cx ; coeff[0] = cx





xor eax, eax ; return value 0


ret
|
|
|
|
|
ALIGN 16 |
|
|
; Loop selected at entry when quant == 1. Each pass processes two qwords
; (8 words): it reads them from [eax], combines them with the tables at
; [edi], [edi+128], [edi+256], [edi+384] and the quantd entry selected by
; ecx, and stores the results to [edx]. Unlike the other two loops it does
; not multiply by an mmx_div/mmx_divs entry; the result is only shifted
; right by one bit.
; esi steps -14, -12, ..., 0, so 8*esi+112 covers byte offsets 0..112
; (8 passes, 128 bytes).
; mm0 and mm3 must be zero on entry to every pass; they are cleared again
; near the end of the pass.
.q1loop


movq mm1, [eax + 8*esi+112] ; mm1 = first 4 input words of this pass


psubw mm0, mm1 ; mm0 = 0 - mm1


movq mm4, [eax + 8*esi+120] ; mm4 = next 4 input words


psubw mm3, mm4 ; mm3 = 0 - mm4


pmaxsw mm0, mm1 ; mm0 = max(-src, src) = |src|


pmaxsw mm3, mm4 ; mm3 = |src| for the second qword


nop2 ; macro defined outside this view; presumably padding - confirm


psraw mm1, 15 ; mm1 = sign mask of src (0 or -1 per word)


psraw mm4, 15 ; mm4 = sign mask for the second qword


psllw mm0, 4 ; level << 4


psllw mm3, 4


paddw mm0, [edi + 128 + 8*esi+112] ;mm0 is to be divided


paddw mm3, [edi + 128 + 8*esi+120] ;intra1 contains fix for division by 1


movq mm5, [edi + 384 + 8*esi+112] ;with rounding down


movq mm7, [edi + 384 + 8*esi+120]


pmulhuw mm5, mm0 ; mm5 = high 16 bits of the unsigned product mm5 * mm0


pmulhuw mm7, mm3 ;mm7: first approx of division


mov esp, esp ; register moved onto itself; presumably padding - confirm


movq mm2, [edi + 8*esi+112] ; mm2 = per-coefficient table at edi+0


movq mm6, [edi + 8*esi+120] ; mm6 = same table, second qword


pmullw mm2, mm5 ;test value <= original


pmullw mm6, mm7


psubw mm0, mm2 ;mismatch


psubw mm3, mm6


nop4 ; macro defined outside this view; presumably padding - confirm


movq mm2, [quantd + ecx * 8 - 8] ; mm2 = quantd entry number quant-1 (8 bytes each)


paddw mm5, mm2 ;first approx with quantd


paddw mm7, mm2


mov esp, esp ; register moved onto itself; presumably padding - confirm


pmulhuw mm0, [edi + 256 + 8*esi+112] ;correction


pmulhuw mm3, [edi + 256 + 8*esi+120]


paddw mm5, mm0 ;final result with quantd


paddw mm7, mm3


pxor mm0, mm0 ; re-zero mm0 for the next pass


pxor mm3, mm3 ; re-zero mm3 for the next pass


mov esp, esp ; register moved onto itself; presumably padding - confirm


psrlw mm5, 1 ; (level + quantd) /2 (quant = 1)


psrlw mm7, 1


pxor mm5, mm1 ; (v xor s) - s with s = sign mask: reapply the sign of src


pxor mm7, mm4 ;


psubw mm5, mm1 ; undisplace


psubw mm7, mm4 ;


movq [edx + 8*esi+112], mm5 ; store 4 output words


movq [edx + 8*esi +120], mm7 ; store the next 4 output words


add esi, byte 2 ; advance by two qwords


jng near .q1loop ; repeat while esi <= 0


jmp near .done ; shared tail: coeff[0] and register restore
|
|
|
|
|
ALIGN 8 |
|
|
; Loop selected at entry when quant > 19. Each pass processes two qwords
; (8 words): it reads them from [eax], combines them with the tables at
; [edi], [edi+128], [edi+256], [edi+384] and with the quantd / mmx_div
; entries selected by ecx, and stores the results to [edx]. It differs from
; .loop in reading mmx_div (not mmx_divs) and in the extra shift right by
; one bit after the final multiply.
; esi steps -14, -12, ..., 0, so 8*esi+112 covers byte offsets 0..112
; (8 passes, 128 bytes).
; mm0 and mm3 must be zero on entry to every pass; they are cleared again
; near the end of the pass.
.lloop


movq mm1, [eax + 8*esi+112] ; mm1 = first 4 input words of this pass


psubw mm0, mm1 ; mm0 = 0 - mm1


movq mm4, [eax + 8*esi+120] ; mm4 = next 4 input words


psubw mm3, mm4 ; mm3 = 0 - mm4


pmaxsw mm0, mm1 ; mm0 = max(-src, src) = |src|


pmaxsw mm3, mm4 ; mm3 = |src| for the second qword


nop2 ; macro defined outside this view; presumably padding - confirm


psraw mm1, 15 ; mm1 = sign mask of src (0 or -1 per word)


psraw mm4, 15 ; mm4 = sign mask for the second qword


psllw mm0, 4 ; level << 4


psllw mm3, 4 ;


paddw mm0, [edi + 128 + 8*esi+112] ;mm0 is to be divided intra1 contains fix for division by 1


paddw mm3, [edi + 128 + 8*esi+120]


movq mm5, [edi + 384 + 8*esi+112] ; mm5 = per-coefficient table at edi+384


movq mm7, [edi + 384 + 8*esi+120]


pmulhuw mm5, mm0 ; mm5 = high 16 bits of the unsigned product mm5 * mm0


pmulhuw mm7, mm3 ;mm7: first approx of division


mov esp, esp ; register moved onto itself; presumably padding - confirm


movq mm2, [edi + 8*esi+112] ; mm2 = per-coefficient table at edi+0


movq mm6, [edi + 8*esi+120]


pmullw mm2, mm5 ;test value <= original


pmullw mm6, mm7


psubw mm0, mm2 ;mismatch


psubw mm3, mm6


nop4 ; macro defined outside this view; presumably padding - confirm


movq mm2, [quantd + ecx * 8 - 8] ; mm2 = quantd entry number quant-1 (8 bytes each)


movq mm6, [mmx_div + ecx * 8 - 8] ; mm6 = mmx_div entry number quant-1 (quant > 19 here)


paddw mm5, mm2 ;first approx with quantd


paddw mm7, mm2


mov esp, esp ; register moved onto itself; presumably padding - confirm


pmulhuw mm0, [edi + 256 + 8*esi+112] ;correction


pmulhuw mm3, [edi + 256 + 8*esi+120]


paddw mm5, mm0 ;final result with quantd


paddw mm7, mm3


pxor mm0, mm0 ; re-zero mm0 for the next pass


pxor mm3, mm3 ; re-zero mm3 for the next pass


mov esp, esp ; register moved onto itself; presumably padding - confirm


pmulhuw mm5, mm6 ; mm5 = high 16 bits of mm5 * mmx_div entry


pmulhuw mm7, mm6 ; (level + quantd) / quant (0<quant<32)


psrlw mm5, 1 ; (level + quantd) / (2*quant)


psrlw mm7, 1


pxor mm5, mm1 ; (v xor s) - s with s = sign mask: reapply the sign of src


pxor mm7, mm4 ;


psubw mm5, mm1 ; undisplace


psubw mm7, mm4 ;


movq [edx + 8*esi+112], mm5 ; store 4 output words


movq [edx + 8*esi +120], mm7 ; store the next 4 output words


add esi,byte 2 ; advance by two qwords


jng near .lloop ; repeat while esi <= 0


jmp near .done ; shared tail: coeff[0] and register restore
|
|
|
|
|
;----------------------------------------------------------------------------- |
|
|
; |
|
174 |
; uint32_t quant_mpeg_inter_xmm(int16_t * coeff, |
; uint32_t quant_mpeg_inter_xmm(int16_t * coeff, |
175 |
; const int16_t const * data, |
; const int16_t const * data, |
176 |
; const uint32_t quant, |
; const uint32_t quant, |
367 |
movq [edx + 8*esi +120-16], mm7 |
movq [edx + 8*esi +120-16], mm7 |
368 |
jng near .lloop |
jng near .lloop |
369 |
jmp near .done |
jmp near .done |
370 |
|
.endfunc |
371 |
|
|
372 |
|
|
373 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
487 |
|
|
488 |
xor eax, eax |
xor eax, eax |
489 |
ret |
ret |
490 |
|
.endfunc |
491 |
|
|
492 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
493 |
; |
; |
602 |
|
|
603 |
xor eax, eax |
xor eax, eax |
604 |
ret |
ret |
605 |
|
.endfunc |
606 |
|
|