Annotation of /branches/dev-api-4/xvidcore/src/quant/x86_asm/quantize_mpeg_xmm.asm

Revision 1174 - (view) (download)

1 :	edgomez	1174	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * - mmx quantization/dequantization -
5 :			; *
6 :			; * Copyright(C) 2001-2003 XviD Team <xvid-devel@xvid.org>
7 :			; *
8 :			; * This program is free software ; you can redistribute it and/or modify
9 :			; * it under the terms of the GNU General Public License as published by
10 :			; * the Free Software Foundation ; either version 2 of the License, or
11 :			; * (at your option) any later version.
12 :			; *
13 :			; * This program is distributed in the hope that it will be useful,
14 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
15 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 :			; * GNU General Public License for more details.
17 :			; *
18 :			; * You should have received a copy of the GNU General Public License
19 :			; * along with this program ; if not, write to the Free Software
20 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 :			; *
22 :			; * $Id: quantize_mpeg_xmm.asm,v 1.1.2.1 2003-10-07 13:02:35 edgomez Exp $
23 :			; *
24 :			; *************************************************************************/
25 :			;/**************************************************************************
26 :			; * quant4 bugs have been fixed: (a) the overflow bug for matrix elements
27 :			; * equal to 1 or 2 is fixed by substituting pmulhw with pmulhuw (iSSE)
28 :			; * and using multiplier 0ffffh instead of 10001h (for matrix element = 1;
29 :			; * in that case, 1 is added before multiplying, and that additional 1 comes
30 :			; * from intra_matrix1); (b) the rounding error for large coefficients and matrix
31 :			; * elements is fixed by a two-step approach: the first approximation (rounded
32 :			; * down) is found as usual; the result is multiplied by the matrix element
33 :			; * and the mismatch is used to calculate the correction.
34 :			; *************************************************************************/
35 :			; _3dne functions are compatible with iSSE, but are optimized specifically
36 :			; for K7 pipelines
37 :			;
38 :			;---------------------------------------------------------------------------
39 :			; 09.12.2002 Athlon optimizations contributed by Jaan Kalda
40 :			;---------------------------------------------------------------------------
41 :
42 :
43 :			; data/text alignment (byte boundary used by the `align ALIGN` directives below)
44 :			%define ALIGN 8
45 :			%define SATURATE
46 :			; NOTE(review): SATURATE is defined above but is not tested anywhere in the visible source.
47 :			bits 32
48 :			; The .data section is declared without the align=8 qualifier when FORMAT_COFF is defined.
49 :			%ifdef FORMAT_COFF
50 :			SECTION .data data
51 :			%else
52 :			SECTION .data data align=8
53 :			%endif
54 :			; cglobal NAME: export NAME; when PREFIX is defined, export _NAME and alias NAME to _NAME.
55 :			%macro cglobal 1
56 :			%ifdef PREFIX
57 :			global _%1
58 :			%define %1 _%1
59 :			%else
60 :			global %1
61 :			%endif
62 :			%endmacro
63 :			; cextern NAME: import NAME; when PREFIX is defined, import _NAME and alias NAME to _NAME.
64 :			%macro cextern 1
65 :			%ifdef PREFIX
66 :			extern _%1
67 :			%define %1 _%1
68 :			%else
69 :			extern %1
70 :			%endif
71 :			%endmacro
72 :			align 8
73 :			mmzero dd 0,0 ; 8 bytes of zero; the routines reload cleared MMX registers from here
74 :
75 :			mmx_one times 4 dw 1 ; four words of 1, used with pmaddwd to add words pairwise
76 :
77 :			;===========================================================================
78 :			;
79 :			; divide by 2Q tables: 31 entries (i = 1..31), each entry is 4 identical words
80 :			; and is addressed from the code as [table + quant*8 - 8]
81 :			;===========================================================================
82 :
83 :			align ALIGN
84 :			mmx_divs ;i>2 -- entry i holds (1 << 15) / i + 1
85 :			%assign i 1
86 :			%rep 31
87 :			times 4 dw ((1 << 15) / i + 1)
88 :			%assign i i+1
89 :			%endrep
90 :
91 :			align ALIGN
92 :			mmx_div ;i>2 -- entry i holds (1 << 16) / i + 1; for i = 1 that value (65537) does not fit in a word
93 :			%assign i 1
94 :			%rep 31
95 :			times 4 dw ((1 << 16) / i + 1) ; only read by the .lloop paths, which are taken when quant > 19
96 :			%assign i i+1
97 :			%endrep
98 :
99 :
100 :			;===========================================================================
101 :			;
102 :			; intra matrix
103 :			; (the four intra tables are not defined here; they are imported with cextern)
104 :			;===========================================================================
105 :			; FIXX x: emits the word (1 << 16) / x + 1. Not invoked in the visible source.
106 :			%macro FIXX 1
107 :			dw (1 << 16) / (%1) + 1
108 :			%endmacro
109 :
110 :			cextern intra_matrix_fixl
111 :			cextern intra_matrix_fix
112 :			cextern intra_matrix1
113 :			cextern intra_matrix
114 :
115 :			;===========================================================================
116 :			;
117 :			; inter matrix
118 :			; (the four inter tables are not defined here; they are imported with cextern)
119 :			;===========================================================================
120 :
121 :			cextern inter_matrix1
122 :			cextern inter_matrix
123 :			cextern inter_matrix_fix
124 :			cextern inter_matrix_fixl
125 :			; VM18P / VM18Q: numerator and denominator used to build the quantd table below.
126 :			; nopN: filler sequences placed between instructions (presumably N bytes each -- TODO confirm); nop4 and nop7 are raw opcode bytes.
127 :			%define VM18P 3
128 :			%define VM18Q 4
129 :			%define nop4 db 08Dh,074h,026h,0
130 :			%define nop3 add esp,byte 0
131 :			%define nop2 mov esp,esp
132 :			%define nop7 db 08dh,02ch,02dh,0,0,0,0
133 :			%define nop6 add ebp,dword 0
134 :
135 :			;===========================================================================
136 :			;
137 :			; quantd table: entry i (i = 1..31) holds 4 words of (VM18P*i + VM18Q/2) / VM18Q
138 :			; read by the intra quantizer as [quantd + quant*8 - 8]
139 :			;===========================================================================
140 :
141 :
142 :			quantd
143 :			%assign i 1
144 :			%rep 31
145 :			times 4 dw (((VM18P*i) + (VM18Q/2)) / VM18Q)
146 :			%assign i i+1
147 :			%endrep
148 :
149 :			;===========================================================================
150 :			;
151 :			; multiply by quant table: entry i (i = 1..31) holds 4 words equal to i
152 :			; read by the dequantizers as [mmx_mul_quant + quant*8 - 8]
153 :			;===========================================================================
154 :
155 :
156 :			mmx_mul_quant
157 :			%assign i 1
158 :			%rep 31
159 :			times 4 dw i
160 :			%assign i i+1
161 :			%endrep
162 :
163 :			;===========================================================================
164 :			;
165 :			; saturation limits
166 :			; NOTE(review): none of the five constants below is referenced by the code visible in this file.
167 :			;===========================================================================
168 :
169 :			align 16
170 :
171 :			mmx_32767_minus_2047 times 4 dw (32767-2047)
172 :			mmx_32768_minus_2048 times 4 dw (32768-2048)
173 :			mmx_2047 times 4 dw 2047
174 :			mmx_minus_2048 times 4 dw (-2048)
175 :			zero times 4 dw 0
176 :			; int_div: dword table, entry 0 is 0 and entry i (i = 1..255) is (1 << 17) / i + 1.
177 :			int_div
178 :			dd 0
179 :			%assign i 1
180 :			%rep 255
181 :			dd (1 << 17) / ( i) + 1 ; the intra quantizer multiplies by int_div[dcscalar] and shifts right by 17
182 :			%assign i i+1
183 :			%endrep
184 :
185 :			section .text
186 :
187 :			;===========================================================================
188 :			; Quantizes 64 words from data[] into coeff[]; arguments are read from the stack.
189 :			; void quant_mpeg_intra_xmm(int16_t * coeff,
190 :			; const int16_t const * data,
191 :			; const uint32_t quant,
192 :			; const uint32_t dcscalar);
193 :			; coeff[0] is overwritten at .done with data[0] divided by dcscalar (rounded).
194 :			;===========================================================================
195 :
196 :			align ALIGN
197 :			cglobal quant_mpeg_intra_xmm
198 :			quant_mpeg_intra_xmm:
199 :			mov eax, [esp + 8] ; data
200 :			mov ecx, [esp + 12] ; quant
201 :			mov edx, [esp + 4] ; coeff
202 :			push esi
203 :			push edi
204 :			push ebx
205 :			nop
206 :			mov edi,mmzero ; edi -> 8 zero bytes, used to re-clear mm0/mm3 each iteration
207 :			mov esi,-14 ; loop index: -14, -12, ..., 0 (8 iterations, 16 bytes each)
208 :			pxor mm0,mm0
209 :			pxor mm3,mm3
210 :			cmp ecx,byte 1
211 :			je near .q1loop ; quant == 1: final division is a plain shift
212 :			cmp ecx,byte 19
213 :			jg near .lloop ; quant > 19: uses the mmx_div table plus a shift
214 :			nop6
215 :
216 :
217 :			align ALIGN
218 :			.loop ; path for 1 < quant <= 19
219 :			movq mm1, [eax + 8*esi+112] ; mm1 = first 4 input words of this iteration
220 :			psubw mm0, mm1 ;-mm1
221 :			movq mm4, [eax + 8*esi + 120] ; mm4 = next 4 input words
222 :			psubw mm3, mm4 ;-mm4
223 :			pmaxsw mm0, mm1 ;|src|
224 :			pmaxsw mm3,mm4
225 :			nop2
226 :			psraw mm1, 15 ;sign src
227 :			psraw mm4, 15
228 :			psllw mm0, 4 ;level << 4 ;
229 :			psllw mm3, 4
230 :			paddw mm0, [intra_matrix1 + 8*esi+112] ; mm0 is the value to be divided by the matrix element
231 :			paddw mm3, [intra_matrix1 + 8*esi+120]
232 :			movq mm5, [intra_matrix_fixl + 8*esi+112]
233 :			movq mm7, [intra_matrix_fixl + 8*esi+120]
234 :			pmulhuw mm5, mm0 ; mm5: first approximation of the division
235 :			pmulhuw mm7, mm3
236 :			mov esp, esp
237 :			movq mm2, [intra_matrix + 8*esi+112]
238 :			movq mm6, [intra_matrix + 8*esi+120]
239 :			pmullw mm2, mm5 ; approximation * matrix element
240 :			pmullw mm6, mm7
241 :			psubw mm0, mm2 ; mismatch
242 :			psubw mm3, mm6
243 :			nop4
244 :			movq mm2, [quantd + ecx * 8 - 8] ; quantd entry for this quant
245 :			movq mm6, [mmx_divs + ecx * 8 - 8] ; mmx_divs entry for this quant
246 :			paddw mm5, mm2 ; first approximation plus quantd
247 :			paddw mm7, mm2
248 :			mov esp, esp
249 :			pmulhuw mm0, [intra_matrix_fix + 8*esi+112] ; correction derived from the mismatch
250 :			pmulhuw mm3, [intra_matrix_fix + 8*esi+120]
251 :			paddw mm5, mm0 ; corrected result with quantd
252 :			paddw mm7, mm3
253 :			movq mm0, [edi] ; clear mm0/mm3 for the next iteration
254 :			movq mm3, [edi]
255 :			pmulhuw mm5, mm6 ; mm5 = (mm5 * mmx_divs[quant]) >> 16
256 :			pmulhuw mm7, mm6 ; (level + quantd) / quant (0<quant<32)
257 :			pxor mm5, mm1 ; mm5 *= sign(src)
258 :			pxor mm7, mm4 ;
259 :			psubw mm5, mm1 ; undisplace
260 :			psubw mm7, mm4 ;
261 :			movq [edx + 8*esi+112], mm5
262 :			movq [edx + 8*esi +120], mm7
263 :			add esi, byte 2
264 :			jng near .loop ; repeat while esi <= 0
265 :
266 :			.done
267 :			; calculate data[0] // (int32_t)dcscalar)
268 :			mov esi, [esp + 12 + 16] ; dcscalar
269 :			movsx ecx, word [eax] ; ecx = data[0], sign-extended
270 :			mov edi, ecx
271 :			mov edx, [esp + 12 + 16]
272 :			shr edx, 1 ; edx = dcscalar /2
273 :			sar edi, 31 ; cdq is vectorpath
274 :			xor edx, edi ; edx = half if data[0] >= 0, else -half - 1
275 :			sub ecx, edi ; adds 1 when data[0] is negative
276 :			add ecx, edx ; ecx = data[0] +/- dcscalar/2
277 :			mov edx, [dword esp + 12 + 4] ; edx = coeff pointer again
278 :			mov esi, [int_div+4*esi] ; esi = int_div[dcscalar]
279 :			imul ecx, esi
280 :			sar ecx, 17
281 :			lea ebx, [byte ecx + 1] ; lea leaves the flags from sar untouched
282 :			cmovs ecx, ebx ; add 1 when the shifted result is negative
283 :			; idiv cx ; ecx = edi:ecx / dcscalar
284 :
285 :			mov ebx, [esp] ; restore the three saved registers without pop
286 :			mov edi, [esp+4]
287 :			mov esi, [esp+8]
288 :			add esp, byte 12
289 :			mov [edx], cx ; coeff[0] = cx
290 :
291 :			ret
292 :
293 :			align ALIGN
294 :			.q1loop ; path for quant == 1
295 :			movq mm1, [eax + 8*esi+112] ; mm1 = first 4 input words of this iteration
296 :			psubw mm0, mm1 ;-mm1
297 :			movq mm4, [eax + 8*esi+120] ; mm4 = next 4 input words
298 :			psubw mm3, mm4 ;-mm4
299 :			pmaxsw mm0, mm1 ;|src|
300 :			pmaxsw mm3, mm4
301 :			nop2
302 :			psraw mm1, 15 ;sign src
303 :			psraw mm4, 15
304 :			psllw mm0, 4 ; level << 4
305 :			psllw mm3, 4
306 :			paddw mm0, [intra_matrix1 + 8*esi+112] ;mm0 is to be divided
307 :			paddw mm3, [intra_matrix1 + 8*esi+120] ;intra1 contains fix for division by 1
308 :			movq mm5, [intra_matrix_fixl + 8*esi+112] ;with rounding down
309 :			movq mm7, [intra_matrix_fixl + 8*esi+120]
310 :			pmulhuw mm5, mm0
311 :			pmulhuw mm7, mm3 ;mm7: first approx of division
312 :			mov esp, esp
313 :			movq mm2, [intra_matrix + 8*esi+112]
314 :			movq mm6, [intra_matrix + 8*esi+120] ; matrix elements
315 :			pmullw mm2, mm5 ;test value <= original
316 :			pmullw mm6, mm7
317 :			psubw mm0, mm2 ;mismatch
318 :			psubw mm3, mm6
319 :			nop4
320 :			movq mm2, [quantd + ecx * 8 - 8]
321 :			paddw mm5, mm2 ;first approx with quantd
322 :			paddw mm7, mm2
323 :			mov esp, esp
324 :			pmulhuw mm0, [intra_matrix_fix + 8*esi+112] ;correction
325 :			pmulhuw mm3, [intra_matrix_fix + 8*esi+120]
326 :			paddw mm5, mm0 ;final result with quantd
327 :			paddw mm7, mm3
328 :			movq mm0, [edi] ; clear mm0/mm3 for the next iteration
329 :			movq mm3, [edi]
330 :			mov esp, esp
331 :			psrlw mm5, 1 ; (level + quantd) /2 (quant = 1)
332 :			psrlw mm7, 1
333 :			pxor mm5, mm1 ; mm5 *= sign(src)
334 :			pxor mm7, mm4 ;
335 :			psubw mm5, mm1 ; undisplace
336 :			psubw mm7, mm4 ;
337 :			movq [edx + 8*esi+112], mm5
338 :			movq [edx + 8*esi +120], mm7
339 :			add esi, byte 2
340 :			jng near .q1loop ; repeat while esi <= 0
341 :			jmp near .done
342 :
343 :			align 8
344 :			.lloop ; path for quant > 19
345 :			movq mm1, [eax + 8*esi+112] ; mm1 = first 4 input words of this iteration
346 :			psubw mm0, mm1 ;-mm1
347 :			movq mm4, [eax + 8*esi+120] ; mm4 = next 4 input words
348 :			psubw mm3, mm4 ;-mm4
349 :			pmaxsw mm0, mm1 ;|src|
350 :			pmaxsw mm3, mm4
351 :			nop2
352 :			psraw mm1, 15 ;sign src
353 :			psraw mm4, 15
354 :			psllw mm0, 4 ; level << 4
355 :			psllw mm3, 4 ;
356 :			paddw mm0, [intra_matrix1 + 8*esi+112] ;mm0 is to be divided intra1 contains fix for division by 1
357 :			paddw mm3, [intra_matrix1 + 8*esi+120]
358 :			movq mm5, [intra_matrix_fixl + 8*esi+112]
359 :			movq mm7, [intra_matrix_fixl + 8*esi+120]
360 :			pmulhuw mm5, mm0
361 :			pmulhuw mm7, mm3 ;mm7: first approx of division
362 :			mov esp, esp
363 :			movq mm2, [intra_matrix + 8*esi+112]
364 :			movq mm6, [intra_matrix + 8*esi+120]
365 :			pmullw mm2, mm5 ;test value <= original
366 :			pmullw mm6, mm7
367 :			psubw mm0, mm2 ;mismatch
368 :			psubw mm3, mm6
369 :			nop4
370 :			movq mm2, [quantd + ecx * 8 - 8]
371 :			movq mm6, [mmx_div + ecx * 8 - 8] ; mmx_div entry for this quant (quant > 19 here)
372 :			paddw mm5, mm2 ;first approx with quantd
373 :			paddw mm7, mm2
374 :			mov esp, esp
375 :			pmulhuw mm0, [intra_matrix_fix + 8*esi+112] ;correction
376 :			pmulhuw mm3, [intra_matrix_fix + 8*esi+120]
377 :			paddw mm5, mm0 ;final result with quantd
378 :			paddw mm7, mm3
379 :			movq mm0, [edi] ; clear mm0/mm3 for the next iteration
380 :			movq mm3, [edi]
381 :			mov esp, esp
382 :			pmulhuw mm5, mm6 ; mm5 = (mm5 * mmx_div[quant]) >> 16
383 :			pmulhuw mm7, mm6 ; (level + quantd) / quant (0<quant<32)
384 :			psrlw mm5, 1 ; (level + quantd) / (2*quant)
385 :			psrlw mm7, 1
386 :			pxor mm5, mm1 ; mm5 *= sign(src)
387 :			pxor mm7, mm4 ;
388 :			psubw mm5, mm1 ; undisplace
389 :			psubw mm7, mm4 ;
390 :			movq [edx + 8*esi+112], mm5
391 :			movq [edx + 8*esi +120], mm7
392 :			add esi,byte 2
393 :			jng near .lloop ; repeat while esi <= 0
394 :			jmp near .done
395 :
396 :			;===========================================================================
397 :			; Quantizes 64 words from data[] into coeff[]; returns in eax the sum of the
398 :			; uint32_t quant_mpeg_inter_xmm(int16_t * coeff,
399 :			; const int16_t const * data,
400 :			; const uint32_t quant);
401 :			; unsigned (pre-sign) quantized values, accumulated in 16-bit lanes.
402 :			;===========================================================================
403 :
404 :			align ALIGN
405 :			cglobal quant_mpeg_inter_xmm
406 :			quant_mpeg_inter_xmm:
407 :			mov eax, [esp + 8] ; data
408 :			mov ecx, [esp + 12] ; quant
409 :			mov edx, [esp + 4] ; coeff
410 :			push esi
411 :			push edi
412 :			push ebx
413 :			nop
414 :			mov edi,mmzero ; edi -> 8 zero bytes, used to re-clear mm0/mm3 each iteration
415 :			mov esi,-14 ; loop index: -14, -12, ..., 0 (8 iterations, 16 bytes each)
416 :			mov ebx,esp ; value is overwritten by the lea two lines below
417 :			sub esp,byte 24 ; scratch space for two qwords
418 :			lea ebx,[esp+8]
419 :			and ebx,byte -8 ;align 8
420 :			pxor mm0,mm0
421 :			pxor mm3,mm3
422 :			movq [byte ebx],mm0 ; [ebx] = previous iteration's second result (starts at 0)
423 :			db 0Fh, 7Fh, 44h, 23h, 8 ;movq [ebx+8],mm0 ; [ebx+8] = running sum (starts at 0)
424 :			cmp ecx,byte 1
425 :			je near .q1loop ; quant == 1: final division is a plain shift
426 :			cmp ecx,byte 19
427 :			jg near .lloop ; quant > 19: uses the mmx_div table plus a shift
428 :			nop
429 :
430 :			align ALIGN
431 :			.loop ; path for 1 < quant <= 19
432 :			movq mm1, [eax + 8*esi+112] ; mm1 = first 4 input words of this iteration
433 :			psubw mm0,mm1 ;-mm1
434 :			movq mm4, [eax + 8*esi + 120] ; mm4 = next 4 input words
435 :			psubw mm3,mm4 ;-mm4
436 :			pmaxsw mm0,mm1 ;|src|
437 :			pmaxsw mm3,mm4
438 :			nop2
439 :			psraw mm1,15 ;sign src
440 :			psraw mm4,15
441 :			psllw mm0, 4 ; level << 4
442 :			psllw mm3, 4 ;
443 :			paddw mm0, [inter_matrix1 + 8*esi+112] ; mm0 is the value to be divided by the matrix element
444 :			paddw mm3, [inter_matrix1 + 8*esi+120]
445 :			movq mm5,[inter_matrix_fixl + 8*esi+112]
446 :			movq mm7,[inter_matrix_fixl + 8*esi+120]
447 :			pmulhuw mm5,mm0 ; mm5: first approximation of the division
448 :			pmulhuw mm7,mm3
449 :			mov esp,esp
450 :			movq mm2,[inter_matrix + 8*esi+112]
451 :			movq mm6,[inter_matrix + 8*esi+120]
452 :			pmullw mm2,mm5 ; approximation * matrix element
453 :			pmullw mm6,mm7
454 :			psubw mm0,mm2 ; mismatch
455 :			psubw mm3,mm6
456 :			movq mm2,[byte ebx] ; x2 stored by the previous iteration
457 :			movq mm6,[mmx_divs + ecx * 8 - 8] ; mmx_divs entry for this quant
458 :			pmulhuw mm0,[inter_matrix_fix + 8*esi+112] ; correction derived from the mismatch
459 :			pmulhuw mm3,[inter_matrix_fix + 8*esi+120]
460 :			paddw mm2,[ebx+8] ;sum
461 :			paddw mm5,mm0 ; corrected result
462 :			paddw mm7,mm3
463 :			movq mm0,[edi] ; clear mm0/mm3 for the next iteration
464 :			movq mm3,[edi]
465 :			pmulhuw mm5, mm6 ; mm5 = (mm5 * mmx_divs[quant]) >> 16
466 :			pmulhuw mm7, mm6 ; (level ) / quant (0<quant<32)
467 :			add esi,byte 2 ; sets the flags tested by jng below; later stores compensate with -16
468 :			paddw mm2,mm5 ;sum += x1
469 :			movq [ebx],mm7 ;store x2
470 :			pxor mm5, mm1 ; mm5 *= sign(src)
471 :			pxor mm7, mm4 ;
472 :			psubw mm5, mm1 ; undisplace
473 :			psubw mm7, mm4 ;
474 :			db 0Fh, 7Fh, 54h, 23h, 08 ;movq [ebx+8],mm2 ;store sum
475 :			movq [edx + 8*esi+112-16], mm5
476 :			movq [edx + 8*esi +120-16], mm7
477 :			jng near .loop ; repeat while esi <= 0
478 :
479 :			.done
480 :			; add the last stored x2, restore registers, then reduce the 4 word sums to one value in eax
481 :			paddw mm2,[ebx]
482 :			mov ebx,[esp+24]
483 :			mov edi,[esp+4+24]
484 :			mov esi,[esp+8+24]
485 :			add esp,byte 12+24 ; drop the scratch area and the three saved registers
486 :			pmaddwd mm2, [mmx_one] ; two dword partial sums
487 :			punpckldq mm0,mm2 ;get low dw to mm0:high
488 :			paddd mm0,mm2 ; high dword = low partial sum + high partial sum
489 :			punpckhdq mm0,mm0 ;get result to low
490 :			movd eax, mm0 ; return value
491 :
492 :			ret
493 :
494 :			align ALIGN
495 :			.q1loop ; path for quant == 1
496 :			movq mm1, [eax + 8*esi+112] ; mm1 = first 4 input words of this iteration
497 :			psubw mm0,mm1 ;-mm1
498 :			movq mm4, [eax + 8*esi+120] ; mm4 = next 4 input words
499 :			psubw mm3,mm4 ;-mm4
500 :			pmaxsw mm0,mm1 ;|src|
501 :			pmaxsw mm3,mm4
502 :			nop2
503 :			psraw mm1,15 ;sign src
504 :			psraw mm4,15
505 :			psllw mm0, 4 ; level << 4
506 :			psllw mm3, 4
507 :			paddw mm0, [inter_matrix1 + 8*esi+112] ;mm0 is to be divided
508 :			paddw mm3, [inter_matrix1 + 8*esi+120] ; inter1 contains fix for division by 1
509 :			movq mm5,[inter_matrix_fixl + 8*esi+112] ;with rounding down
510 :			movq mm7,[inter_matrix_fixl + 8*esi+120]
511 :			pmulhuw mm5,mm0
512 :			pmulhuw mm7,mm3 ;mm7: first approx of division
513 :			mov esp,esp
514 :			movq mm2,[inter_matrix + 8*esi+112]
515 :			movq mm6,[inter_matrix + 8*esi+120] ; matrix elements
516 :			pmullw mm2,mm5 ;test value <= original
517 :			pmullw mm6,mm7
518 :			psubw mm0,mm2 ;mismatch
519 :			psubw mm3,mm6
520 :			movq mm2,[byte ebx] ; x2 stored by the previous iteration
521 :			pmulhuw mm0,[inter_matrix_fix + 8*esi+112] ;correction
522 :			pmulhuw mm3,[inter_matrix_fix + 8*esi+120]
523 :			paddw mm2,[ebx+8] ;sum
524 :			paddw mm5,mm0 ;final result
525 :			paddw mm7,mm3
526 :			movq mm0,[edi] ; clear mm0/mm3 for the next iteration
527 :			movq mm3,[edi]
528 :			psrlw mm5, 1 ; (level ) /2 (quant = 1)
529 :			psrlw mm7, 1
530 :			add esi,byte 2 ; sets the flags tested by jng below; later stores compensate with -16
531 :			paddw mm2,mm5 ;sum += x1
532 :			movq [ebx],mm7 ;store x2
533 :			pxor mm5, mm1 ; mm5 *= sign(src)
534 :			pxor mm7, mm4 ;
535 :			psubw mm5, mm1 ; undisplace
536 :			psubw mm7, mm4 ;
537 :			movq [ebx+8],mm2 ;store sum
538 :			movq [edx + 8*esi+112-16], mm5
539 :			movq [edx + 8*esi +120-16], mm7
540 :			jng near .q1loop ; repeat while esi <= 0
541 :			jmp near .done
542 :
543 :			align 8
544 :			.lloop ; path for quant > 19
545 :			movq mm1, [eax + 8*esi+112] ; mm1 = first 4 input words of this iteration
546 :			psubw mm0,mm1 ;-mm1
547 :			movq mm4, [eax + 8*esi+120] ; mm4 = next 4 input words
548 :			psubw mm3,mm4 ;-mm4
549 :			pmaxsw mm0,mm1 ;|src|
550 :			pmaxsw mm3,mm4
551 :			nop2
552 :			psraw mm1,15 ;sign src
553 :			psraw mm4,15
554 :			psllw mm0, 4 ; level << 4
555 :			psllw mm3, 4 ;
556 :			paddw mm0, [inter_matrix1 + 8*esi+112] ;mm0 is to be divided inter1 contains fix for division by 1
557 :			paddw mm3, [inter_matrix1 + 8*esi+120]
558 :			movq mm5,[inter_matrix_fixl + 8*esi+112]
559 :			movq mm7,[inter_matrix_fixl + 8*esi+120]
560 :			pmulhuw mm5,mm0
561 :			pmulhuw mm7,mm3 ;mm7: first approx of division
562 :			mov esp,esp
563 :			movq mm2,[inter_matrix + 8*esi+112]
564 :			movq mm6,[inter_matrix + 8*esi+120]
565 :			pmullw mm2,mm5 ;test value <= original
566 :			pmullw mm6,mm7
567 :			psubw mm0,mm2 ;mismatch
568 :			psubw mm3,mm6
569 :			movq mm2,[byte ebx] ; x2 stored by the previous iteration
570 :			movq mm6,[mmx_div + ecx * 8 - 8] ; mmx_div entry for this quant (quant > 19 here)
571 :			pmulhuw mm0,[inter_matrix_fix + 8*esi+112] ;correction
572 :			pmulhuw mm3,[inter_matrix_fix + 8*esi+120]
573 :			paddw mm2,[ebx+8] ;sum
574 :			paddw mm5,mm0 ;final result
575 :			paddw mm7,mm3
576 :			movq mm0,[edi] ; clear mm0/mm3 for the next iteration
577 :			movq mm3,[edi]
578 :			pmulhuw mm5, mm6 ; mm5 = (mm5 * mmx_div[quant]) >> 16
579 :			pmulhuw mm7, mm6 ; (level ) / quant (0<quant<32)
580 :			add esi,byte 2 ; sets the flags tested by jng below; later stores compensate with -16
581 :			psrlw mm5, 1 ; (level ) / (2*quant)
582 :			paddw mm2,mm5 ;sum += x1
583 :			psrlw mm7, 1
584 :			movq [ebx],mm7 ;store x2
585 :			pxor mm5, mm1 ; mm5 *= sign(src)
586 :			pxor mm7, mm4 ;
587 :			psubw mm5, mm1 ; undisplace
588 :			psubw mm7, mm4 ;
589 :			db 0Fh, 7Fh, 54h, 23h, 08 ;movq [ebx+8],mm2 ;store sum
590 :			movq [edx + 8*esi+112-16], mm5
591 :			movq [edx + 8*esi +120-16], mm7
592 :			jng near .lloop ; repeat while esi <= 0
593 :			jmp near .done
594 :
595 :
596 :			;===========================================================================
597 :			;
598 :			; void dequant_mpeg_intra_3dne(int16_t *data,
599 :			; const int16_t const *coeff,
600 :			; const uint32_t quant,
601 :			; const uint32_t dcscalar);
602 :			;
603 :			;===========================================================================
604 :
605 :			; Note: in order to saturate 'easily', we pre-shift the quantifier
606 :			; (psllw by 2 in dequant_mpeg_intra_3dne). Then, the high-word of (coeff[]*matrix[i]*quant) are used to
607 :			; build a saturating mask. It is non-zero only when an overflow occurred.
608 :			; We thus avoid packing/unpacking toward double-word.
609 :			; Moreover, we perform the mult (matrix[i]*quant) first, instead of, e.g.,
610 :			; (coeff[i]*matrix[i]). This is less prone to overflow if coeff[] are not
611 :			; checked. Input ranges are: coeff in [-127,127], matrix in [1..255],
612 :			; and quant in [1..31].
613 :			; DEQUANT4INTRAMMX n: dequantizes words 8n..8n+7. Expects ecx = coeff, edx = data, eax -> {zero qword, preshifted quant qword}, mm0 = mm3 = 0; for n = 0 also mm2 = mm7 = preshifted quant.
614 :			;********************************************************************
615 :			%macro DEQUANT4INTRAMMX 1
616 :			movq mm1, [byte ecx+ 16 * %1] ; mm1 = c = coeff[i]
617 :			movq mm4, [ecx+ 16 * %1 +8]; mm4 = c' = coeff[i+1]
618 :			psubw mm0,mm1 ; -c
619 :			psubw mm3,mm4
620 :			pmaxsw mm0,mm1 ; |c|
621 :			pmaxsw mm3,mm4
622 :			psraw mm1,15 ; sign mask of c
623 :			psraw mm4,15
624 :			%if %1
625 :			movq mm2,[eax+8] ;preshifted quant
626 :			movq mm7,[eax+8]
627 :			%endif
628 :			pmullw mm2, [intra_matrix + 16 * %1 ] ; matrix[i]*quant
629 :			pmullw mm7, [intra_matrix + 16 * %1 +8] ; matrix[i+1]*quant
630 :			movq mm5,mm0 ; keep |c| for the low product
631 :			movq mm6,mm3
632 :			pmulhw mm0, mm2 ; high of coeff*(matrix*quant)
633 :			pmulhw mm3, mm7 ; high of coeff*(matrix*quant)
634 :			pmullw mm2, mm5 ; low of coeff*(matrix*quant)
635 :			pmullw mm7, mm6 ; low of coeff*(matrix*quant)
636 :			pcmpgtw mm0, [eax] ; overflow mask: all ones where the high word is > 0
637 :			pcmpgtw mm3, [eax]
638 :			paddusw mm2, mm0 ; saturate the low product where it overflowed
639 :			paddusw mm7, mm3
640 :			psrlw mm2, 5
641 :			psrlw mm7, 5
642 :			pxor mm2, mm1 ; start negating back
643 :			pxor mm7, mm4 ; start negating back
644 :			psubusw mm1, mm0 ; sign mask becomes 0 where the overflow mask is set
645 :			psubusw mm4, mm3
646 :			movq mm0,[eax] ;zero
647 :			movq mm3,[eax] ;zero
648 :			psubw mm2, mm1 ; finish negating back
649 :			psubw mm7, mm4 ; finish negating back
650 :			movq [byte edx + 16 * %1], mm2 ; data[i]
651 :			movq [edx + 16 * %1 +8], mm7 ; data[i+1]
652 :			%endmacro
653 :
654 :			align 16
655 :			cglobal dequant_mpeg_intra_3dne
656 :			dequant_mpeg_intra_3dne:
657 :			mov eax, [esp+12] ; quant
658 :			mov ecx, [esp+8] ; coeff
659 :			movq mm7, [mmx_mul_quant + eax*8 - 8] ; four words equal to quant
660 :			psllw mm7, 2 ; << 2. See comment.
661 :			mov edx, [esp+4] ; data
662 :			push ebx
663 :			movsx ebx,word [ecx] ; ebx = coeff[0], sign-extended
664 :			pxor mm0, mm0
665 :			pxor mm3, mm3
666 :			push esi
667 :			lea eax,[esp-28]
668 :			sub esp,byte 32 ; scratch space on the stack
669 :			and eax,byte -8 ;points to qword aligned space on stack
670 :			movq [eax],mm0 ; [eax] = zero qword
671 :			movq [eax+8],mm7 ; [eax+8] = preshifted quant
672 :			imul ebx,[esp+16+8+32] ; dcscalar
673 :			movq mm2,mm7 ; the first macro expansion expects mm2 = mm7 = preshifted quant
674 :
675 :			; The scalar DC instructions below are interleaved with the macro expansions; the macro contains only MMX instructions, so the flags set by cmp survive until the cmov.
676 :			align 4
677 :
678 :			DEQUANT4INTRAMMX 0
679 :
680 :			mov esi,-2048
681 :			nop
682 :			cmp ebx,esi
683 :
684 :			DEQUANT4INTRAMMX 1
685 :
686 :			cmovl ebx, esi ; clamp the DC value to >= -2048
687 :			neg esi
688 :			sub esi, byte 1 ;2047
689 :
690 :			DEQUANT4INTRAMMX 2
691 :
692 :			cmp ebx, esi
693 :			cmovg ebx, esi ; clamp the DC value to <= 2047
694 :			lea ebp, [byte ebp] ; leaves ebp unchanged
695 :
696 :			DEQUANT4INTRAMMX 3
697 :
698 :			mov esi, [esp+32] ; restore esi
699 :			mov [byte edx], bx ; data[0] = clamped coeff[0] * dcscalar (overwrites the value stored by expansion 0)
700 :			mov ebx, [esp+32+4] ; restore ebx
701 :
702 :			DEQUANT4INTRAMMX 4
703 :			DEQUANT4INTRAMMX 5
704 :			DEQUANT4INTRAMMX 6
705 :			DEQUANT4INTRAMMX 7
706 :
707 :			add esp, byte 32+8 ; drop the scratch area and the two saved registers
708 :
709 :			ret
710 :
711 :			;===========================================================================
712 :			; Dequantizes 64 words from coeff[] into data[]; arguments are read from the stack.
713 :			; void dequant_mpeg_inter_3dne(int16_t * data,
714 :			; const int16_t * const coeff,
715 :			; const uint32_t quant);
716 :			; After the loop, the low bit of data[63] is toggled when the XOR of all outputs has a clear low bit.
717 :			;===========================================================================
718 :
719 :			; Note: We use (2*c + sgn(c) - sgn(-c)) as multiplier
720 :			; so we handle the 3 cases: c<0, c==0, and c>0 in one shot.
721 :			; sgn(x) is the result of 'pcmpgtw 0,x': 0 if x>=0, -1 if x<0.
722 :			; It's mixed with the extraction of the absolute value.
723 :
724 :			align 16
725 :			cglobal dequant_mpeg_inter_3dne
726 :			dequant_mpeg_inter_3dne:
727 :			mov edx, [esp+ 4] ; data
728 :			mov ecx, [esp+ 8] ; coeff
729 :			mov eax, [esp+12] ; quant
730 :			movq mm7, [mmx_mul_quant + eax*8 - 8] ; four words equal to quant
731 :			mov eax, -14 ; loop index: -14, -12, ..., 0 (8 iterations, 16 bytes each)
732 :			paddw mm7, mm7 ; << 1
733 :			pxor mm6, mm6 ; mismatch sum
734 :			push esi
735 :			mov esi,mmzero ; esi -> 8 zero bytes
736 :			pxor mm1,mm1
737 :			pxor mm3,mm3
738 :			nop
739 :			nop4
740 :
741 :			align 16
742 :			.loop
743 :			movq mm0, [ecx+8*eax + 7*16 ] ; mm0 = coeff[i]
744 :			pcmpgtw mm1, mm0 ; mm1 = sgn(c) (preserved)
745 :			movq mm2, [ecx+8*eax + 7*16 +8] ; mm2 = coeff[i+1]
746 :			pcmpgtw mm3, mm2 ; mm3 = sgn(c') (preserved)
747 :			paddsw mm0, mm1 ; c += sgn(c)
748 :			paddsw mm2, mm3 ; c += sgn(c')
749 :			paddw mm0, mm0 ; c *= 2
750 :			paddw mm2, mm2 ; c'*= 2
751 :
752 :			movq mm4, [esi]
753 :			movq mm5, [esi]
754 :			psubw mm4, mm0 ; -c
755 :			psubw mm5, mm2 ; -c'
756 :
757 :			psraw mm4, 16 ; mm4 = sgn(-c)
758 :			psraw mm5, 16 ; mm5 = sgn(-c')
759 :			psubsw mm0, mm4 ; c -= sgn(-c)
760 :			psubsw mm2, mm5 ; c' -= sgn(-c')
761 :			pxor mm0, mm1 ; finish changing sign if needed
762 :			pxor mm2, mm3 ; finish changing sign if needed
763 :
764 :			; we're short on register, here. Poor pairing...
765 :
766 :			movq mm4, mm7 ; (matrix*quant)
767 :			nop
768 :			pmullw mm4, [inter_matrix + 8*eax + 7*16]
769 :			movq mm5, mm4
770 :			pmulhw mm5, mm0 ; high of c*(matrix*quant)
771 :			pmullw mm0, mm4 ; low of c*(matrix*quant)
772 :
773 :			movq mm4, mm7 ; (matrix*quant)
774 :			pmullw mm4, [inter_matrix + 8*eax + 7*16 + 8]
775 :			add eax,byte 2 ; sets the flags tested by jng below; the stores compensate with -2*8
776 :
777 :			pcmpgtw mm5, [esi] ; overflow mask: all ones where the high word is > 0
778 :			paddusw mm0, mm5 ; saturate the low product where it overflowed
779 :			psrlw mm0, 5
780 :			pxor mm0, mm1 ; start restoring sign
781 :			psubusw mm1, mm5 ; sign mask becomes 0 where the overflow mask is set
782 :
783 :			movq mm5, mm4
784 :			pmulhw mm5, mm2 ; high of c*(matrix*quant)
785 :			pmullw mm2, mm4 ; low of c*(matrix*quant)
786 :			psubw mm0, mm1 ; finish restoring sign
787 :
788 :			pcmpgtw mm5, [esi]
789 :			paddusw mm2, mm5
790 :			psrlw mm2, 5
791 :			pxor mm2, mm3 ; start restoring sign
792 :			psubusw mm3, mm5
793 :			psubw mm2, mm3 ; finish restoring sign
794 :			movq mm1, [esi] ; clear mm1/mm3 for the next iteration
795 :			movq mm3, [byte esi]
796 :			pxor mm6, mm0 ; mismatch control
797 :			movq [edx + 8*eax + 7*16 -2*8 ], mm0 ; data[i]
798 :			pxor mm6, mm2 ; mismatch control
799 :			movq [edx + 8*eax + 7*16 -2*8 +8], mm2 ; data[i+1]
800 :
801 :			jng .loop ; repeat while eax <= 0
802 :			nop
803 :
804 :			; mismatch control: fold the four words of mm6 into its low word with XOR
805 :
806 :			pshufw mm0,mm6,01010101b ; broadcast word 1
807 :			pshufw mm1,mm6,10101010b ; broadcast word 2
808 :			pshufw mm2,mm6,11111111b ; broadcast word 3
809 :			pxor mm6, mm0
810 :			pxor mm1, mm2
811 :			pxor mm6, mm1
812 :			movd eax, mm6
813 :			and eax,byte 1
814 :			xor eax,byte 1 ; eax = 1 when the folded low bit is 0
815 :			mov esi,[esp] ; restore esi
816 :			add esp,byte 4
817 :			xor word [edx + 2*63], ax ; toggle the low bit of data[63] when eax = 1
818 :
819 :			ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4