Annotation of /trunk/xvidcore/src/quant/x86_asm/quantize_h263_3dne.asm

Revision 1540 - (view) (download)

1 :	edgomez	1382	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * - 3dne Quantization/Dequantization -
5 :			; *
6 :			; * Copyright(C) 2002-2003 Jaan Kalda
7 :			; *
8 :			; * This program is free software ; you can redistribute it and/or modify
9 :			; * it under the terms of the GNU General Public License as published by
10 :			; * the Free Software Foundation ; either version 2 of the License, or
11 :			; * (at your option) any later version.
12 :			; *
13 :			; * This program is distributed in the hope that it will be useful,
14 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
15 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 :			; * GNU General Public License for more details.
17 :			; *
18 :			; * You should have received a copy of the GNU General Public License
19 :			; * along with this program ; if not, write to the Free Software
20 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 :			; *
22 :	edgomez	1540	; * $Id: quantize_h263_3dne.asm,v 1.5 2004-08-29 10:02:38 edgomez Exp $
23 :	edgomez	1382	; *
24 :			; *************************************************************************/
25 :			;
26 :			; these 3dne functions are compatible with iSSE, but are optimized specifically for
27 :			; K7 pipelines
28 :
29 :			; enable dequant saturate [-2048,2047], test purposes only.
			; NOTE(review): no %ifdef/%ifndef SATURATE is visible in this file, so this
			; define appears to have no effect here -- confirm before relying on it.
30 :			%define SATURATE
31 :
			; Assemble as 32-bit code; the functions below fetch their arguments
			; from the stack through [esp + n].
32 :			BITS 32
33 :
34 :			%macro cglobal 1
			; cglobal <name> -- declare <name> as a global (exported) symbol.
			;   PREFIX     : export it with a leading underscore (_<name>) and
			;                %define <name> so later uses of the bare name expand
			;                to the underscored form.
			;   MARK_FUNCS : append ":function <name>.endfunc-<name>" to the global
			;                directive, i.e. type the symbol as a function whose size
			;                is the distance to its local .endfunc label.  Functions
			;                declared with this macro therefore end with .endfunc.
			; NOTE(review): with PREFIX and MARK_FUNCS both set, <name> is defined as
			; the whole "_<name>:function ..." text rather than just "_<name>"; that
			; looks unintended -- confirm this combination is never built.
35 :			%ifdef PREFIX
36 :	edgomez	1535	%ifdef MARK_FUNCS
37 :	edgomez	1540	global _%1:function %1.endfunc-%1
38 :			%define %1 _%1:function %1.endfunc-%1
39 :	edgomez	1535	%else
40 :			global _%1
41 :			%define %1 _%1
42 :			%endif
43 :	edgomez	1382	%else
44 :	edgomez	1535	%ifdef MARK_FUNCS
45 :	edgomez	1540	global %1:function %1.endfunc-%1
46 :	edgomez	1535	%else
47 :			global %1
48 :			%endif
49 :	edgomez	1382	%endif
50 :			%endmacro
51 :
52 :			;=============================================================================
53 :			; Local data
54 :			;=============================================================================
55 :
56 :			%ifdef FORMAT_COFF
			; Read-only constant tables.  COFF builds use a plain .rodata section,
			; every other format additionally requests 16-byte section alignment.
57 :	edgomez	1519	SECTION .rodata
58 :	edgomez	1382	%else
59 :	edgomez	1519	SECTION .rodata align=16
60 :	edgomez	1382	%endif
61 :
			; int_div[d] = (65536 / d) + 1 (integer division) for d = 1..255;
			; entry 0 is a dummy 0.  The intra quantizers divide the DC term by
			; dcscalar with "imul eax, [int_div + 4*edi]" followed by "sar eax, 16".
62 :			align 4
63 :			int_div:
64 :			dd 0
65 :			%assign i 1
66 :			%rep 255
67 :			dd (1 << 16) / (i) + 1
68 :			%assign i i+1
69 :			%endrep
70 :
			; plus_one: eight words of 1.  Used as the pmaddwd operand that folds the
			; 16-bit partial sums of the inter quantizer into 32-bit sums.
71 :			ALIGN 16
72 :			plus_one:
73 :			times 8 dw 1
74 :
75 :			;-----------------------------------------------------------------------------
76 :			; subtract by Q/2 table
			;
			; One 8-byte entry (four identical words holding Q/2, integer division)
			; per quantizer Q = 1..31.  The entry for Q lives at mmx_sub + Q*8 - 8.
			; Read by quant_h263_inter_3dne (psubusw).
77 :			;-----------------------------------------------------------------------------
78 :
79 :			ALIGN 16
80 :			mmx_sub:
81 :			%assign i 1
82 :			%rep 31
83 :			times 4 dw i / 2
84 :			%assign i i+1
85 :			%endrep
86 :
87 :
88 :			;-----------------------------------------------------------------------------
89 :			;
90 :			; divide by 2Q table
91 :			;
92 :			; use a shift of 16 to take full advantage of _pmulhw_
93 :			; for q=1, _pmulhw_ will overflow so it is treated separately
94 :			; (3dnow2 provides _pmulhuw_ which will not cause overflow)
95 :			;
			; One 8-byte entry (four identical words holding 65536/(2Q) + 1) per
			; quantizer Q = 1..31, located at mmx_div + Q*8 - 8.  For Q = 1 the
			; value is 32769, which does not fit a signed word; both quantizers
			; branch to a shift-based .q1loop path for that case.
96 :			;-----------------------------------------------------------------------------
97 :
98 :			ALIGN 16
99 :			mmx_div:
100 :			%assign i 1
101 :			%rep 31
102 :			times 4 dw (1 << 16) / (i * 2) + 1
103 :			%assign i i+1
104 :			%endrep
105 :
106 :			;-----------------------------------------------------------------------------
107 :			; add by (odd(Q) ? Q : Q - 1) table
			;
			; One 8-byte entry (four identical words) per quantizer Q = 1..31,
			; located at mmx_add + Q*8 - 8.  The dequantizers add this offset to
			; every non-zero coefficient (see the "dequant" macro, steps 9-11).
108 :			;-----------------------------------------------------------------------------
109 :
110 :			ALIGN 16
111 :			mmx_add:
112 :			%assign i 1
113 :			%rep 31
114 :			%if i % 2 != 0
115 :			times 4 dw i
116 :			%else
117 :			times 4 dw i - 1
118 :			%endif
119 :			%assign i i+1
120 :			%endrep
121 :
122 :			;-----------------------------------------------------------------------------
123 :			; multiply by 2Q table
			;
			; One 8-byte entry (four identical words holding 2*Q) per quantizer
			; Q = 1..31, located at mmx_mul + Q*8 - 8.  Operand of the pmullw in
			; the dequantizers.
124 :			;-----------------------------------------------------------------------------
125 :
126 :			ALIGN 16
127 :			mmx_mul:
128 :			%assign i 1
129 :			%rep 31
130 :			times 4 dw i * 2
131 :			%assign i i+1
132 :			%endrep
133 :
134 :			;-----------------------------------------------------------------------------
135 :			; saturation limits
136 :			;-----------------------------------------------------------------------------
137 :
			; NOTE(review): mmx_32768_minus_2048 and mmx_32767_minus_2047 are not
			; referenced by any code visible in this file.
138 :			ALIGN 8
139 :			mmx_32768_minus_2048:
140 :			times 4 dw (32768-2048)
141 :			mmx_32767_minus_2047:
142 :			times 4 dw (32767-2047)
143 :
			; Upper clamp used by the dequantizers: pminsw against four words of 2047.
144 :			ALIGN 16
145 :			mmx_2047:
146 :			times 4 dw 2047
147 :
			; mmzero: eight zero bytes; the code keeps a pointer to it in a register
			; and loads it with movq whenever a zeroed MMX register is needed.
148 :			ALIGN 8
149 :			mmzero:
150 :			dd 0, 0
			; 32-bit clamp limits for the DC term in dequant_h263_intra_3dne
			; (memory operands of cmovg / cmovl).
151 :			int2047:
152 :			dd 2047
153 :			int_2048:
154 :			dd -2048
155 :
156 :			;=============================================================================
157 :			; Code
158 :			;=============================================================================
159 :
160 :			SECTION .text
161 :
162 :
163 :			;-----------------------------------------------------------------------------
164 :			;
165 :			; uint32_t quant_h263_intra_3dne(int16_t * coeff,
166 :			; const int16_t const * data,
167 :			; const uint32_t quant,
168 :			; const uint32_t dcscalar,
169 :			; const uint16_t *mpeg_matrices);
170 :			;
171 :			;-----------------------------------------------------------------------------
172 :			;This is Athlon-optimized code (ca 70 clk per call)
173 :
174 :			%macro quant_intra1 1
			; quant_intra1 <n>, n = 0..3
			; One 32-byte step (16 coefficients) of the intra quantizer for the
			; quant == 1 case, where the division by 2*quant is a shift right by one.
			;
			; Expects: ecx = data (input), edx = coeff (output), ebx -> 8 zero bytes.
			; Each step works on four 8-byte groups A, B, C, D.  The trailing tags
			; give group and pipeline stage:
			;   1 load x        2 load zero      3 0 - x        4 max(x, -x) = abs
			;   5 sign mask     6 divide         7, 8 xor/sub with the mask (sign back)
			;   9 store
			; Software pipelining: groups A/B of step n+1 are loaded near the end of
			; step n (those of step 0 by the caller); groups C/D of step n are
			; finished (stages 8, 9) at the start of step n+1 (those of step 3 by
			; the caller).
			; Side effects: step 0 pushes ebp (esp drops by 4); step 3 executes the
			; DC multiply "imul eax, [int_div+4*edi]" on behalf of the caller.
			; The db sequences are hand-encoded "movq mmN, [ecx + disp8]" using a
			; SIB byte, i.e. one byte longer than the default encoding; presumably
			; this is done to control instruction placement (see the Athlon rules
			; in quant_intra).
175 :			psubw mm1, mm0 ;A3
176 :			psubw mm3, mm2 ;B3
177 :			%if (%1)
178 :			psubw mm5, mm4 ;C8
179 :			psubw mm7, mm6 ;D8
180 :			%endif
181 :
182 :			ALIGN 8
183 :			db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16] ;C1
184 :			pmaxsw mm1, mm0 ;A4
185 :			db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24] ;D1
186 :			pmaxsw mm3, mm2 ;B4
187 :
188 :
189 :			psraw mm0, 15 ;A5
190 :			psraw mm2, 15 ;B5
191 :			%if (%1)
192 :			movq [edx + %1 * 32 + 16-32], mm5 ;C9 (group C of the previous step)
193 :			movq [edx + %1 * 32 + 24-32], mm7 ;D9 (group D of the previous step)
194 :			%endif
195 :
196 :			psrlw mm1, 1 ;A6 abs / 2
197 :			psrlw mm3, 1 ;B6
198 :			movq mm5, [ebx] ;C2
199 :			movq mm7, [ebx] ;D2
200 :
201 :			pxor mm1, mm0 ;A7
202 :			pxor mm3, mm2 ;B7
203 :
204 :			psubw mm5, mm4 ;C3
205 :			psubw mm7, mm6 ;D3
206 :			psubw mm1, mm0 ;A8
207 :			psubw mm3, mm2 ;B8
208 :
209 :			%if (%1 == 0)
210 :			push ebp ; saved here; callers account for the extra 4 bytes on the stack
211 :			movq mm0, [ecx + %1 * 32 +32] ;A1 (next step)
212 :			%elif (%1 < 3)
213 :			db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 (next step)
214 :			%endif
215 :			pmaxsw mm5, mm4 ;C4
216 :			%if (%1 < 3)
217 :			db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 (next step)
218 :			%else
219 :			cmp esp, esp ; no load needed in the last step; result unused (presumably filler)
220 :			%endif
221 :			pmaxsw mm7, mm6 ;D4
222 :
223 :			psraw mm4, 15 ;C5
224 :			psraw mm6, 15 ;D5
225 :			movq [byte edx + %1 * 32], mm1 ;A9
226 :			movq [edx + %1 * 32+8], mm3 ;B9
227 :
228 :
229 :			psrlw mm5, 1 ;C6
230 :			psrlw mm7, 1 ;D6
231 :			%if (%1 < 3)
232 :			movq mm1, [ebx] ;A2 (next step)
233 :			movq mm3, [ebx] ;B2 (next step)
234 :			%endif
235 :			%if (%1 == 3)
236 :			imul eax, [int_div+4*edi] ; DC term times the 16.16 reciprocal of edi
237 :			%endif
238 :			pxor mm5, mm4 ;C7
239 :			pxor mm7, mm6 ;D7
240 :			%endm
241 :
242 :
243 :			%macro quant_intra 1
			; quant_intra <n>, n = 0..3
			; One 32-byte step (16 coefficients) of the intra quantizer for
			; quant > 1.  Same structure as quant_intra1, except that stage 6
			; divides by 2*quant with pmulhw against the reciprocal at [esi].
			;
			; Expects: ecx = data (input), edx = coeff (output), ebx -> 8 zero bytes,
			;          esi -> the mmx_div entry of the quantizer.
			; Tags: group A/B/C/D (four 8-byte groups per step) and stage:
			;   1 load x        2 load zero      3 0 - x        4 max(x, -x) = abs
			;   5 sign mask     6 divide         7, 8 xor/sub with the mask (sign back)
			;   9 store
			; Groups A/B of step n+1 are loaded near the end of step n (step 0 by the
			; caller); groups C/D of step n are finished at the start of step n+1
			; (step 3 by the caller).
			; Side effects: step 0 pushes ebp (esp drops by 4); step 3 executes the
			; DC multiply "imul eax, [int_div+4*edi]" on behalf of the caller.
			; The db sequences are hand-encoded "movq mmN, [ecx + disp8]" with a
			; SIB byte (one byte longer than the default encoding).
244 :			; Rules for athlon:
245 :			; 1) schedule latencies
246 :			; 2) add/mul and load/store in 2:1 proportion
247 :			; 3) avoid splitting >3byte instructions over 8byte boundaries
248 :
249 :			psubw mm1, mm0 ;A3
250 :			psubw mm3, mm2 ;B3
251 :			%if (%1)
252 :			psubw mm5, mm4 ;C8
253 :			psubw mm7, mm6 ;D8
254 :			%endif
255 :
256 :			ALIGN 8
257 :			db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16] ;C1
258 :			pmaxsw mm1, mm0 ;A4
259 :			db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24] ;D1
260 :			pmaxsw mm3, mm2 ;B4
261 :
262 :
263 :			psraw mm0, 15 ;A5
264 :			psraw mm2, 15 ;B5
265 :			%if (%1)
266 :			movq [edx + %1 * 32 + 16-32], mm5 ;C9 (group C of the previous step)
267 :			movq [edx + %1 * 32 + 24-32], mm7 ;D9 (group D of the previous step)
268 :			%endif
269 :
270 :			pmulhw mm1, [esi] ;A6 (abs * reciprocal) >> 16
271 :			pmulhw mm3, [esi] ;B6
272 :			movq mm5, [ebx] ;C2
273 :			movq mm7, [ebx] ;D2
274 :
275 :			nop
276 :			nop
277 :			pxor mm1, mm0 ;A7
278 :			pxor mm3, mm2 ;B7
279 :
280 :			psubw mm5, mm4 ;C3
281 :			psubw mm7, mm6 ;D3
282 :			psubw mm1, mm0 ;A8
283 :			psubw mm3, mm2 ;B8
284 :
285 :
286 :			%if (%1 < 3)
287 :			db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 (next step)
288 :			%endif
289 :			pmaxsw mm5, mm4 ;C4
290 :			%if (%1 < 3)
291 :			db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 (next step)
292 :			%else
293 :			cmp esp, esp ; no load needed in the last step; result unused (presumably filler)
294 :			%endif
295 :			pmaxsw mm7,mm6 ;D4
296 :
297 :			psraw mm4, 15 ;C5
298 :			psraw mm6, 15 ;D5
299 :			movq [byte edx + %1 * 32], mm1 ;A9
300 :			movq [edx + %1 * 32+8], mm3 ;B9
301 :
302 :
303 :			pmulhw mm5, [esi] ;C6
304 :			pmulhw mm7, [esi] ;D6
305 :			%if (%1 < 3)
306 :			movq mm1, [ebx] ;A2 (next step)
307 :			movq mm3, [ebx] ;B2 (next step)
308 :			%endif
309 :			%if (%1 == 0)
310 :			push ebp ; saved here; callers account for the extra 4 bytes on the stack
311 :			%elif (%1 < 3)
312 :			nop
313 :			%endif
314 :			nop
315 :			%if (%1 == 3)
316 :			imul eax, [int_div+4*edi] ; DC term times the 16.16 reciprocal of edi
317 :			%endif
318 :			pxor mm5, mm4 ;C7
319 :			pxor mm7, mm6 ;D7
320 :			%endmacro
321 :
322 :
323 :			ALIGN 16
			; quant_h263_intra_3dne(coeff, data, quant, dcscalar, mpeg_matrices)
			;
			; Stack arguments on entry:
			;   [esp +  4] coeff     -> edx, output block (4 steps x 32 bytes)
			;   [esp +  8] data      -> ecx, input block
			;   [esp + 12] quant     -> eax, selects the mmx_div entry; quant == 1
			;                           takes the shift-based .q1loop path
			;   [esp + 16] dcscalar     read later, once four dwords have been pushed
			;   mpeg_matrices is not read.
			; Returns 0 in eax.
			;
			; AC terms are produced by the quant_intra / quant_intra1 macros.  The
			; DC term is computed with integer instructions interleaved between the
			; macro invocations:
			;   coeff[0] = (DC +/- dcscalar/2) / dcscalar
			; using the reciprocal table int_div, and is stored last so that it
			; overwrites the AC-style result written for coefficient 0.
			;
			; esi, ebx, edi (pushed here) and ebp (pushed inside step 0 of the
			; macros) are reloaded from the stack with mov; the four slots are then
			; dropped with a single "add esp, 16".
324 :			cglobal quant_h263_intra_3dne
325 :			quant_h263_intra_3dne:
326 :
327 :			mov eax, [esp + 12] ; quant
328 :			mov ecx, [esp + 8] ; data
329 :			mov edx, [esp + 4] ; coeff
330 :			cmp al, 1 ; ZF is consumed by the jz below
331 :			pxor mm1, mm1 ;A2 for step 0
332 :			pxor mm3, mm3 ;B2 for step 0
333 :			movq mm0, [ecx] ;A1 for step 0
334 :			movq mm2, [ecx + 8] ;B1 for step 0
335 :			push esi
336 :			lea esi, [mmx_div + eax*8 - 8] ; esi -> reciprocal of 2*quant
337 :
338 :			push ebx
339 :			mov ebx, mmzero ; ebx -> 8 zero bytes
340 :			push edi
341 :			jz near .q1loop ; flags still from "cmp al, 1": nothing in between writes EFLAGS
342 :
343 :			quant_intra 0
344 :			mov ebp, [esp + 16 + 16] ; dcscalar
345 :			; NB -- there are 3 pushes in the function preamble and one more
346 :			; in "quant_intra 0", thus an added offset of 16 bytes
347 :			movsx eax, word [byte ecx] ; DC
348 :
349 :			quant_intra 1
350 :			mov edi, eax
351 :			sar edi, 31 ; edi = sign mask of DC (0 or -1)
352 :			shr ebp, byte 1 ; ebp = dcscalar/2
353 :
354 :			quant_intra 2
355 :			sub eax, edi ; DC, plus 1 when DC < 0
356 :			xor ebp, edi ; dcscalar/2, or -(dcscalar/2) - 1 when DC < 0
357 :			mov edi, [esp + 16 + 16] ; dcscalar (table index for the imul in step 3)
358 :			lea eax, [byte eax + ebp] ; DC + sign(DC) * (dcscalar/2)
359 :			mov ebp, [byte esp] ; reload the ebp pushed by "quant_intra 0"
360 :
361 :			quant_intra 3
362 :			psubw mm5, mm4 ;C8
363 :			mov esi, [esp + 12] ; reload saved esi
364 :			mov edi, [esp + 4] ; reload saved edi
365 :			sar eax, 16 ; drop the 16 fraction bits of the reciprocal product
366 :			lea ebx, [byte eax + 1] ; workaround for eax < 0 (lea leaves the flags of sar intact)
367 :			cmovs eax, ebx ; conditionally move the corrected value
368 :			mov [edx], ax ; coeff[0] = ax
369 :			mov ebx, [esp + 8] ; reload saved ebx
370 :			add esp, byte 16 ; drop the four saved slots (esi, ebx, edi, ebp); all were reloaded above
371 :			psubw mm7, mm6 ;D8
372 :			movq [edx + 3 * 32 + 16], mm5 ;C9
373 :			movq [edx + 3 * 32 + 24], mm7 ;D9
374 :
375 :			xor eax, eax ; return 0
376 :			ret
377 :
378 :			ALIGN 16
379 :
			; quant == 1: same sequence as above, built from quant_intra1.
380 :			.q1loop:
381 :			quant_intra1 0
382 :			mov ebp, [esp + 16 + 16] ; dcscalar
383 :			movsx eax, word [byte ecx] ; DC
384 :
385 :			quant_intra1 1
386 :			mov edi, eax
387 :			sar edi, 31 ; edi = sign mask of DC (0 or -1)
388 :			shr ebp, byte 1 ; ebp = dcscalar /2
389 :
390 :			quant_intra1 2
391 :			sub eax, edi ; DC, plus 1 when DC < 0
392 :			xor ebp, edi ; dcscalar/2, or -(dcscalar/2) - 1 when DC < 0
393 :			mov edi, [esp + 16 + 16] ; dcscalar (table index for the imul in step 3)
394 :			lea eax, [byte eax + ebp] ; DC + sign(DC) * (dcscalar/2)
395 :			mov ebp, [byte esp] ; reload the ebp pushed by "quant_intra1 0"
396 :
397 :			quant_intra1 3
398 :			psubw mm5, mm4 ;C8
399 :			mov esi, [dword esp + 12] ; reload saved esi
400 :			mov edi, [esp + 4] ; reload saved edi
401 :			sar eax, 16 ; drop the 16 fraction bits of the reciprocal product
402 :			lea ebx, [byte eax + 1] ; workaround for eax < 0 (lea leaves the flags of sar intact)
403 :			cmovs eax, ebx ; conditionally move the corrected value
404 :			mov [edx], ax ; coeff[0] = ax
405 :			mov ebx, [esp + 8] ; reload saved ebx
406 :			add esp, byte 16 ; drop the four saved slots (esi, ebx, edi, ebp); all were reloaded above
407 :			psubw mm7, mm6 ;D8
408 :			movq [edx + 3 * 32 + 16], mm5 ;C9
409 :			movq [edx + 3 * 32 + 24], mm7 ;D9
410 :
411 :			xor eax, eax ; return 0
412 :			ret
413 :	edgomez	1540	.endfunc:
414 :	edgomez	1382
415 :
416 :
417 :
418 :			;-----------------------------------------------------------------------------
419 :			;
420 :			; uint32_t quant_h263_inter_3dne(int16_t * coeff,
421 :			; const int16_t const * data,
422 :			; const uint32_t quant,
423 :			; const uint16_t *mpeg_matrices);
424 :			;
425 :			;-----------------------------------------------------------------------------
426 :			;This is Athlon-optimized code (ca 90 clk per call)
427 :			;Optimized by Jaan, 30 Nov 2002
428 :
429 :
430 :			%macro quantinter 1
			; quantinter <n>, n = 0..4
			; One 24-byte step (12 coefficients) of the inter quantizer for quant > 1.
			;
			; Expects: ecx = data (input), edx = coeff (output), eax -> 8 zero bytes,
			;          ebx -> mmx_sub entry (quant/2), mm7 = mmx_div entry,
			;          mm5 = running sum of the quantized magnitudes (four words).
			; Three 8-byte groups per step: A (+0), B (+8), C (+16).  Stage tags:
			;   1 load x     2 load zero    3 0 - x      4 max(x, -x) = abs
			;   5 subtract quant/2 with unsigned saturation      6 sign mask
			;   7 multiply-high by the reciprocal of 2*quant     8 add to the sum
			;   9, 10 xor/sub with the sign mask (sign back)     11 store
			; Software pipelining: group A of step n+1 is loaded at the end of step n;
			; groups B and C of step n are finished and stored during step n+1.
			; In step 0 the B group being finished is the one at byte offset 120,
			; which the caller starts before the first step.  In step 4 the A1 load
			; reads offset 120 again; that value is not used afterwards.
431 :			movq mm1, [eax] ;A2
432 :			psraw mm3, 15 ;B6
433 :			%if (%1)
434 :			psubw mm2, mm6 ;C10
435 :			%endif
436 :			psubw mm1, mm0 ;A3
437 :			pmulhw mm4, mm7 ;B7
438 :			movq mm6, [ecx + %1*24+16] ;C1
439 :			pmaxsw mm1, mm0 ;A4
440 :			paddw mm5, mm4 ;B8
441 :			%if (%1)
442 :			movq [edx + %1*24+16-24], mm2 ;C11 (group C of the previous step)
443 :			%endif
444 :			psubusw mm1, [ebx] ;A5 mm1 -= sub (unsigned, dont go < 0)
445 :			pxor mm4, mm3 ;B9
446 :			movq mm2, [eax] ;C2
447 :			psraw mm0, 15 ;A6
448 :			psubw mm4, mm3 ;B10
449 :			psubw mm2, mm6 ;C3
450 :			pmulhw mm1, mm7 ;A7 mm1 = (mm1 * reciprocal of 2Q) >> 16
451 :			movq mm3, [ecx + %1*24+8] ;B1
452 :			pmaxsw mm2, mm6 ;C4
453 :			paddw mm5, mm1 ;A8 sum += mm1
454 :			%if (%1)
455 :			movq [edx + %1*24+8-24], mm4 ;B11 (group B of the previous step)
456 :			%else
457 :			movq [edx + 120], mm4 ;B11 (the group at offset 120 started by the caller)
458 :			%endif
459 :			psubusw mm2, [ebx] ;C5
460 :			pxor mm1, mm0 ;A9 apply sign mask
461 :			movq mm4, [eax] ;B2
462 :			psraw mm6, 15 ;C6
463 :			psubw mm1, mm0 ;A10 undisplace
464 :			psubw mm4, mm3 ;B3
465 :			pmulhw mm2, mm7 ;C7
466 :			movq mm0, [ecx + %1*24+24] ;A1 mm0 = first group of the next step
467 :			pmaxsw mm4, mm3 ;B4
468 :			paddw mm5, mm2 ;C8
469 :			movq [byte edx + %1*24], mm1 ;A11
470 :			psubusw mm4, [ebx] ;B5
471 :			pxor mm2, mm6 ;C9
472 :			%endmacro
473 :
474 :			%macro quantinter1 1
			; quantinter1 <n>, n = 0..7
			; One 16-byte step (8 coefficients) of the inter quantizer for
			; quant == 1.  Unlike quantinter this step is self-contained: it loads,
			; quantizes and stores both of its 8-byte groups.
			;
			; Expects: ecx = data (input), edx = coeff (output), eax -> 8 zero bytes,
			;          mm6 = value to subtract (loaded from the mmx_sub entry by the
			;          caller), mm5 = running sum of the quantized magnitudes.
475 :			movq mm0, [byte ecx + %1*16] ;mm0 = first group
476 :			movq mm3, [ecx + %1*16+8] ;mm3 = second group
477 :			movq mm1, [eax] ; zero
478 :			movq mm4, [eax] ; zero
479 :			psubw mm1, mm0 ; 0 - x
480 :			psubw mm4, mm3
481 :			pmaxsw mm1, mm0 ; max(x, -x) = abs
482 :			pmaxsw mm4, mm3
483 :			psubusw mm1, mm6 ; mm1 -= sub (unsigned, dont go < 0)
484 :			psubusw mm4, mm6 ;
485 :			psraw mm0, 15 ; sign mask
486 :			psraw mm3, 15
487 :			psrlw mm1, 1 ; divide by 2Q, which is 2 for quant 1
488 :			psrlw mm4, 1 ;
489 :			paddw mm5, mm1 ; sum += mm1
490 :			pxor mm1, mm0 ; apply sign mask
491 :			paddw mm5, mm4 ; sum += mm4
492 :			pxor mm4, mm3 ;
493 :			psubw mm1, mm0 ; undisplace
494 :			psubw mm4, mm3
495 :			cmp esp, esp ; result unused (presumably filler)
496 :			movq [byte edx + %1*16], mm1
497 :			movq [edx + %1*16+8], mm4
498 :			%endmacro
499 :
500 :			ALIGN 16
			; quant_h263_inter_3dne(coeff, data, quant, mpeg_matrices)
			;
			; Stack arguments on entry:
			;   [esp +  4] coeff  -> edx, output block (128 bytes)
			;   [esp +  8] data   -> ecx, input block
			;   [esp + 12] quant  -> eax, selects the mmx_sub / mmx_div entries;
			;                        quant == 1 takes the shift-based .q1loop path
			;   mpeg_matrices is not read.
			; Returns in eax the sum of the magnitudes of all quantized coefficients
			; (accumulated as words in mm5, then folded with pmaddwd by plus_one).
			; Only ebx is saved and restored.
501 :			cglobal quant_h263_inter_3dne
502 :			quant_h263_inter_3dne:
503 :			mov edx, [esp + 4] ; coeff
504 :			mov ecx, [esp + 8] ; data
505 :			mov eax, [esp + 12] ; quant
506 :			push ebx
507 :
508 :			pxor mm5, mm5 ; sum
509 :			nop
510 :			lea ebx,[mmx_sub + eax * 8 - 8] ; sub
511 :			movq mm7, [mmx_div + eax * 8 - 8] ; divider
512 :
513 :			cmp al, 1
514 :			lea eax, [mmzero] ; eax -> 8 zero bytes (lea keeps the flags of the cmp)
515 :			jz near .q1loop
516 :			cmp esp, esp ; result unused (presumably filler)
517 :			ALIGN 8
			; Start the group at byte offset 120 ("B" stages 1-5); step 0 of
			; quantinter finishes it.  Also preload group A of step 0.
518 :			movq mm3, [ecx + 120] ;B1
519 :			pxor mm4, mm4 ;B2
520 :			psubw mm4, mm3 ;B3
521 :			movq mm0, [ecx] ;A1 mm0 = [1st]
522 :			pmaxsw mm4, mm3 ;B4
523 :			psubusw mm4, [ebx] ;B5
524 :
525 :			quantinter 0
526 :			quantinter 1
527 :			quantinter 2
528 :			quantinter 3
529 :			quantinter 4
530 :
			; Drain the pipeline: finish groups B and C of step 4.
531 :			psraw mm3, 15 ;B6
532 :			psubw mm2, mm6 ;C10
533 :			pmulhw mm4, mm7 ;B7
534 :			paddw mm5, mm4 ;B8
535 :			pxor mm4, mm3 ;B9
536 :			psubw mm4, mm3 ;B10
537 :			movq [edx + 4*24+16], mm2 ;C11
538 :			pop ebx
539 :			movq [edx + 4*24+8], mm4 ;B11
			; Horizontal add of the four word sums into one dword.
540 :			pmaddwd mm5, [plus_one]
541 :			movq mm0, mm5
542 :			punpckhdq mm5, mm5
543 :			paddd mm0, mm5
544 :			movd eax, mm0 ; return sum
545 :
546 :			ret
547 :
548 :			ALIGN 16
			; quant == 1: eight self-contained 16-byte steps.
549 :			.q1loop:
550 :			movq mm6, [byte ebx] ; mm6 = mmx_sub entry for quant 1
551 :
552 :			quantinter1 0
553 :			quantinter1 1
554 :			quantinter1 2
555 :			quantinter1 3
556 :			quantinter1 4
557 :			quantinter1 5
558 :			quantinter1 6
559 :			quantinter1 7
560 :
			; Horizontal add of the four word sums into one dword.
561 :			pmaddwd mm5, [plus_one]
562 :			movq mm0, mm5
563 :			psrlq mm5, 32
564 :			paddd mm0, mm5
565 :			movd eax, mm0 ; return sum
566 :
567 :			pop ebx
568 :
569 :			ret
570 :	edgomez	1540	.endfunc:
571 :	edgomez	1382
572 :			;-----------------------------------------------------------------------------
573 :			;
574 :			; uint32_t dequant_h263_intra_3dne(int16_t *data,
575 :			; const int16_t const *coeff,
576 :			; const uint32_t quant,
577 :			; const uint32_t dcscalar,
578 :			; const uint16_t *mpeg_matrices);
579 :			;
580 :			;-----------------------------------------------------------------------------
581 :
582 :			; this is the same as dequant_inter_3dne, except that we're
583 :			; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
584 :
585 :			;This is Athlon-optimized code (ca 106 clk per call)
586 :
587 :			%macro dequant 1
			; dequant <n>, n = 0..4
			; One 24-byte step (12 coefficients) of the H.263 dequantizer, shared by
			; the intra and inter entry points.  Per coefficient c:
			;   c == 0 :  0
			;   c >  0 :  min(2Q*abs(c) + add, 2047)
			;   c <  0 : -min(2Q*abs(c) + add, 2048)
			; where 2Q is the mmx_mul entry and add the mmx_add entry.
			;
			; Expects: ecx = coeff (input), edx = data (output), esi -> 8 zero bytes,
			;          edi -> mmx_mul entry, eax -> mmx_add entry, ebx -> mmx_2047.
			; Three 8-byte groups per step: A (+0), B (+8), C (+16).  Stage tags:
			;   1 load zero   2 load c    3 0 - c     4 max(c, -c) = abs
			;   5, 6 mask of c == 0       7 sign mask            8 multiply by 2Q
			;   9 add offset, zeroed where c == 0     10 offset + sign mask
			;   11 add offset to product  12 clamp to 2047 (pminsw)
			;   13 xor with sign mask (sign back)     14 store
			; Software pipelining: group A is completed inside its own step; groups B
			; and C of step n are finished during step n+1 (those of step 4 by the
			; caller).  In step 0 the B group being finished is the one at byte
			; offset 120, started by the caller.
			; "mov ebp, ebp", "mov esp, esp" and "lea ebp, [byte ebp]" copy a register
			; onto itself (presumably filler for instruction placement).  Nothing in
			; this macro writes EFLAGS; dequant_h263_intra_3dne relies on that by
			; placing a cmp before and the matching cmov after an invocation.
588 :			movq mm1, [ecx+%1*24] ; c = coeff[i] ;A2
589 :			psubw mm0, mm1 ;-c ;A3 (1st dep)
590 :			%if (%1)
591 :			paddw mm4, mm6 ;C11 mm6 free (4th+)
592 :			%endif
593 :			pmaxsw mm0, mm1 ;|c| ;A4 (2nd)
594 :			%if (%1)
595 :			mov ebp, ebp
596 :			pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) later
597 :			%endif
598 :			movq mm6, [esi] ;0 ;A5 mm6 in use
599 :			pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
600 :			%if (%1)
601 :			pxor mm5, mm4 ;C13 (6th+) 1later
602 :			%endif
603 :			movq mm4, [esi] ;C1 ;0
604 :			mov esp, esp
605 :			pcmpeqw mm6, [ecx+%1*24] ;A6 (c ==0) ? -1 : 0 (1st)
606 :			ALIGN 4
607 :			psraw mm1, 15 ; sign(c) ;A7 (2nd)
608 :			%if (%1)
609 :			movq [edx+%1*24+16-24], mm5 ; C14 (7th) 2later
610 :			%endif
611 :			paddw mm7, mm3 ;B10 offset +negate back (3rd)
612 :			pmullw mm0, [edi] ;*= 2Q ;A8 (3rd+)
613 :			paddw mm2, mm7 ;B11 mm7 free (4th+)
614 :			lea ebp, [byte ebp]
615 :			movq mm5, [ecx+%1*24+16] ;C2 ; c = coeff[i]
616 :			psubw mm4, mm5 ;-c ;C3 (1st dep)
617 :			pandn mm6, [eax] ;A9 offset = isZero ? 0 : quant_add (2nd)
618 :			pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
619 :			pxor mm3, mm2 ;B13 (6th+)
620 :			movq mm2, [byte esi] ;B1 ;0
621 :			%if (%1)
622 :			movq [edx+%1*24+8-24], mm3 ;B14 (7th)
623 :			%else
624 :			movq [edx+120], mm3 ;B14 for the group at offset 120 started by the caller
625 :			%endif
626 :			pmaxsw mm4, mm5 ;|c| ;C4 (2nd)
627 :			paddw mm6, mm1 ;A10 offset +negate back (3rd)
628 :			movq mm3, [ecx+%1*24 + 8] ;B2 ; c = coeff[i]
629 :			psubw mm2, mm3 ;-c ;B3 (1st dep)
630 :			paddw mm0, mm6 ;A11 mm6 free (4th+)
631 :			movq mm6, [byte esi] ;0 ;C5 mm6 in use
632 :			pcmpeqw mm6, [ecx+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st)
633 :			pminsw mm0, [ebx] ;A12 saturates to +2047 (5th+)
634 :			pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
635 :			pxor mm1, mm0 ;A13 (6th+)
636 :			pmullw mm4, [edi] ;*= 2Q ;C8 (3rd+)
637 :			psraw mm5, 15 ; sign(c) ;C7 (2nd)
638 :			movq mm7, [byte esi] ;0 ;B5 mm7 in use
639 :			pcmpeqw mm7, [ecx+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st)
640 :			%if (%1 < 4)
641 :			movq mm0, [byte esi] ;A1 ;0
642 :			%endif
643 :			pandn mm6, [byte eax] ;C9 offset = isZero ? 0 : quant_add (2nd)
644 :			psraw mm3, 15 ;sign(c) ;B7 (2nd)
645 :			movq [byte edx+%1*24], mm1 ;A14 (7th)
646 :			paddw mm6, mm5 ;C10 offset +negate back (3rd)
647 :			pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
648 :			mov esp, esp
649 :			%endmacro
650 :
651 :
652 :			ALIGN 16
			; dequant_h263_intra_3dne(data, coeff, quant, dcscalar, mpeg_matrices)
			;
			; Stack arguments on entry:
			;   [esp +  4] data      -> edx, output block (128 bytes)
			;   [esp +  8] coeff     -> ecx, input block
			;   [esp + 12] quant     -> selects the mmx_mul / mmx_add entries
			;   [esp + 16] dcscalar     multiplied with coeff[0]
			;   mpeg_matrices is not read.
			; Returns 0 in eax.
			;
			; AC terms are produced by five invocations of the "dequant" macro plus
			; the hand-written prologue/epilogue that starts and drains its pipeline.
			; The DC term is handled in ebp between the macro invocations:
			;   data[0] = clamp(coeff[0] * dcscalar, -2048, 2047)
			; and is stored last so that it overwrites the AC-style result for
			; coefficient 0.  Each cmp and its cmov are separated by a macro
			; invocation; this works because the macro does not write EFLAGS.
			;
			; edi, ebx, ebp and esi are pushed on entry and reloaded with mov before
			; the stack pointer is adjusted by 16.
653 :			cglobal dequant_h263_intra_3dne
654 :			dequant_h263_intra_3dne:
655 :			mov ecx, [esp+ 8] ; coeff
656 :			mov eax, [esp+12] ; quant
657 :			pxor mm0, mm0 ;A1 for step 0
658 :			pxor mm2, mm2 ;B1 for the group at offset 120
659 :			push edi
660 :			push ebx
661 :			lea edi, [mmx_mul + eax*8 - 8] ; edi -> 2*quant (8-byte table entries)
662 :			push ebp
663 :			mov ebx, mmx_2047 ; ebx -> upper clamp
664 :			movsx ebp, word [ecx] ; ebp = coeff[0] (DC)
665 :			lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
666 :			push esi
667 :			mov esi, mmzero ; esi -> 8 zero bytes
668 :			pxor mm7, mm7
669 :			movq mm3, [ecx+120] ;B2 ; c = coeff[i]
670 :			pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
671 :
672 :			imul ebp, [esp+16+16] ; DC * dcscalar (four pushes => +16)
673 :			psubw mm2, mm3 ;-c ;B3 (1st dep)
674 :			pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
675 :			pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
676 :			psraw mm3, 15 ; sign(c) ;B7 (2nd)
677 :			mov edx, [esp+ 4+16] ; data
678 :
679 :			ALIGN 8
680 :			dequant 0
681 :
682 :			cmp ebp, -2048 ; lower clamp test, consumed by cmovl after the next step
683 :			mov esp, esp
684 :
685 :			dequant 1
686 :
687 :			cmovl ebp, [int_2048]
688 :			nop
689 :
690 :			dequant 2
691 :
692 :			cmp ebp, 2047 ; upper clamp test, consumed by cmovg after the next step
693 :			mov esp, esp
694 :
695 :			dequant 3
696 :
697 :			cmovg ebp, [int2047]
698 :			nop
699 :
700 :			dequant 4
701 :
			; Drain the pipeline: finish groups B and C of step 4, restore registers.
702 :			paddw mm4, mm6 ;C11 mm6 free (4th+)
703 :			pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
704 :			pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
705 :			mov eax, ebp ; eax = clamped DC
706 :			mov esi, [esp] ; reload saved esi
707 :			mov ebp, [esp+4] ; reload saved ebp
708 :			pxor mm5, mm4 ;C13 (6th+)
709 :			paddw mm7, mm3 ;B10 offset +negate back (3rd)
710 :			movq [edx+4*24+16], mm5 ;C14 (7th)
711 :			paddw mm2, mm7 ;B11 mm7 free (4th+)
712 :			pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
713 :			mov ebx, [esp+8] ; reload saved ebx
714 :			mov edi, [esp+12] ; reload saved edi
715 :			add esp, byte 16 ; drop the four saved slots
716 :			pxor mm3, mm2 ;B13 (6th+)
717 :			movq [edx+4*24+8], mm3 ;B14 (7th)
718 :			mov [edx], ax ; data[0] = DC
719 :
720 :			xor eax, eax ; return 0
721 :			ret
722 :	edgomez	1540	.endfunc:
723 :	edgomez	1382
724 :			;-----------------------------------------------------------------------------
725 :			;
726 :			; uint32_t dequant_h263_inter_3dne(int16_t * data,
727 :			; const int16_t * const coeff,
728 :			; const uint32_t quant,
729 :			; const uint16_t *mpeg_matrices);
730 :			;
731 :			;-----------------------------------------------------------------------------
732 :
733 :			; this is the same as dequant_inter_3dne,
734 :			; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
735 :			; This is Athlon-optimized code (ca 100 clk per call)
736 :
737 :			ALIGN 16
			; dequant_h263_inter_3dne(data, coeff, quant, mpeg_matrices)
			;
			; Stack arguments on entry:
			;   [esp +  4] data   -> edx, output block (128 bytes)
			;   [esp +  8] coeff  -> ecx, input block
			;   [esp + 12] quant  -> selects the mmx_mul / mmx_add entries
			;   mpeg_matrices is not read.
			; Returns 0 in eax.
			;
			; All 64 coefficients go through the "dequant" macro (five 24-byte steps
			; plus the group at byte offset 120, which is started here and finished
			; by step 0).  There is no separate DC handling.
			;
			; edi, ebx and esi are pushed on entry and reloaded with mov before the
			; stack pointer is adjusted by 12.
738 :			cglobal dequant_h263_inter_3dne
739 :			dequant_h263_inter_3dne:
740 :			mov ecx, [esp+ 8] ; coeff
741 :			mov eax, [esp+12] ; quant
742 :			pxor mm0, mm0 ;A1 for step 0
743 :			pxor mm2, mm2 ;B1 for the group at offset 120
744 :			push edi
745 :			push ebx
746 :			push esi
747 :			lea edi, [mmx_mul + eax*8 - 8] ; edi -> 2*quant (8-byte table entries)
748 :			mov ebx, mmx_2047 ; ebx -> upper clamp
749 :			pxor mm7, mm7
750 :			movq mm3, [ecx+120] ;B2 ; c = coeff[i]
751 :			pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
752 :			lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
753 :			psubw mm2, mm3 ;-c ;B3 (1st dep)
754 :			mov esi, mmzero ; esi -> 8 zero bytes
755 :			pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
756 :			pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
757 :			psraw mm3, 15 ; sign(c) ;B7 (2nd)
758 :			mov edx, [dword esp+ 4+12] ; data (three pushes => +12)
759 :
760 :			ALIGN 8
761 :
762 :			dequant 0
763 :			dequant 1
764 :			dequant 2
765 :			dequant 3
766 :			dequant 4
767 :
			; Drain the pipeline: finish groups B and C of step 4, restore registers.
768 :			paddw mm4, mm6 ;C11 mm6 free (4th+)
769 :			pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
770 :			pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
771 :			mov esi, [esp] ; reload saved esi
772 :			pxor mm5, mm4 ;C13 (6th+)
773 :			paddw mm7, mm3 ;B10 offset +negate back (3rd)
774 :			movq [edx+4*24+16], mm5 ;C14 (7th)
775 :			paddw mm2, mm7 ;B11 mm7 free (4th+)
776 :			pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
777 :			mov ebx, [esp+4] ; reload saved ebx
778 :			mov edi, [esp+8] ; reload saved edi
779 :			add esp, byte 12 ; drop the three saved slots
780 :			pxor mm3, mm2 ;B13 (6th+)
781 :			movq [edx+4*24+8], mm3 ;B14 (7th)
782 :
783 :			xor eax, eax ; return 0
784 :			ret
785 :	edgomez	1540	.endfunc:
786 :

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4