Annotation of /trunk/xvidcore/src/quant/x86_asm/quantize_h263_3dne.asm

Revision 1535 - (view) (download)

1 :	edgomez	1382	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * - 3dne Quantization/Dequantization -
5 :			; *
6 :			; * Copyright(C) 2002-2003 Jaan Kalda
7 :			; *
8 :			; * This program is free software ; you can redistribute it and/or modify
9 :			; * it under the terms of the GNU General Public License as published by
10 :			; * the Free Software Foundation ; either version 2 of the License, or
11 :			; * (at your option) any later version.
12 :			; *
13 :			; * This program is distributed in the hope that it will be useful,
14 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
15 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 :			; * GNU General Public License for more details.
17 :			; *
18 :			; * You should have received a copy of the GNU General Public License
19 :			; * along with this program ; if not, write to the Free Software
20 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 :			; *
22 :	edgomez	1535	; * $Id: quantize_h263_3dne.asm,v 1.4 2004-08-22 11:46:10 edgomez Exp $
23 :	edgomez	1382	; *
24 :			; *************************************************************************/
25 :			;
26 :			; these 3dne functions are compatible with iSSE, but are optimized specifically for
27 :			; K7 pipelines
28 :
29 :			; enable dequant saturation to [-2048,2047]; marked as for test purposes only.
30 :			%define SATURATE
31 :			; NOTE(review): no %ifdef SATURATE is visible in this listing -- confirm the define is still consumed.
32 :			BITS 32 ; 32-bit code: every routine below reads its arguments from [esp + n]
33 :
34 :			%macro cglobal 1 ; cglobal <name>: export <name> as a global symbol
35 :			%ifdef PREFIX ; PREFIX builds export the symbol with a leading underscore and alias <name> to it
36 :	edgomez	1535	%ifdef MARK_FUNCS ; MARK_FUNCS adds the ":function" type to the global directive
37 :			global _%1:function ; the ":function" suffix belongs to this directive only, not to the alias below
38 :			%define %1 _%1
39 :			%else
40 :			global _%1
41 :			%define %1 _%1
42 :			%endif
43 :	edgomez	1382	%else
44 :	edgomez	1535	%ifdef MARK_FUNCS
45 :			global %1:function
46 :			%else
47 :			global %1
48 :			%endif
49 :	edgomez	1382	%endif
50 :			%endmacro
51 :
52 :			;=============================================================================
53 :			; Local data
54 :			;=============================================================================
55 :			; Read-only lookup tables; the FORMAT_COFF build omits the align=16 section attribute.
56 :			%ifdef FORMAT_COFF
57 :	edgomez	1519	SECTION .rodata
58 :	edgomez	1382	%else
59 :	edgomez	1519	SECTION .rodata align=16
60 :	edgomez	1382	%endif
61 :			; int_div[d] = 65536/d + 1 for d = 1..255 (entry 0 is 0); read as [int_div + 4*edi] by the intra quantizer macros.
62 :			align 4
63 :			int_div:
64 :			dd 0
65 :			%assign i 1
66 :			%rep 255
67 :			dd (1 << 16) / (i) + 1
68 :			%assign i i+1
69 :			%endrep
70 :			; plus_one: words of 1, the pmaddwd operand used by quant_h263_inter_3dne to fold its word sums into dwords.
71 :			ALIGN 16
72 :			plus_one:
73 :			times 8 dw 1
74 :
75 :			;-----------------------------------------------------------------------------
76 :			; subtract by Q/2 table
77 :			;-----------------------------------------------------------------------------
78 :			; One row of four identical words per quantizer Q = 1..31; the code addresses a row as mmx_sub + Q*8 - 8.
79 :			ALIGN 16
80 :			mmx_sub:
81 :			%assign i 1
82 :			%rep 31
83 :			times 4 dw i / 2
84 :			%assign i i+1
85 :			%endrep
86 :
87 :
88 :			;-----------------------------------------------------------------------------
89 :			;
90 :			; divide by 2Q table
91 :			;
92 :			; use a shift of 16 to take full advantage of _pmulhw_
93 :			; for q=1, _pmulhw_ will overflow so it is treated separately
94 :			; (3dnow2 provides _pmulhuw_ which won't cause overflow)
95 :			;
96 :			;-----------------------------------------------------------------------------
97 :			; Row Q holds 65536/(2Q) + 1 in each word; for Q = 1 that is 32769, which does not fit a signed word.
98 :			ALIGN 16
99 :			mmx_div:
100 :			%assign i 1
101 :			%rep 31
102 :			times 4 dw (1 << 16) / (i * 2) + 1
103 :			%assign i i+1
104 :			%endrep
105 :
106 :			;-----------------------------------------------------------------------------
107 :			; add by (odd(Q) ? Q : Q - 1) table
108 :			;-----------------------------------------------------------------------------
109 :			; One row of four identical words per quantizer Q = 1..31; the code addresses a row as mmx_add + Q*8 - 8.
110 :			ALIGN 16
111 :			mmx_add:
112 :			%assign i 1
113 :			%rep 31
114 :			%if i % 2 != 0
115 :			times 4 dw i
116 :			%else
117 :			times 4 dw i - 1
118 :			%endif
119 :			%assign i i+1
120 :			%endrep
121 :
122 :			;-----------------------------------------------------------------------------
123 :			; multiply by 2Q table
124 :			;-----------------------------------------------------------------------------
125 :			; One row of four identical words per quantizer Q = 1..31; the code addresses a row as mmx_mul + Q*8 - 8.
126 :			ALIGN 16
127 :			mmx_mul:
128 :			%assign i 1
129 :			%rep 31
130 :			times 4 dw i * 2
131 :			%assign i i+1
132 :			%endrep
133 :
134 :			;-----------------------------------------------------------------------------
135 :			; saturation limits
136 :			;-----------------------------------------------------------------------------
137 :			; NOTE(review): the two "minus" constants below are not referenced anywhere in this listing -- confirm they are still needed.
138 :			ALIGN 8
139 :			mmx_32768_minus_2048:
140 :			times 4 dw (32768-2048)
141 :			mmx_32767_minus_2047:
142 :			times 4 dw (32767-2047)
143 :			; mmx_2047: four words of 2047, the pminsw operand (via ebx) in the dequant code.
144 :			ALIGN 16
145 :			mmx_2047:
146 :			times 4 dw 2047
147 :			; mmzero: 8 zero bytes used as a memory source of 0; int2047 / int_2048: cmov operands clamping the intra DC value.
148 :			ALIGN 8
149 :			mmzero:
150 :			dd 0, 0
151 :			int2047:
152 :			dd 2047
153 :			int_2048:
154 :			dd -2048
155 :
156 :			;=============================================================================
157 :			; Code
158 :			;=============================================================================
159 :
160 :			SECTION .text
161 :
162 :
163 :			;-----------------------------------------------------------------------------
164 :			;
165 :			; uint32_t quant_h263_intra_3dne(int16_t * coeff,
166 :			; const int16_t const * data,
167 :			; const uint32_t quant,
168 :			; const uint32_t dcscalar,
169 :			; const uint16_t *mpeg_matrices);
170 :			;
171 :			;-----------------------------------------------------------------------------
172 :			;This is Athlon-optimized code (ca 70 clk per call)
173 :
174 :			%macro quant_intra1 1 ; quant == 1 variant; %1 = index (0..3) of the 32-byte group (lanes A,B,C,D of 8 bytes each)
175 :			psubw mm1, mm0 ;A3 mm1 = 0 - x (mm1 holds zero on entry)
176 :			psubw mm3, mm2 ;B3
177 :			%if (%1)
178 :			psubw mm5, mm4 ;C8 finish the sign fix-up of lanes C/D of the previous group
179 :			psubw mm7, mm6 ;D8
180 :			%endif
181 :
182 :			ALIGN 8
183 :			db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16] ;C1 (hand-encoded: SIB byte + disp8)
184 :			pmaxsw mm1, mm0 ;A4 mm1 = max(-x, x) = |x|
185 :			db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24] ;D1 (hand-encoded: SIB byte + disp8)
186 :			pmaxsw mm3, mm2 ;B4
187 :
188 :
189 :			psraw mm0, 15 ;A5 mm0 = sign mask of x (0 or -1 per word)
190 :			psraw mm2, 15 ;B5
191 :			%if (%1)
192 :			movq [edx + %1 * 32 + 16-32], mm5 ;C9 store lanes C/D of the previous group
193 :			movq [edx + %1 * 32 + 24-32], mm7 ;D9
194 :			%endif
195 :
196 :			psrlw mm1, 1 ;A6 |x| >> 1 (this variant is used when quant == 1)
197 :			psrlw mm3, 1 ;B6
198 :			movq mm5, [ebx] ;C2 mm5 = [ebx] (the caller points ebx at mmzero)
199 :			movq mm7, [ebx] ;D2
200 :
201 :			pxor mm1, mm0 ;A7 together with A8: negate the result where x was negative
202 :			pxor mm3, mm2 ;B7
203 :
204 :			psubw mm5, mm4 ;C3
205 :			psubw mm7, mm6 ;D3
206 :			psubw mm1, mm0 ;A8
207 :			psubw mm3, mm2 ;B8
208 :
209 :			%if (%1 == 0)
210 :			push ebp ; first group only: save ebp (the caller accounts for this push in its stack offsets)
211 :			movq mm0, [ecx + %1 * 32 +32] ; A1 (lane A of the next group)
212 :			%elif (%1 < 3)
213 :			db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 (lane A of the next group)
214 :			%endif
215 :			pmaxsw mm5, mm4 ;C4
216 :			%if (%1 < 3)
217 :			db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 (lane B of the next group)
218 :			%else
219 :			cmp esp, esp ; NOTE(review): takes the slot of the B1 load in the last group; presumably filler -- it only sets flags
220 :			%endif
221 :			pmaxsw mm7, mm6 ;D4
222 :
223 :			psraw mm4, 15 ;C5
224 :			psraw mm6, 15 ;D5
225 :			movq [byte edx + %1 * 32], mm1 ;A9 store lanes A/B of this group
226 :			movq [edx + %1 * 32+8], mm3 ;B9
227 :
228 :
229 :			psrlw mm5, 1 ;C6
230 :			psrlw mm7, 1 ;D6
231 :			%if (%1 < 3)
232 :			movq mm1, [ebx] ;A2 reload zero for A3/B3 of the next group
233 :			movq mm3, [ebx] ;B2
234 :			%endif
235 :			%if (%1 == 3)
236 :			imul eax, [int_div+4*edi] ; last group only: eax *= int_div[edi] (eax and edi are prepared by the caller)
237 :			%endif
238 :			pxor mm5, mm4 ;C7
239 :			pxor mm7, mm6 ;D7
240 :			%endm
241 :
242 :
243 :			%macro quant_intra 1 ; quant != 1 variant; %1 = index (0..3) of the 32-byte group (lanes A,B,C,D of 8 bytes each)
244 :			; Rules for athlon:
245 :			; 1) schedule latencies
246 :			; 2) add/mul and load/store in 2:1 proportion
247 :			; 3) avoid splitting >3byte instructions over 8byte boundaries
248 :
249 :			psubw mm1, mm0 ;A3 mm1 = 0 - x (mm1 holds zero on entry)
250 :			psubw mm3, mm2 ;B3
251 :			%if (%1)
252 :			psubw mm5, mm4 ;C8 finish the sign fix-up of lanes C/D of the previous group
253 :			psubw mm7, mm6 ;D8
254 :			%endif
255 :
256 :			ALIGN 8
257 :			db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16] ;C1 (hand-encoded: SIB byte + disp8)
258 :			pmaxsw mm1, mm0 ;A4 mm1 = max(-x, x) = |x|
259 :			db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24] ;D1 (hand-encoded: SIB byte + disp8)
260 :			pmaxsw mm3, mm2 ;B4
261 :
262 :
263 :			psraw mm0, 15 ;A5 mm0 = sign mask of x (0 or -1 per word)
264 :			psraw mm2, 15 ;B5
265 :			%if (%1)
266 :			movq [edx + %1 * 32 + 16-32], mm5 ;C9 store lanes C/D of the previous group
267 :			movq [edx + %1 * 32 + 24-32], mm7 ;D9
268 :			%endif
269 :
270 :			pmulhw mm1, [esi] ;A6 high word of |x| * [esi] (the caller points esi at the mmx_div row for quant)
271 :			pmulhw mm3, [esi] ;B6
272 :			movq mm5, [ebx] ;C2 mm5 = [ebx] (the caller points ebx at mmzero)
273 :			movq mm7, [ebx] ;D2
274 :
275 :			nop ; filler
276 :			nop ; filler
277 :			pxor mm1, mm0 ;A7 together with A8: negate the result where x was negative
278 :			pxor mm3, mm2 ;B7
279 :
280 :			psubw mm5, mm4 ;C3
281 :			psubw mm7, mm6 ;D3
282 :			psubw mm1, mm0 ;A8
283 :			psubw mm3, mm2 ;B8
284 :
285 :
286 :			%if (%1 < 3)
287 :			db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 (lane A of the next group)
288 :			%endif
289 :			pmaxsw mm5, mm4 ;C4
290 :			%if (%1 < 3)
291 :			db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 (lane B of the next group)
292 :			%else
293 :			cmp esp, esp ; NOTE(review): takes the slot of the B1 load in the last group; presumably filler -- it only sets flags
294 :			%endif
295 :			pmaxsw mm7,mm6 ;D4
296 :
297 :			psraw mm4, 15 ;C5
298 :			psraw mm6, 15 ;D5
299 :			movq [byte edx + %1 * 32], mm1 ;A9 store lanes A/B of this group
300 :			movq [edx + %1 * 32+8], mm3 ;B9
301 :
302 :
303 :			pmulhw mm5, [esi] ;C6
304 :			pmulhw mm7, [esi] ;D6
305 :			%if (%1 < 3)
306 :			movq mm1, [ebx] ;A2 reload zero for A3/B3 of the next group
307 :			movq mm3, [ebx] ;B2
308 :			%endif
309 :			%if (%1 == 0)
310 :			push ebp ; first group only: save ebp (the caller accounts for this push in its stack offsets)
311 :			%elif (%1 < 3)
312 :			nop ; filler
313 :			%endif
314 :			nop ; filler
315 :			%if (%1 == 3)
316 :			imul eax, [int_div+4*edi] ; last group only: eax *= int_div[edi] (eax and edi are prepared by the caller)
317 :			%endif
318 :			pxor mm5, mm4 ;C7
319 :			pxor mm7, mm6 ;D7
320 :			%endmacro
321 :
322 :
323 :			ALIGN 16
324 :			cglobal quant_h263_intra_3dne
325 :			quant_h263_intra_3dne:
326 :			; Arguments on entry: [esp+4] coeff, [esp+8] data, [esp+12] quant, [esp+16] dcscalar; eax is 0 on return.
327 :			mov eax, [esp + 12] ; quant
328 :			mov ecx, [esp + 8] ; data
329 :			mov edx, [esp + 4] ; coeff
330 :			cmp al, 1 ; quant == 1? (nothing touches the flags until the jz below)
331 :			pxor mm1, mm1
332 :			pxor mm3, mm3
333 :			movq mm0, [ecx] ; mm0 = [1st]
334 :			movq mm2, [ecx + 8]
335 :			push esi
336 :			lea esi, [mmx_div + eax*8 - 8] ; esi -> mmx_div row for this quant
337 :
338 :			push ebx
339 :			mov ebx, mmzero ; ebx -> 8 zero bytes, reloaded by the macros as their zero operand
340 :			push edi
341 :			jz near .q1loop ; quant == 1 takes the shift-based path
342 :
343 :			quant_intra 0
344 :			mov ebp, [esp + 16 + 16] ; dcscalar
345 :			; NB -- there are 3 pushes in the function preamble and one more
346 :			; in "quant_intra 0", thus an added offset of 16 bytes
347 :			movsx eax, word [byte ecx] ; DC
348 :
349 :			quant_intra 1
350 :			mov edi, eax
351 :			sar edi, 31 ; sign(DC): 0 or -1
352 :			shr ebp, byte 1 ; ebp = dcscalar/2
353 :
354 :			quant_intra 2
355 :			sub eax, edi ; DC, plus 1 when DC < 0
356 :			xor ebp, edi ; sign(DC)*dcscalar/2, minus 1 when DC < 0
357 :			mov edi, [esp + 16 + 16] ; dcscalar
358 :			lea eax, [byte eax + ebp] ; DC + sign(DC)*dcscalar/2 (the +1 and -1 cancel)
359 :			mov ebp, [byte esp] ; reload the ebp saved by "quant_intra 0"
360 :
361 :			quant_intra 3
362 :			psubw mm5, mm4 ;C8
363 :			mov esi, [esp + 12] ; reload the saved esi
364 :			mov edi, [esp + 4] ; reload the saved edi
365 :			sar eax, 16 ; eax = product >> 16 (the imul by int_div[dcscalar] is inside "quant_intra 3"); sets SF for cmovs
366 :			lea ebx, [byte eax + 1] ; workaround for eax < 0
367 :			cmovs eax, ebx ; conditionally move the corrected value
368 :			mov [edx], ax ; coeff[0] = ax
369 :			mov ebx, [esp + 8] ; reload the saved ebx
370 :			add esp, byte 16 ; discard the four save slots (esi, ebx, edi and the ebp pushed by "quant_intra 0"); all were reloaded above
371 :			psubw mm7, mm6 ;D8
372 :			movq [edx + 3 * 32 + 16], mm5 ;C9
373 :			movq [edx + 3 * 32 + 24], mm7 ;D9
374 :
375 :			xor eax, eax ; return 0
376 :			ret
377 :
378 :			ALIGN 16
379 :			; quant == 1 path: same DC handling as above, AC terms go through quant_intra1
380 :			.q1loop:
381 :			quant_intra1 0
382 :			mov ebp, [esp + 16 + 16] ; dcscalar
383 :			movsx eax, word [byte ecx] ; DC
384 :
385 :			quant_intra1 1
386 :			mov edi, eax
387 :			sar edi, 31 ; sign(DC): 0 or -1
388 :			shr ebp, byte 1 ; ebp = dcscalar/2
389 :
390 :			quant_intra1 2
391 :			sub eax, edi ; DC, plus 1 when DC < 0
392 :			xor ebp, edi ; sign(DC)*dcscalar/2, minus 1 when DC < 0
393 :			mov edi, [esp + 16 + 16] ; dcscalar
394 :			lea eax, [byte eax + ebp] ; DC + sign(DC)*dcscalar/2 (the +1 and -1 cancel)
395 :			mov ebp, [byte esp] ; reload the ebp saved by "quant_intra1 0"
396 :
397 :			quant_intra1 3
398 :			psubw mm5, mm4 ;C8
399 :			mov esi, [dword esp + 12] ; reload the saved esi
400 :			mov edi, [esp + 4] ; reload the saved edi
401 :			sar eax, 16 ; eax = product >> 16 (the imul by int_div[dcscalar] is inside "quant_intra1 3"); sets SF for cmovs
402 :			lea ebx, [byte eax + 1] ; workaround for eax < 0
403 :			cmovs eax, ebx ; conditionally move the corrected value
404 :			mov [edx], ax ; coeff[0] = ax
405 :			mov ebx, [esp + 8] ; reload the saved ebx
406 :			add esp, byte 16 ; discard the four save slots (esi, ebx, edi and the ebp pushed by "quant_intra1 0"); all were reloaded above
407 :			psubw mm7, mm6 ;D8
408 :			movq [edx + 3 * 32 + 16], mm5 ;C9
409 :			movq [edx + 3 * 32 + 24], mm7 ;D9
410 :
411 :			xor eax, eax ; return 0
412 :			ret
413 :
414 :
415 :
416 :
417 :			;-----------------------------------------------------------------------------
418 :			;
419 :			; uint32_t quant_h263_inter_3dne(int16_t * coeff,
420 :			; const int16_t const * data,
421 :			; const uint32_t quant,
422 :			; const uint16_t *mpeg_matrices);
423 :			;
424 :			;-----------------------------------------------------------------------------
425 :			;This is Athlon-optimized code (ca 90 clk per call)
426 :			;Optimized by Jaan, 30 Nov 2002
427 :
428 :
429 :			%macro quantinter 1 ; quant != 1 variant; %1 = index (0..4) of the 24-byte group (lanes A,B,C of 8 bytes each)
430 :			movq mm1, [eax] ;A2 mm1 = 0 (the caller points eax at mmzero)
431 :			psraw mm3, 15 ;B6 sign mask of the pending lane B
432 :			%if (%1)
433 :			psubw mm2, mm6 ;C10
434 :			%endif
435 :			psubw mm1, mm0 ;A3 mm1 = -x
436 :			pmulhw mm4, mm7 ;B7 (mm7 = mmx_div row, loaded by the caller)
437 :			movq mm6, [ecx + %1*24+16] ;C1
438 :			pmaxsw mm1, mm0 ;A4 mm1 = |x|
439 :			paddw mm5, mm4 ;B8 sum += quantized magnitude
440 :			%if (%1)
441 :			movq [edx + %1*24+16-24], mm2 ;C11 store lane C of the previous group
442 :			%endif
443 :			psubusw mm1, [ebx] ;A5 mm1 -= sub (unsigned, does not go < 0); the caller points ebx at the mmx_sub row
444 :			pxor mm4, mm3 ;B9
445 :			movq mm2, [eax] ;C2
446 :			psraw mm0, 15 ;A6 mm0 = sign mask of x
447 :			psubw mm4, mm3 ;B10
448 :			psubw mm2, mm6 ;C3
449 :			pmulhw mm1, mm7 ;A7 mm1 = (mm1 * mm7) >> 16
450 :			movq mm3, [ecx + %1*24+8] ;B1
451 :			pmaxsw mm2, mm6 ;C4
452 :			paddw mm5, mm1 ;A8 sum += mm1
453 :			%if (%1)
454 :			movq [edx + %1*24+8-24], mm4 ;B11 store lane B of the previous group
455 :			%else
456 :			movq [edx + 120], mm4 ;B11 first group: store the lane the caller started from [ecx + 120]
457 :			%endif
458 :			psubusw mm2, [ebx] ;C5
459 :			pxor mm1, mm0 ;A9 together with A10: re-apply the sign of x
460 :			movq mm4, [eax] ;B2
461 :			psraw mm6, 15 ;C6
462 :			psubw mm1, mm0 ;A10 undisplace
463 :			psubw mm4, mm3 ;B3
464 :			pmulhw mm2, mm7 ;C7
465 :			movq mm0, [ecx + %1*24+24] ;A1 mm0 = lane A of the next group
466 :			pmaxsw mm4, mm3 ;B4
467 :			paddw mm5, mm2 ;C8
468 :			movq [byte edx + %1*24], mm1 ;A11
469 :			psubusw mm4, [ebx] ;B5
470 :			pxor mm2, mm6 ;C9
471 :			%endmacro
472 :
473 :			%macro quantinter1 1 ; quant == 1 variant; %1 = index (0..7) of the 16-byte group (two 8-byte lanes)
474 :			movq mm0, [byte ecx + %1*16] ;mm0 = [1st]
475 :			movq mm3, [ecx + %1*16+8] ;mm3 = [2nd]
476 :			movq mm1, [eax] ; mm1 = 0 (the caller points eax at mmzero)
477 :			movq mm4, [eax]
478 :			psubw mm1, mm0 ; mm1 = -x
479 :			psubw mm4, mm3
480 :			pmaxsw mm1, mm0 ; mm1 = |x|
481 :			pmaxsw mm4, mm3
482 :			psubusw mm1, mm6 ; mm1 -= sub (unsigned, does not go < 0); mm6 is loaded by the caller from [ebx]
483 :			psubusw mm4, mm6 ;
484 :			psraw mm0, 15 ; mm0 = sign mask of x
485 :			psraw mm3, 15
486 :			psrlw mm1, 1 ; mm1 = mm1 / 2 (this variant is used when quant == 1)
487 :			psrlw mm4, 1 ;
488 :			paddw mm5, mm1 ; sum += mm1
489 :			pxor mm1, mm0 ; together with the psubw below: re-apply the sign of x
490 :			paddw mm5, mm4
491 :			pxor mm4, mm3 ;
492 :			psubw mm1, mm0 ; undisplace
493 :			psubw mm4, mm3
494 :			cmp esp, esp ; NOTE(review): result is not consumed in this macro; presumably filler -- it only sets flags
495 :			movq [byte edx + %1*16], mm1
496 :			movq [edx + %1*16+8], mm4
497 :			%endmacro
498 :
499 :			ALIGN 16
500 :			cglobal quant_h263_inter_3dne
501 :			quant_h263_inter_3dne: ; in: [esp+4] coeff, [esp+8] data, [esp+12] quant; out: eax = sum of the quantized magnitudes
502 :			mov edx, [esp + 4] ; coeff
503 :			mov ecx, [esp + 8] ; data
504 :			mov eax, [esp + 12] ; quant
505 :			push ebx
506 :
507 :			pxor mm5, mm5 ; sum
508 :			nop
509 :			lea ebx,[mmx_sub + eax * 8 - 8] ; sub
510 :			movq mm7, [mmx_div + eax * 8 - 8] ; divider
511 :
512 :			cmp al, 1 ; quant == 1? (lea below leaves the flags intact)
513 :			lea eax, [mmzero] ; eax -> 8 zero bytes, reloaded by the macros as their zero operand
514 :			jz near .q1loop
515 :			cmp esp, esp ; NOTE(review): result unused; presumably filler
516 :			ALIGN 8
517 :			movq mm3, [ecx + 120] ;B1 start lane B on the last 8 bytes; "quantinter 0" finishes and stores it
518 :			pxor mm4, mm4 ;B2
519 :			psubw mm4, mm3 ;B3
520 :			movq mm0, [ecx] ;A1 mm0 = [1st]
521 :			pmaxsw mm4, mm3 ;B4
522 :			psubusw mm4, [ebx] ;B5
523 :
524 :			quantinter 0
525 :			quantinter 1
526 :			quantinter 2
527 :			quantinter 3
528 :			quantinter 4
529 :			; drain the lanes B and C left pending by "quantinter 4"
530 :			psraw mm3, 15 ;B6
531 :			psubw mm2, mm6 ;C10
532 :			pmulhw mm4, mm7 ;B7
533 :			paddw mm5, mm4 ;B8
534 :			pxor mm4, mm3 ;B9
535 :			psubw mm4, mm3 ;B10
536 :			movq [edx + 4*24+16], mm2 ;C11
537 :			pop ebx
538 :			movq [edx + 4*24+8], mm4 ;B11
539 :			pmaddwd mm5, [plus_one] ; fold the four word sums into two dwords
540 :			movq mm0, mm5
541 :			punpckhdq mm5, mm5 ; copy the high dword into the low dword
542 :			paddd mm0, mm5
543 :			movd eax, mm0 ; return sum
544 :
545 :			ret
546 :
547 :			ALIGN 16
548 :			.q1loop:
549 :			movq mm6, [byte ebx] ; mm6 = mmx_sub row for quant (quantinter1 subtracts it)
550 :
551 :			quantinter1 0
552 :			quantinter1 1
553 :			quantinter1 2
554 :			quantinter1 3
555 :			quantinter1 4
556 :			quantinter1 5
557 :			quantinter1 6
558 :			quantinter1 7
559 :
560 :			pmaddwd mm5, [plus_one] ; fold the four word sums into two dwords
561 :			movq mm0, mm5
562 :			psrlq mm5, 32 ; move the high dword into the low dword
563 :			paddd mm0, mm5
564 :			movd eax, mm0 ; return sum
565 :
566 :			pop ebx
567 :
568 :			ret
569 :
570 :			;-----------------------------------------------------------------------------
571 :			;
572 :			; uint32_t dequant_h263_intra_3dne(int16_t *data,
573 :			; const int16_t const *coeff,
574 :			; const uint32_t quant,
575 :			; const uint32_t dcscalar,
576 :			; const uint16_t *mpeg_matrices);
577 :			;
578 :			;-----------------------------------------------------------------------------
579 :
580 :			; this is the same as dequant_inter_3dne, except that we're
581 :			; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
582 :
583 :			;This is Athlon-optimized code (ca 106 clk per call)
584 :
585 :			%macro dequant 1 ; %1 = index (0..4) of the 24-byte group (lanes A,B,C); callers set ecx=coeff, edx=data, edi->mmx_mul row, eax->mmx_add row, ebx->mmx_2047, esi->mmzero
586 :			movq mm1, [ecx+%1*24] ; c = coeff[i] ;A2
587 :			psubw mm0, mm1 ;-c ;A3 (1st dep)
588 :			%if (%1)
589 :			paddw mm4, mm6 ;C11 mm6 free (4th+)
590 :			%endif
591 :			pmaxsw mm0, mm1 ;|c| ;A4 (2nd)
592 :			%if (%1)
593 :			mov ebp, ebp ; leaves ebp unchanged; presumably filler
594 :			pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) later
595 :			%endif
596 :			movq mm6, [esi] ;0 ;A5 mm6 in use
597 :			pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
598 :			%if (%1)
599 :			pxor mm5, mm4 ;C13 (6th+) 1later
600 :			%endif
601 :			movq mm4, [esi] ;C1 ;0
602 :			mov esp, esp ; leaves esp unchanged; presumably filler
603 :			pcmpeqw mm6, [ecx+%1*24] ;A6 (c ==0) ? -1 : 0 (1st)
604 :			ALIGN 4
605 :			psraw mm1, 15 ; sign(c) ;A7 (2nd)
606 :			%if (%1)
607 :			movq [edx+%1*24+16-24], mm5 ; C14 (7th) 2later
608 :			%endif
609 :			paddw mm7, mm3 ;B10 offset +negate back (3rd)
610 :			pmullw mm0, [edi] ;*= 2Q ;A8 (3rd+)
611 :			paddw mm2, mm7 ;B11 mm7 free (4th+)
612 :			lea ebp, [byte ebp] ; leaves ebp unchanged; presumably filler
613 :			movq mm5, [ecx+%1*24+16] ;C2 ; c = coeff[i]
614 :			psubw mm4, mm5 ;-c ;C3 (1st dep)
615 :			pandn mm6, [eax] ;A9 offset = isZero ? 0 : quant_add (2nd)
616 :			pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
617 :			pxor mm3, mm2 ;B13 (6th+)
618 :			movq mm2, [byte esi] ;B1 ;0
619 :			%if (%1)
620 :			movq [edx+%1*24+8-24], mm3 ;B14 (7th)
621 :			%else
622 :			movq [edx+120], mm3 ;B14 first group: store the lane the caller started from [ecx+120]
623 :			%endif
624 :			pmaxsw mm4, mm5 ;|c| ;C4 (2nd)
625 :			paddw mm6, mm1 ;A10 offset +negate back (3rd)
626 :			movq mm3, [ecx+%1*24 + 8] ;B2 ; c = coeff[i]
627 :			psubw mm2, mm3 ;-c ;B3 (1st dep)
628 :			paddw mm0, mm6 ;A11 mm6 free (4th+)
629 :			movq mm6, [byte esi] ;0 ;C5 mm6 in use
630 :			pcmpeqw mm6, [ecx+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st)
631 :			pminsw mm0, [ebx] ;A12 saturates to +2047 (5th+)
632 :			pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
633 :			pxor mm1, mm0 ;A13 (6th+)
634 :			pmullw mm4, [edi] ;*= 2Q ;C8 (3rd+)
635 :			psraw mm5, 15 ; sign(c) ;C7 (2nd)
636 :			movq mm7, [byte esi] ;0 ;B5 mm7 in use
637 :			pcmpeqw mm7, [ecx+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st)
638 :			%if (%1 < 4)
639 :			movq mm0, [byte esi] ;A1 ;0
640 :			%endif
641 :			pandn mm6, [byte eax] ;C9 offset = isZero ? 0 : quant_add (2nd)
642 :			psraw mm3, 15 ;sign(c) ;B7 (2nd)
643 :			movq [byte edx+%1*24], mm1 ;A14 (7th)
644 :			paddw mm6, mm5 ;C10 offset +negate back (3rd)
645 :			pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
646 :			mov esp, esp ; leaves esp unchanged; presumably filler
647 :			%endmacro
648 :
649 :
650 :			ALIGN 16
651 :			cglobal dequant_h263_intra_3dne
652 :			dequant_h263_intra_3dne: ; in: [esp+4] data, [esp+8] coeff, [esp+12] quant, [esp+16] dcscalar; out: eax = 0
653 :			mov ecx, [esp+ 8] ; coeff
654 :			mov eax, [esp+12] ; quant
655 :			pxor mm0, mm0
656 :			pxor mm2, mm2
657 :			push edi
658 :			push ebx
659 :			lea edi, [mmx_mul + eax*8 - 8] ; 2*quant
660 :			push ebp
661 :			mov ebx, mmx_2047 ; ebx -> +2047 saturation limit (pminsw operand)
662 :			movsx ebp, word [ecx] ; ebp = coeff[0] (DC)
663 :			lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
664 :			push esi
665 :			mov esi, mmzero ; esi -> 8 zero bytes
666 :			pxor mm7, mm7
667 :			movq mm3, [ecx+120] ;B2 ; c = coeff[i]
668 :			pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
669 :
670 :			imul ebp, [esp+16+16] ; dcscalar (4 pushes = 16 bytes of extra offset): ebp = DC * dcscalar
671 :			psubw mm2, mm3 ;-c ;B3 (1st dep)
672 :			pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
673 :			pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
674 :			psraw mm3, 15 ; sign(c) ;B7 (2nd)
675 :			mov edx, [esp+ 4+16] ; data
676 :
677 :			ALIGN 8
678 :			dequant 0
679 :
680 :			cmp ebp, -2048 ; flags are consumed by the cmovl after "dequant 1" (the macro leaves EFLAGS alone)
681 :			mov esp, esp
682 :
683 :			dequant 1
684 :
685 :			cmovl ebp, [int_2048] ; clamp DC to >= -2048
686 :			nop
687 :
688 :			dequant 2
689 :
690 :			cmp ebp, 2047 ; flags are consumed by the cmovg after "dequant 3"
691 :			mov esp, esp
692 :
693 :			dequant 3
694 :
695 :			cmovg ebp, [int2047] ; clamp DC to <= 2047
696 :			nop
697 :
698 :			dequant 4
699 :			; drain the lanes B and C left pending by "dequant 4", restoring the saved registers in between
700 :			paddw mm4, mm6 ;C11 mm6 free (4th+)
701 :			pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
702 :			pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
703 :			mov eax, ebp ; eax = clamped DC
704 :			mov esi, [esp] ; reload the saved esi
705 :			mov ebp, [esp+4] ; reload the saved ebp
706 :			pxor mm5, mm4 ;C13 (6th+)
707 :			paddw mm7, mm3 ;B10 offset +negate back (3rd)
708 :			movq [edx+4*24+16], mm5 ;C14 (7th)
709 :			paddw mm2, mm7 ;B11 mm7 free (4th+)
710 :			pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
711 :			mov ebx, [esp+8] ; reload the saved ebx
712 :			mov edi, [esp+12] ; reload the saved edi
713 :			add esp, byte 16 ; discard the four save slots
714 :			pxor mm3, mm2 ;B13 (6th+)
715 :			movq [edx+4*24+8], mm3 ;B14 (7th)
716 :			mov [edx], ax ; data[0] = clamped DC (overwrites the word stored by "dequant 0")
717 :
718 :			xor eax, eax ; return 0
719 :			ret
720 :
721 :			;-----------------------------------------------------------------------------
722 :			;
723 :			; uint32_t dequant_h263_inter_3dne(int16_t * data,
724 :			; const int16_t * const coeff,
725 :			; const uint32_t quant,
726 :			; const uint16_t *mpeg_matrices);
727 :			;
728 :			;-----------------------------------------------------------------------------
729 :
730 :			; this is the same as dequant_inter_3dne,
731 :			; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
732 :			; This is Athlon-optimized code (ca 100 clk per call)
733 :
734 :			ALIGN 16
735 :			cglobal dequant_h263_inter_3dne
736 :			dequant_h263_inter_3dne: ; in: [esp+4] data, [esp+8] coeff, [esp+12] quant; out: eax = 0
737 :			mov ecx, [esp+ 8] ; coeff
738 :			mov eax, [esp+12] ; quant
739 :			pxor mm0, mm0
740 :			pxor mm2, mm2
741 :			push edi
742 :			push ebx
743 :			push esi
744 :			lea edi, [mmx_mul + eax*8 - 8] ; 2*quant
745 :			mov ebx, mmx_2047 ; ebx -> +2047 saturation limit (pminsw operand)
746 :			pxor mm7, mm7
747 :			movq mm3, [ecx+120] ;B2 ; c = coeff[i]
748 :			pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
749 :			lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
750 :			psubw mm2, mm3 ;-c ;B3 (1st dep)
751 :			mov esi, mmzero ; esi -> 8 zero bytes
752 :			pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
753 :			pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
754 :			psraw mm3, 15 ; sign(c) ;B7 (2nd)
755 :			mov edx, [dword esp+ 4+12] ; data (3 pushes = 12 bytes of extra offset)
756 :
757 :			ALIGN 8
758 :
759 :			dequant 0
760 :			dequant 1
761 :			dequant 2
762 :			dequant 3
763 :			dequant 4
764 :			; drain the lanes B and C left pending by "dequant 4", restoring the saved registers in between
765 :			paddw mm4, mm6 ;C11 mm6 free (4th+)
766 :			pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
767 :			pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
768 :			mov esi, [esp] ; reload the saved esi
769 :			pxor mm5, mm4 ;C13 (6th+)
770 :			paddw mm7, mm3 ;B10 offset +negate back (3rd)
771 :			movq [edx+4*24+16], mm5 ;C14 (7th)
772 :			paddw mm2, mm7 ;B11 mm7 free (4th+)
773 :			pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
774 :			mov ebx, [esp+4] ; reload the saved ebx
775 :			mov edi, [esp+8] ; reload the saved edi
776 :			add esp, byte 12 ; discard the three save slots
777 :			pxor mm3, mm2 ;B13 (6th+)
778 :			movq [edx+4*24+8], mm3 ;B14 (7th)
779 :
780 :			xor eax, eax ; return 0
781 :			ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4