Annotation of /trunk/xvidcore/src/quant/x86_asm/quantize_h263_3dne.asm

Revision 1793 - (view) (download)

1 :	edgomez	1382	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * - 3dne Quantization/Dequantization -
5 :			; *
6 :			; * Copyright(C) 2002-2003 Jaan Kalda
7 :			; *
8 :			; * This program is free software ; you can redistribute it and/or modify
9 :			; * it under the terms of the GNU General Public License as published by
10 :			; * the Free Software Foundation ; either version 2 of the License, or
11 :			; * (at your option) any later version.
12 :			; *
13 :			; * This program is distributed in the hope that it will be useful,
14 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
15 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 :			; * GNU General Public License for more details.
17 :			; *
18 :			; * You should have received a copy of the GNU General Public License
19 :			; * along with this program ; if not, write to the Free Software
20 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 :			; *
22 :	Isibaar	1793	; * $Id: quantize_h263_3dne.asm,v 1.8 2008-11-11 20:46:24 Isibaar Exp $
23 :	edgomez	1382	; *
24 :			; *************************************************************************/
25 :			;
26 :			; these 3dne functions are compatible with iSSE, but are optimized specifically for
27 :			; K7 pipelines
28 :
29 :			; enable dequant saturate [-2048,2047], test purposes only.
			; NOTE(review): nothing in this file tests SATURATE with %ifdef; the
			; dequant routines below clamp their results unconditionally.
30 :			%define SATURATE
31 :
			; all code below is assembled for 32-bit mode
32 :			BITS 32
33 :
34 :			%macro cglobal 1
			; cglobal NAME -- export NAME as a global symbol.
			;   PREFIX defined     : the exported symbol is _NAME, and NAME itself is
			;                        %define'd so later uses of NAME expand to the
			;                        underscored form.
			;   MARK_FUNCS defined : the global directive also carries the ":function"
			;                        type and the size expression NAME.endfunc-NAME,
			;                        and ENDFUNC expands to the local label .endfunc
			;                        (to be placed right after the function body).
			;   otherwise          : ENDFUNC expands to nothing.
35 :			%ifdef PREFIX
36 :	edgomez	1535	%ifdef MARK_FUNCS
37 :	edgomez	1540	global _%1:function %1.endfunc-%1
38 :			%define %1 _%1:function %1.endfunc-%1
39 :	Isibaar	1793	%define ENDFUNC .endfunc
40 :	edgomez	1535	%else
41 :			global _%1
42 :			%define %1 _%1
43 :	Isibaar	1793	%define ENDFUNC
44 :	edgomez	1535	%endif
45 :	edgomez	1382	%else
46 :	edgomez	1535	%ifdef MARK_FUNCS
47 :	edgomez	1540	global %1:function %1.endfunc-%1
48 :	Isibaar	1793	%define ENDFUNC .endfunc
49 :	edgomez	1535	%else
50 :			global %1
51 :	Isibaar	1793	%define ENDFUNC
52 :	edgomez	1535	%endif
53 :	edgomez	1382	%endif
54 :			%endmacro
55 :
56 :			;=============================================================================
57 :			; Local data
58 :			;=============================================================================
59 :
60 :			%ifdef FORMAT_COFF
61 :	edgomez	1519	SECTION .rodata
62 :	edgomez	1382	%else
63 :	edgomez	1519	SECTION .rodata align=16
64 :	edgomez	1382	%endif
65 :
			; int_div[n] = (65536 / n) + 1 for n = 1..255, int_div[0] = 0.
			; The intra quantizers multiply by int_div[dcscalar] and shift right by
			; 16 to divide the DC term without a div instruction.
66 :			align 4
67 :			int_div:
68 :			dd 0
69 :			%assign i 1
70 :			%rep 255
71 :			dd (1 << 16) / (i) + 1
72 :			%assign i i+1
73 :			%endrep
74 :
			; eight words of 1: pmaddwd against this adds adjacent word lanes
75 :			ALIGN 16
76 :			plus_one:
77 :			times 8 dw 1
78 :
79 :			;-----------------------------------------------------------------------------
80 :			; subtract by Q/2 table
			; 31 rows of 4 words; row (Q-1) holds Q/2, addressed as mmx_sub + Q*8 - 8
81 :			;-----------------------------------------------------------------------------
82 :
83 :			ALIGN 16
84 :			mmx_sub:
85 :			%assign i 1
86 :			%rep 31
87 :			times 4 dw i / 2
88 :			%assign i i+1
89 :			%endrep
90 :
91 :
92 :			;-----------------------------------------------------------------------------
93 :			;
94 :			; divide by 2Q table
95 :			;
96 :			; use a shift of 16 to take full advantage of _pmulhw_
97 :			; for q=1, _pmulhw_ will overflow so it is treated separately
98 :			; (3dnow2 provides _pmulhuw_ which won't cause overflow)
			;
			; 31 rows of 4 words; row (Q-1) holds 65536/(2*Q) + 1
99 :			;
100 :			;-----------------------------------------------------------------------------
101 :
102 :			ALIGN 16
103 :			mmx_div:
104 :			%assign i 1
105 :			%rep 31
106 :			times 4 dw (1 << 16) / (i * 2) + 1
107 :			%assign i i+1
108 :			%endrep
109 :
110 :			;-----------------------------------------------------------------------------
111 :			; add by (odd(Q) ? Q : Q - 1) table
			; 31 rows of 4 words, addressed as mmx_add + Q*8 - 8
112 :			;-----------------------------------------------------------------------------
113 :
114 :			ALIGN 16
115 :			mmx_add:
116 :			%assign i 1
117 :			%rep 31
118 :			%if i % 2 != 0
119 :			times 4 dw i
120 :			%else
121 :			times 4 dw i - 1
122 :			%endif
123 :			%assign i i+1
124 :			%endrep
125 :
126 :			;-----------------------------------------------------------------------------
127 :			; multiply by 2Q table
			; 31 rows of 4 words; row (Q-1) holds 2*Q, addressed as mmx_mul + Q*8 - 8
128 :			;-----------------------------------------------------------------------------
129 :
130 :			ALIGN 16
131 :			mmx_mul:
132 :			%assign i 1
133 :			%rep 31
134 :			times 4 dw i * 2
135 :			%assign i i+1
136 :			%endrep
137 :
138 :			;-----------------------------------------------------------------------------
139 :			; saturation limits
			; (the two mmx_3276x_minus_204x constants are not referenced by any
			; routine in this file)
140 :			;-----------------------------------------------------------------------------
141 :
142 :			ALIGN 8
143 :			mmx_32768_minus_2048:
144 :			times 4 dw (32768-2048)
145 :			mmx_32767_minus_2047:
146 :			times 4 dw (32767-2047)
147 :
			; upper clamp used with pminsw by the dequant macro
148 :			ALIGN 16
149 :			mmx_2047:
150 :			times 4 dw 2047
151 :
			; mmzero: 8 zero bytes, loaded with movq wherever a zeroed MMX register
			; is needed; int2047 / int_2048: scalar DC clamp limits for cmovg / cmovl
152 :			ALIGN 8
153 :			mmzero:
154 :			dd 0, 0
155 :			int2047:
156 :			dd 2047
157 :			int_2048:
158 :			dd -2048
159 :
160 :			;=============================================================================
161 :			; Code
162 :			;=============================================================================
163 :
164 :			SECTION .text
165 :
166 :
167 :			;-----------------------------------------------------------------------------
168 :			;
169 :			; uint32_t quant_h263_intra_3dne(int16_t * coeff,
170 :			; const int16_t const * data,
171 :			; const uint32_t quant,
172 :			; const uint32_t dcscalar,
173 :			; const uint16_t *mpeg_matrices);
174 :			;
175 :			;-----------------------------------------------------------------------------
176 :			;This is Athlon-optimized code (ca 70 clk per call)
177 :
178 :			%macro quant_intra1 1
			; quant_intra1 N (N = 0..3) -- one 32-byte step of the quant == 1 intra
			; path: each word x becomes sign(x) * (|x| >> 1).
			; Register roles, as used below:
			;   ecx = source, edx = destination, ebx -> 8 zero bytes (mmzero)
			;   A/B streams: mm0/mm2 hold source words on entry, mm1/mm3 hold zero
			;   C/D streams: loaded here, finished by the NEXT invocation (steps
			;                C8/D8/C9/D9), or by the caller after N == 3
			; Side effects: N == 0 pushes ebp; N == 3 executes
			;   imul eax, [int_div + 4*edi]  for the caller's DC computation.
			; The "cmp esp, esp" only changes flags -- presumably scheduling filler.
179 :			psubw mm1, mm0 ;A3 mm1 = -x
180 :			psubw mm3, mm2 ;B3
181 :			%if (%1)
182 :			psubw mm5, mm4 ;C8 finish C/D of the previous step
183 :			psubw mm7, mm6 ;D8
184 :			%endif
185 :
186 :			ALIGN 8
187 :			db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16] ;C1 (hand-encoded, SIB form, disp8)
188 :			pmaxsw mm1, mm0 ;A4 mm1 = |x|
189 :			db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24] ;D1 (hand-encoded, SIB form, disp8)
190 :			pmaxsw mm3, mm2 ;B4
191 :
192 :
193 :			psraw mm0, 15 ;A5 mm0 = sign mask (0 or -1)
194 :			psraw mm2, 15 ;B5
195 :			%if (%1)
196 :			movq [edx + %1 * 32 + 16-32], mm5 ;C9 store C of the previous step
197 :			movq [edx + %1 * 32 + 24-32], mm7 ;D9 store D of the previous step
198 :			%endif
199 :
200 :			psrlw mm1, 1 ;A6 |x| / 2
201 :			psrlw mm3, 1 ;B6
202 :			movq mm5, [ebx] ;C2 mm5 = 0
203 :			movq mm7, [ebx] ;D2 mm7 = 0
204 :
205 :			pxor mm1, mm0 ;A7 xor/sub with the sign mask re-applies the sign
206 :			pxor mm3, mm2 ;B7
207 :
208 :			psubw mm5, mm4 ;C3
209 :			psubw mm7, mm6 ;D3
210 :			psubw mm1, mm0 ;A8
211 :			psubw mm3, mm2 ;B8
212 :
213 :			%if (%1 == 0)
214 :			push ebp
215 :			movq mm0, [ecx + %1 * 32 +32]
216 :			%elif (%1 < 3)
217 :			db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 (next step)
218 :			%endif
219 :			pmaxsw mm5, mm4 ;C4
220 :			%if (%1 < 3)
221 :			db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 (next step)
222 :			%else
223 :			cmp esp, esp
224 :			%endif
225 :			pmaxsw mm7, mm6 ;D4
226 :
227 :			psraw mm4, 15 ;C5
228 :			psraw mm6, 15 ;D5
229 :			movq [byte edx + %1 * 32], mm1 ;A9 store A
230 :			movq [edx + %1 * 32+8], mm3 ;B9 store B
231 :
232 :
233 :			psrlw mm5, 1 ;C6
234 :			psrlw mm7, 1 ;D6
235 :			%if (%1 < 3)
236 :			movq mm1, [ebx] ;A2 re-zero for the next step
237 :			movq mm3, [ebx] ;B2
238 :			%endif
239 :			%if (%1 == 3)
240 :			imul eax, [int_div+4*edi]
241 :			%endif
242 :			pxor mm5, mm4 ;C7
243 :			pxor mm7, mm6 ;D7
244 :			%endm
245 :
246 :
247 :			%macro quant_intra 1
			; quant_intra N (N = 0..3) -- one 32-byte step of the general intra path:
			; each word x becomes sign(x) * pmulhw(|x|, [esi]), where the caller
			; points esi at the mmx_div row for the current quantizer.
			; Register roles, as used below:
			;   ecx = source, edx = destination, ebx -> 8 zero bytes (mmzero)
			;   A/B streams: mm0/mm2 hold source words on entry, mm1/mm3 hold zero
			;   C/D streams: loaded here, finished by the NEXT invocation (steps
			;                C8/D8/C9/D9), or by the caller after N == 3
			; Side effects: N == 0 pushes ebp; N == 3 executes
			;   imul eax, [int_div + 4*edi]  for the caller's DC computation.
248 :			; Rules for athlon:
249 :			; 1) schedule latencies
250 :			; 2) add/mul and load/store in 2:1 proportion
251 :			; 3) avoid splitting >3byte instructions over 8byte boundaries
252 :
253 :			psubw mm1, mm0 ;A3 mm1 = -x
254 :			psubw mm3, mm2 ;B3
255 :			%if (%1)
256 :			psubw mm5, mm4 ;C8 finish C/D of the previous step
257 :			psubw mm7, mm6 ;D8
258 :			%endif
259 :
260 :			ALIGN 8
261 :			db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16] ;C1 (hand-encoded, SIB form, disp8)
262 :			pmaxsw mm1, mm0 ;A4 mm1 = |x|
263 :			db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24] ;D1 (hand-encoded, SIB form, disp8)
264 :			pmaxsw mm3, mm2 ;B4
265 :
266 :
267 :			psraw mm0, 15 ;A5 mm0 = sign mask (0 or -1)
268 :			psraw mm2, 15 ;B5
269 :			%if (%1)
270 :			movq [edx + %1 * 32 + 16-32], mm5 ;C9 store C of the previous step
271 :			movq [edx + %1 * 32 + 24-32], mm7 ;D9 store D of the previous step
272 :			%endif
273 :
274 :			pmulhw mm1, [esi] ;A6 (|x| * divider) >> 16
275 :			pmulhw mm3, [esi] ;B6
276 :			movq mm5, [ebx] ;C2 mm5 = 0
277 :			movq mm7, [ebx] ;D2 mm7 = 0
278 :
279 :			nop
280 :			nop
281 :			pxor mm1, mm0 ;A7 xor/sub with the sign mask re-applies the sign
282 :			pxor mm3, mm2 ;B7
283 :
284 :			psubw mm5, mm4 ;C3
285 :			psubw mm7, mm6 ;D3
286 :			psubw mm1, mm0 ;A8
287 :			psubw mm3, mm2 ;B8
288 :
289 :
290 :			%if (%1 < 3)
291 :			db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 (next step)
292 :			%endif
293 :			pmaxsw mm5, mm4 ;C4
294 :			%if (%1 < 3)
295 :			db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 (next step)
296 :			%else
297 :			cmp esp, esp
298 :			%endif
299 :			pmaxsw mm7,mm6 ;D4
300 :
301 :			psraw mm4, 15 ;C5
302 :			psraw mm6, 15 ;D5
303 :			movq [byte edx + %1 * 32], mm1 ;A9 store A
304 :			movq [edx + %1 * 32+8], mm3 ;B9 store B
305 :
306 :
307 :			pmulhw mm5, [esi] ;C6
308 :			pmulhw mm7, [esi] ;D6
309 :			%if (%1 < 3)
310 :			movq mm1, [ebx] ;A2 re-zero for the next step
311 :			movq mm3, [ebx] ;B2
312 :			%endif
313 :			%if (%1 == 0)
314 :			push ebp
315 :			%elif (%1 < 3)
316 :			nop
317 :			%endif
318 :			nop
319 :			%if (%1 == 3)
320 :			imul eax, [int_div+4*edi]
321 :			%endif
322 :			pxor mm5, mm4 ;C7
323 :			pxor mm7, mm6 ;D7
324 :			%endmacro
325 :
326 :
327 :			ALIGN 16
328 :			cglobal quant_h263_intra_3dne
329 :			quant_h263_intra_3dne:
			; Stack arguments (cdecl): [esp+4] coeff (output), [esp+8] data (input),
			; [esp+12] quant, [esp+16] dcscalar.  Always returns 0 in eax.
			; 128 bytes are quantized by four quant_intra / quant_intra1 steps; the DC
			; term is computed with scalar code interleaved between the steps and
			; written to coeff[0] after the vector store.
			; esi, ebx, edi are pushed here and ebp is pushed by the "0" macro step,
			; so the saved-register frame is [esp]=ebp, +4=edi, +8=ebx, +12=esi.
			; NOTE(review): quant indexes 31-row tables and dcscalar indexes the
			; 256-entry int_div table -- callers presumably keep them in range.
330 :
331 :			mov eax, [esp + 12] ; quant
332 :			mov ecx, [esp + 8] ; data
333 :			mov edx, [esp + 4] ; coeff
334 :			cmp al, 1 ; flags stay live until the jz below (nothing in between writes them)
335 :			pxor mm1, mm1
336 :			pxor mm3, mm3
337 :			movq mm0, [ecx] ; mm0 = [1st]
338 :			movq mm2, [ecx + 8]
339 :			push esi
340 :			lea esi, [mmx_div + eax*8 - 8] ; esi -> divider row for this quant
341 :
342 :			push ebx
343 :			mov ebx, mmzero
344 :			push edi
345 :			jz near .q1loop ; quant == 1 uses the shift-based path
346 :
347 :			quant_intra 0
348 :			mov ebp, [esp + 16 + 16] ; dcscalar
349 :			; NB -- there are 3 pushes in the function preamble and one more
350 :			; in "quant_intra 0", thus an added offset of 16 bytes
351 :			movsx eax, word [byte ecx] ; DC
352 :
353 :			quant_intra 1
354 :			mov edi, eax
355 :			sar edi, 31 ; sign(DC)
356 :			shr ebp, byte 1 ; ebp = dcscalar/2
357 :
358 :			quant_intra 2
359 :			sub eax, edi ; DC (+1)
360 :			xor ebp, edi ; sign(DC)*dcscalar/2 (-1)
361 :			mov edi, [esp + 16 + 16] ; dcscalar
362 :			lea eax, [byte eax + ebp] ; DC + sign(DC)*dcscalar/2
363 :			mov ebp, [byte esp] ; reload the caller's ebp saved by "quant_intra 0"
364 :
365 :			quant_intra 3 ; also performs eax *= int_div[dcscalar]
366 :			psubw mm5, mm4 ;C8
367 :			mov esi, [esp + 12] ; pop back the register value
368 :			mov edi, [esp + 4] ; pop back the register value
369 :			sar eax, 16
370 :			lea ebx, [byte eax + 1] ; workaround for eax < 0
371 :			cmovs eax, ebx ; conditionally move the corrected value (sign flag comes from the sar)
372 :			mov [edx], ax ; coeff[0] = ax
373 :			mov ebx, [esp + 8] ; pop back the register value
374 :			add esp, byte 16 ; drop the four saved-register slots (ebp was already reloaded from [esp] above)
375 :			psubw mm7, mm6 ;D8
376 :			movq [edx + 3 * 32 + 16], mm5 ;C9
377 :			movq [edx + 3 * 32 + 24], mm7 ;D9
378 :
379 :			xor eax, eax
380 :			ret
381 :
382 :			ALIGN 16
383 :
384 :	Isibaar	1793	.q1loop:
			; quant == 1: same flow as above, using the shift-based macro
385 :	edgomez	1382	quant_intra1 0
386 :			mov ebp, [esp + 16 + 16] ; dcscalar
387 :			movsx eax, word [byte ecx] ; DC
388 :
389 :			quant_intra1 1
390 :			mov edi, eax
391 :			sar edi, 31 ; sign(DC)
392 :			shr ebp, byte 1 ; ebp = dcscalar /2
393 :
394 :			quant_intra1 2
395 :			sub eax, edi ; DC (+1)
396 :			xor ebp, edi ; sign(DC)*dcscalar /2 (-1)
397 :			mov edi, [esp + 16 + 16] ; dcscalar
398 :			lea eax, [byte eax + ebp] ; DC + sign(DC)*dcscalar /2
399 :			mov ebp, [byte esp] ; reload the caller's ebp saved by "quant_intra1 0"
400 :
401 :			quant_intra1 3 ; also performs eax *= int_div[dcscalar]
402 :			psubw mm5, mm4 ;C8
403 :	suxen_drol	1787	mov esi, [esp + 12] ; pop back the register value
404 :	edgomez	1382	mov edi, [esp + 4] ; pop back the register value
405 :			sar eax, 16
406 :			lea ebx, [byte eax + 1] ; workaround for eax < 0
407 :			cmovs eax, ebx ; conditionally move the corrected value (sign flag comes from the sar)
408 :			mov [edx], ax ; coeff[0] = ax
409 :			mov ebx, [esp + 8] ; pop back the register value
410 :			add esp, byte 16 ; drop the four saved-register slots (ebp was already reloaded from [esp] above)
411 :			psubw mm7, mm6 ;D8
412 :			movq [edx + 3 * 32 + 16], mm5 ;C9
413 :			movq [edx + 3 * 32 + 24], mm7 ;D9
414 :
415 :			xor eax, eax
416 :			ret
417 :	Isibaar	1793	ENDFUNC
418 :	edgomez	1382
419 :
420 :
421 :
422 :			;-----------------------------------------------------------------------------
423 :			;
424 :			; uint32_t quant_h263_inter_3dne(int16_t * coeff,
425 :			; const int16_t const * data,
426 :			; const uint32_t quant,
427 :			; const uint16_t *mpeg_matrices);
428 :			;
429 :			;-----------------------------------------------------------------------------
430 :			;This is Athlon-optimized code (ca 90 clk per call)
431 :			;Optimized by Jaan, 30 Nov 2002
432 :
433 :
434 :			%macro quantinter 1
			; quantinter N (N = 0..4) -- one 24-byte step of the general inter path.
			; Per word: q = pmulhw(max(|x| - [ebx], 0), mm7), result = sign(x) * q,
			; and q is also accumulated into mm5.
			; Register roles, as used below:
			;   ecx = source, edx = destination, eax -> 8 zero bytes,
			;   ebx -> subtract row, mm7 = divider row, mm5 = running sum
			; Three streams (A, B, C) are interleaved; B and C are carried across
			; invocations, so the caller primes B before N == 0 and finishes B and C
			; after N == 4.  For N == 0 the carried-in B stream is the one the caller
			; loaded from offset 120, hence the store to [edx + 120].
435 :			movq mm1, [eax] ;A2 mm1 = 0
436 :			psraw mm3, 15 ;B6
437 :			%if (%1)
438 :			psubw mm2, mm6 ;C10
439 :			%endif
440 :			psubw mm1, mm0 ;A3 mm1 = -x
441 :			pmulhw mm4, mm7 ;B7
442 :			movq mm6, [ecx + %1*24+16] ;C1
443 :			pmaxsw mm1, mm0 ;A4 mm1 = |x|
444 :			paddw mm5, mm4 ;B8
445 :			%if (%1)
446 :			movq [edx + %1*24+16-24], mm2 ;C11
447 :			%endif
448 :			psubusw mm1, [ebx] ;A5 mm1 -= sub (unsigned, don't go < 0)
449 :			pxor mm4, mm3 ;B9
450 :			movq mm2, [eax] ;C2
451 :			psraw mm0, 15 ;A6 mm0 = sign mask
452 :			psubw mm4, mm3 ;B10
453 :			psubw mm2, mm6 ;C3
454 :			pmulhw mm1, mm7 ;A7 mm1 = (mm1 * divider) >> 16
455 :			movq mm3, [ecx + %1*24+8] ;B1
456 :			pmaxsw mm2, mm6 ;C4
457 :			paddw mm5, mm1 ;A8 sum += mm1
458 :			%if (%1)
459 :			movq [edx + %1*24+8-24], mm4 ;B11
460 :			%else
461 :			movq [edx + 120], mm4 ;B11
462 :			%endif
463 :			psubusw mm2, [ebx] ;C5
464 :			pxor mm1, mm0 ;A9 re-apply the sign (xor, then sub below)
465 :			movq mm4, [eax] ;B2
466 :			psraw mm6, 15 ;C6
467 :			psubw mm1, mm0 ;A10 undisplace
468 :			psubw mm4, mm3 ;B3
469 :			pmulhw mm2, mm7 ;C7
470 :			movq mm0, [ecx + %1*24+24] ;A1 mm0 = source words for the next step
471 :			pmaxsw mm4, mm3 ;B4
472 :			paddw mm5, mm2 ;C8
473 :			movq [byte edx + %1*24], mm1 ;A11
474 :			psubusw mm4, [ebx] ;B5
475 :			pxor mm2, mm6 ;C9
476 :			%endmacro
477 :
478 :			%macro quantinter1 1
			; quantinter1 N (N = 0..7) -- one 16-byte step of the quant == 1 inter
			; path.  Per word: q = max(|x| - mm6, 0) >> 1, result = sign(x) * q,
			; and q is also accumulated into mm5.
			; Expects: ecx = source, edx = destination, eax -> 8 zero bytes,
			;          mm6 = subtract row (loaded by the caller), mm5 = running sum.
			; Unlike quantinter, no state is carried between invocations.
479 :			movq mm0, [byte ecx + %1*16] ;mm0 = [1st]
480 :			movq mm3, [ecx + %1*16+8] ;
481 :			movq mm1, [eax] ; mm1 = 0
482 :			movq mm4, [eax] ; mm4 = 0
483 :			psubw mm1, mm0 ; -x
484 :			psubw mm4, mm3
485 :			pmaxsw mm1, mm0 ; |x|
486 :			pmaxsw mm4, mm3
487 :			psubusw mm1, mm6 ; mm1 -= sub (unsigned, don't go < 0)
488 :			psubusw mm4, mm6 ;
489 :			psraw mm0, 15 ; sign mask
490 :			psraw mm3, 15
491 :			psrlw mm1, 1 ; mm1 >>= 1 (divide by 2Q with Q == 1)
492 :			psrlw mm4, 1 ;
493 :			paddw mm5, mm1 ; sum += mm1
494 :			pxor mm1, mm0 ; re-apply the sign (xor, then sub below)
495 :			paddw mm5, mm4
496 :			pxor mm4, mm3 ;
497 :			psubw mm1, mm0 ; undisplace
498 :			psubw mm4, mm3
499 :			cmp esp, esp ; only changes flags -- presumably scheduling filler
500 :			movq [byte edx + %1*16], mm1
501 :			movq [edx + %1*16+8], mm4
502 :			%endmacro
503 :
504 :			ALIGN 16
505 :			cglobal quant_h263_inter_3dne
506 :			quant_h263_inter_3dne:
			; Stack arguments (cdecl): [esp+4] coeff (output), [esp+8] data (input),
			; [esp+12] quant.  Quantizes 128 bytes of words and returns in eax the
			; sum of the quantized magnitudes accumulated in mm5.
			; Only ebx is saved and restored.
			; NOTE(review): quant indexes 31-row tables -- callers presumably keep
			; it in 1..31.
507 :			mov edx, [esp + 4] ; coeff
508 :			mov ecx, [esp + 8] ; data
509 :			mov eax, [esp + 12] ; quant
510 :			push ebx
511 :
512 :			pxor mm5, mm5 ; sum
513 :			nop
514 :			lea ebx,[mmx_sub + eax * 8 - 8] ; sub
515 :			movq mm7, [mmx_div + eax * 8 - 8] ; divider
516 :
517 :			cmp al, 1
518 :			lea eax, [mmzero] ; eax -> zero qword from here on (lea leaves the flags alone)
519 :			jz near .q1loop ; quant == 1 uses the shift-based path
520 :			cmp esp, esp
521 :			ALIGN 8
			; prime the carried B stream with the last qword (offset 120) and the A
			; stream with the first one
522 :			movq mm3, [ecx + 120] ;B1
523 :			pxor mm4, mm4 ;B2
524 :			psubw mm4, mm3 ;B3
525 :			movq mm0, [ecx] ;A1 mm0 = [1st]
526 :			pmaxsw mm4, mm3 ;B4
527 :			psubusw mm4, [ebx] ;B5
528 :
529 :			quantinter 0
530 :			quantinter 1
531 :			quantinter 2
532 :			quantinter 3
533 :			quantinter 4
534 :
			; finish the B and C streams left pending by "quantinter 4"
535 :			psraw mm3, 15 ;B6
536 :			psubw mm2, mm6 ;C10
537 :			pmulhw mm4, mm7 ;B7
538 :			paddw mm5, mm4 ;B8
539 :			pxor mm4, mm3 ;B9
540 :			psubw mm4, mm3 ;B10
541 :			movq [edx + 4*24+16], mm2 ;C11
542 :			pop ebx
543 :			movq [edx + 4*24+8], mm4 ;B11
			; horizontal add of the four word sums in mm5
544 :			pmaddwd mm5, [plus_one]
545 :			movq mm0, mm5
546 :			punpckhdq mm5, mm5
547 :			paddd mm0, mm5
548 :			movd eax, mm0 ; return sum
549 :
550 :			ret
551 :
552 :			ALIGN 16
553 :	Isibaar	1793	.q1loop:
554 :	edgomez	1382	movq mm6, [byte ebx] ; mm6 = subtract row for quantinter1
555 :
556 :			quantinter1 0
557 :			quantinter1 1
558 :			quantinter1 2
559 :			quantinter1 3
560 :			quantinter1 4
561 :			quantinter1 5
562 :			quantinter1 6
563 :			quantinter1 7
564 :
			; horizontal add of the four word sums in mm5
565 :			pmaddwd mm5, [plus_one]
566 :			movq mm0, mm5
567 :			psrlq mm5, 32
568 :			paddd mm0, mm5
569 :			movd eax, mm0 ; return sum
570 :
571 :			pop ebx
572 :
573 :			ret
574 :	Isibaar	1793	ENDFUNC
575 :	edgomez	1382
576 :			;-----------------------------------------------------------------------------
577 :			;
578 :			; uint32_t dequant_h263_intra_3dne(int16_t *data,
579 :			; const int16_t const *coeff,
580 :			; const uint32_t quant,
581 :			; const uint32_t dcscalar,
582 :			; const uint16_t *mpeg_matrices);
583 :			;
584 :			;-----------------------------------------------------------------------------
585 :
586 :			; this is the same as dequant_inter_3dne, except that we're
587 :			; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
588 :
589 :			;This is Athlon-optimized code (ca 106 clk per call)
590 :
591 :			%macro dequant 1
			; dequant N (N = 0..4) -- one 24-byte step shared by both dequantizers.
			; Per word c:  v = |c| * [edi] + ((c == 0) ? 0 : [eax]) + sign(c),
			;              v = min(v, [ebx]),  result = v xor sign(c)
			; where sign(c) is 0 or -1, so negative inputs come out negated.
			; Register roles, as used below:
			;   ecx = source, edx = destination, esi -> 8 zero bytes,
			;   edi -> multiplier row, eax -> add row, ebx -> upper clamp
			; Three streams (A, B, C) are interleaved; B and C are carried across
			; invocations, so the caller primes B before N == 0 and finishes B and C
			; after N == 4.  For N == 0 the carried-in B stream is the one the caller
			; loaded from offset 120, hence the store to [edx+120].
			; "mov ebp, ebp", "mov esp, esp" and "lea ebp, [byte ebp]" leave every
			; register and the flags unchanged -- presumably alignment filler.
592 :			movq mm1, [ecx+%1*24] ; c = coeff[i] ;A2
593 :			psubw mm0, mm1 ;-c ;A3 (1st dep)
594 :			%if (%1)
595 :			paddw mm4, mm6 ;C11 mm6 free (4th+)
596 :			%endif
597 :			pmaxsw mm0, mm1 ;|c| ;A4 (2nd)
598 :			%if (%1)
599 :			mov ebp, ebp
600 :			pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) later
601 :			%endif
602 :			movq mm6, [esi] ;0 ;A5 mm6 in use
603 :			pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
604 :			%if (%1)
605 :			pxor mm5, mm4 ;C13 (6th+) 1later
606 :			%endif
607 :			movq mm4, [esi] ;C1 ;0
608 :			mov esp, esp
609 :			pcmpeqw mm6, [ecx+%1*24] ;A6 (c ==0) ? -1 : 0 (1st)
610 :			ALIGN 4
611 :			psraw mm1, 15 ; sign(c) ;A7 (2nd)
612 :			%if (%1)
613 :			movq [edx+%1*24+16-24], mm5 ; C14 (7th) 2later
614 :			%endif
615 :			paddw mm7, mm3 ;B10 offset +negate back (3rd)
616 :			pmullw mm0, [edi] ;*= 2Q ;A8 (3rd+)
617 :			paddw mm2, mm7 ;B11 mm7 free (4th+)
618 :			lea ebp, [byte ebp]
619 :			movq mm5, [ecx+%1*24+16] ;C2 ; c = coeff[i]
620 :			psubw mm4, mm5 ;-c ;C3 (1st dep)
621 :			pandn mm6, [eax] ;A9 offset = isZero ? 0 : quant_add (2nd)
622 :			pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
623 :			pxor mm3, mm2 ;B13 (6th+)
624 :			movq mm2, [byte esi] ;B1 ;0
625 :			%if (%1)
626 :			movq [edx+%1*24+8-24], mm3 ;B14 (7th)
627 :			%else
628 :			movq [edx+120], mm3
629 :			%endif
630 :			pmaxsw mm4, mm5 ;|c| ;C4 (2nd)
631 :			paddw mm6, mm1 ;A10 offset +negate back (3rd)
632 :			movq mm3, [ecx+%1*24 + 8] ;B2 ; c = coeff[i]
633 :			psubw mm2, mm3 ;-c ;B3 (1st dep)
634 :			paddw mm0, mm6 ;A11 mm6 free (4th+)
635 :			movq mm6, [byte esi] ;0 ;C5 mm6 in use
636 :			pcmpeqw mm6, [ecx+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st)
637 :			pminsw mm0, [ebx] ;A12 saturates to +2047 (5th+)
638 :			pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
639 :			pxor mm1, mm0 ;A13 (6th+)
640 :			pmullw mm4, [edi] ;*= 2Q ;C8 (3rd+)
641 :			psraw mm5, 15 ; sign(c) ;C7 (2nd)
642 :			movq mm7, [byte esi] ;0 ;B5 mm7 in use
643 :			pcmpeqw mm7, [ecx+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st)
644 :			%if (%1 < 4)
645 :			movq mm0, [byte esi] ;A1 ;0
646 :			%endif
647 :			pandn mm6, [byte eax] ;C9 offset = isZero ? 0 : quant_add (2nd)
648 :			psraw mm3, 15 ;sign(c) ;B7 (2nd)
649 :			movq [byte edx+%1*24], mm1 ;A14 (7th)
650 :			paddw mm6, mm5 ;C10 offset +negate back (3rd)
651 :			pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
652 :			mov esp, esp
653 :			%endmacro
654 :
655 :
656 :			ALIGN 16
657 :			cglobal dequant_h263_intra_3dne
658 :			dequant_h263_intra_3dne:
			; Stack arguments (cdecl): [esp+4] data (output), [esp+8] coeff (input),
			; [esp+12] quant, [esp+16] dcscalar.  Always returns 0 in eax.
			; 128 bytes of coefficients go through five "dequant" steps plus the
			; prologue/epilogue code that primes and finishes the carried streams.
			; The DC term is coeff[0] * dcscalar, clamped to [-2048, 2047] by scalar
			; code interleaved between the steps, and written to data[0] last so it
			; overwrites the vector result for that word.
			; Saved-register frame after the four pushes:
			;   [esp]=esi, +4=ebp, +8=ebx, +12=edi.
659 :			mov ecx, [esp+ 8] ; coeff
660 :			mov eax, [esp+12] ; quant
661 :			pxor mm0, mm0
662 :			pxor mm2, mm2
663 :			push edi
664 :			push ebx
665 :			lea edi, [mmx_mul + eax*8 - 8] ; 2*quant row (8 bytes per row, row 0 is quant == 1)
666 :			push ebp
667 :			mov ebx, mmx_2047
668 :			movsx ebp, word [ecx]
669 :			lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
670 :			push esi
671 :			mov esi, mmzero
672 :			pxor mm7, mm7
673 :			movq mm3, [ecx+120] ;B2 ; c = coeff[i]
674 :			pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
675 :
676 :			imul ebp, [esp+16+16] ; dcscalar
677 :			psubw mm2, mm3 ;-c ;B3 (1st dep)
678 :			pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
679 :			pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
680 :			psraw mm3, 15 ; sign(c) ;B7 (2nd)
681 :			mov edx, [esp+ 4+16] ; data
682 :
683 :			ALIGN 8
684 :			dequant 0
685 :
686 :			cmp ebp, -2048 ; flags survive "dequant 1" (the macro never writes them)
687 :			mov esp, esp
688 :
689 :			dequant 1
690 :
691 :			cmovl ebp, [int_2048] ; DC = max(DC, -2048)
692 :			nop
693 :
694 :			dequant 2
695 :
696 :			cmp ebp, 2047 ; flags survive "dequant 3"
697 :			mov esp, esp
698 :
699 :			dequant 3
700 :
701 :			cmovg ebp, [int2047] ; DC = min(DC, 2047)
702 :			nop
703 :
704 :			dequant 4
705 :
			; finish the B and C streams left pending by "dequant 4" while restoring
			; the saved registers
706 :			paddw mm4, mm6 ;C11 mm6 free (4th+)
707 :			pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
708 :			pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
709 :			mov eax, ebp ; eax = clamped DC
710 :			mov esi, [esp]
711 :			mov ebp, [esp+4]
712 :			pxor mm5, mm4 ;C13 (6th+)
713 :			paddw mm7, mm3 ;B10 offset +negate back (3rd)
714 :			movq [edx+4*24+16], mm5 ;C14 (7th)
715 :			paddw mm2, mm7 ;B11 mm7 free (4th+)
716 :			pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
717 :			mov ebx, [esp+8]
718 :			mov edi, [esp+12]
719 :			add esp, byte 16
720 :			pxor mm3, mm2 ;B13 (6th+)
721 :			movq [edx+4*24+8], mm3 ;B14 (7th)
722 :			mov [edx], ax ; data[0] = DC
723 :
724 :			xor eax, eax
725 :			ret
726 :	Isibaar	1793	ENDFUNC
727 :	edgomez	1382
728 :			;-----------------------------------------------------------------------------
729 :			;
730 :			; uint32_t dequant_h263_inter_3dne(int16_t * data,
731 :			; const int16_t * const coeff,
732 :			; const uint32_t quant,
733 :			; const uint16_t *mpeg_matrices);
734 :			;
735 :			;-----------------------------------------------------------------------------
736 :
737 :			; this is the same as dequant_inter_3dne,
738 :			; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
739 :			; This is Athlon-optimized code (ca 100 clk per call)
740 :
741 :			ALIGN 16
742 :			cglobal dequant_h263_inter_3dne
743 :			dequant_h263_inter_3dne:
			; Stack arguments (cdecl): [esp+4] data (output), [esp+8] coeff (input),
			; [esp+12] quant.  Always returns 0 in eax.
			; 128 bytes of coefficients go through five "dequant" steps plus the
			; prologue/epilogue code that primes and finishes the carried streams.
			; Saved-register frame after the three pushes:
			;   [esp]=esi, +4=ebx, +8=edi.
744 :			mov ecx, [esp+ 8] ; coeff
745 :			mov eax, [esp+12] ; quant
746 :			pxor mm0, mm0
747 :			pxor mm2, mm2
748 :			push edi
749 :			push ebx
750 :			push esi
751 :			lea edi, [mmx_mul + eax*8 - 8] ; 2*quant row (8 bytes per row, row 0 is quant == 1)
752 :			mov ebx, mmx_2047
753 :			pxor mm7, mm7
754 :			movq mm3, [ecx+120] ;B2 ; c = coeff[i]
755 :			pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
756 :			lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
757 :			psubw mm2, mm3 ;-c ;B3 (1st dep)
758 :			mov esi, mmzero
759 :			pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
760 :			pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
761 :			psraw mm3, 15 ; sign(c) ;B7 (2nd)
762 :	suxen_drol	1787	mov edx, [esp+ 4+12] ; data
763 :	edgomez	1382
764 :			ALIGN 8
765 :
766 :			dequant 0
767 :			dequant 1
768 :			dequant 2
769 :			dequant 3
770 :			dequant 4
771 :
			; finish the B and C streams left pending by "dequant 4" while restoring
			; the saved registers
772 :			paddw mm4, mm6 ;C11 mm6 free (4th+)
773 :			pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
774 :			pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
775 :			mov esi, [esp]
776 :			pxor mm5, mm4 ;C13 (6th+)
777 :			paddw mm7, mm3 ;B10 offset +negate back (3rd)
778 :			movq [edx+4*24+16], mm5 ;C14 (7th)
779 :			paddw mm2, mm7 ;B11 mm7 free (4th+)
780 :			pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
781 :			mov ebx, [esp+4]
782 :			mov edi, [esp+8]
783 :			add esp, byte 12
784 :			pxor mm3, mm2 ;B13 (6th+)
785 :			movq [edx+4*24+8], mm3 ;B14 (7th)
786 :
787 :			xor eax, eax
788 :			ret
789 :	Isibaar	1793	ENDFUNC
790 :	edgomez	1540
791 :	Isibaar	1790
			; For ELF output only, emit an empty, non-executable .note.GNU-stack
			; section (by convention this tells the linker that the object does not
			; need an executable stack).
792 :			%ifidn __OUTPUT_FORMAT__,elf
793 :			section ".note.GNU-stack" noalloc noexec nowrite progbits
794 :			%endif
795 :

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4