Annotation of /tags/branch-release-1-0/xvidcore/src/quant/x86_asm/quantize_3dne.asm

Revision 908 - (view) (download)

1 :	edgomez	851	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * mmx quantization/dequantization
5 :			; *
6 :			; * This program is an implementation of a part of one or more MPEG-4
7 :			; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
8 :			; * to use this software module in hardware or software products are
9 :			; * advised that its use may infringe existing patents or copyrights, and
10 :			; * any such use would be at such party's own risk. The original
11 :			; * developer of this software module and his/her company, and subsequent
12 :			; * editors and their companies, will have no liability for use of this
13 :			; * software or modifications or derivatives thereof.
14 :			; *
15 :			; * This program is free software; you can redistribute it and/or modify
16 :			; * it under the terms of the GNU General Public License as published by
17 :			; * the Free Software Foundation; either version 2 of the License, or
18 :			; * (at your option) any later version.
19 :			; *
20 :			; * This program is distributed in the hope that it will be useful,
21 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 :			; * GNU General Public License for more details.
24 :			; *
25 :			; * You should have received a copy of the GNU General Public License
26 :			; * along with this program; if not, write to the Free Software
27 :			; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28 :			; *
29 :			; *************************************************************************/
30 :			; these 3dne functions are compatible with iSSE, but are optimized specifically for
31 :			; K7 pipelines
32 :			;
33 :			;------------------------------------------------------------------------------
34 :			; 09.12.2002 Athlon optimizations contributed by Jaan Kalda
35 :			;------------------------------------------------------------------------------
36 :
37 :			; enable dequant saturate [-2048,2047], test purposes only.
38 :			%define SATURATE
39 :
40 :			; data/text alignment
41 :			%define ALIGN 16
42 :
43 :			bits 32
44 :
45 :			%ifdef FORMAT_COFF
46 :			section .data data
47 :			%else
48 :			section .data data align=16
49 :			%endif
50 :
51 :			%macro cglobal 1 ; export symbol %1; when PREFIX is defined, export _%1 and alias %1 to it
52 :			%ifdef PREFIX
53 :			global _%1
54 :			%define %1 _%1
55 :			%else
56 :			global %1
57 :			%endif
58 :			%endmacro
59 :			align 4
60 :			int_div ; 256 dwords: entry i (1..255) = (1<<16)/i + 1, so (x * int_div[i]) >> 16 approximates x / i
61 :			dd 0 ; entry 0 is a zero placeholder
62 :			%assign i 1
63 :			%rep 255
64 :			dd (1 << 16) / ( i) + 1
65 :			%assign i i+1
66 :			%endrep
67 :
68 :			align 16
69 :
70 :			plus_one times 8 dw 1 ; words of 1; quant_inter_3dne uses it with pmaddwd to add adjacent words of the running sum
71 :
72 :			;===========================================================================
73 :			;
74 :			; subtract by Q/2 table
75 :			;
76 :			;===========================================================================
77 :
78 :			%macro MMX_SUB 1 ; NOTE: not invoked in the code visible here; the table below is generated inline
79 :			times 4 dw %1 / 2
80 :			%endmacro
81 :
82 :
83 :			align 16
84 :			mmx_sub ; 31 rows for Q = 1..31, each four words of Q/2; the row for quant q is at mmx_sub + q*8 - 8
85 :			%assign i 1
86 :			%rep 31
87 :			times 4 dw i / 2
88 :			%assign i i+1
89 :			%endrep
90 :
91 :
92 :			;===========================================================================
93 :			;
94 :			; divide by 2Q table
95 :			;
96 :			; use a shift of 16 to take full advantage of _pmulhw_
97 :			; for q=1, _pmulhw_ will overflow so it is treated separately
98 :			; (3dnow2 provides _pmulhuw_ which won't cause overflow)
99 :			;
100 :			;===========================================================================
101 :
102 :			align 16
103 :			mmx_div ; 31 rows for Q = 1..31, each four words of (1<<16)/(2Q) + 1; the row for quant q is at mmx_div + q*8 - 8
104 :
105 :			%assign i 1
106 :			%rep 31
107 :			times 4 dw (1 << 16) / (i * 2) + 1
108 :			%assign i i+1
109 :			%endrep
110 :
111 :			;===========================================================================
112 :			;
113 :			; add by (odd(Q) ? Q : Q - 1) table
114 :			;
115 :			;===========================================================================
116 :
117 :			%macro MMX_ADD 1 ; emit one table row: four words of %1 when %1 is odd, else four words of %1 - 1
118 :			%if %1 % 2 != 0
119 :			times 4 dw %1
120 :			%else
121 :			times 4 dw %1 - 1
122 :			%endif
123 :			%endmacro
124 :
125 :			align 16
126 :			mmx_add ; 31 rows for Q = 1..31; the row for quant q is at mmx_add + q*8 - 8
127 :
128 :			%assign i 1
129 :			%rep 31
130 :			MMX_ADD i
131 :			%assign i i+1
132 :			%endrep
133 :
134 :			;===========================================================================
135 :			;
136 :			; multiply by 2Q table
137 :			;
138 :			;===========================================================================
139 :
140 :			%macro MMX_MUL 1 ; NOTE: not invoked in the code visible here; the table below is generated inline
141 :			times 4 dw %1 * 2
142 :			%endmacro
143 :
144 :			align 16
145 :			mmx_mul ; 31 rows for Q = 1..31, each four words of 2Q; the row for quant q is at mmx_mul + q*8 - 8
146 :
147 :			%assign i 1
148 :			%rep 31
149 :			times 4 dw i * 2
150 :			%assign i i+1
151 :			%endrep
152 :
153 :			;===========================================================================
154 :			;
155 :			; saturation limits
156 :			;
157 :			;===========================================================================
158 :
159 :			align 8
160 :			mmx_32768_minus_2048 times 4 dw (32768-2048) ; not referenced by the code visible here
161 :			mmx_32767_minus_2047 times 4 dw (32767-2047) ; not referenced by the code visible here
162 :
163 :			align 16
164 :			mmx_2047 times 4 dw 2047 ; upper clamp; the dequant code applies it with pminsw
165 :
166 :			align 8
167 :			mmzero dd 0, 0 ; 8 zero bytes, loaded wherever a zeroed MMX register is needed
168 :			int2047 dd 2047 ; upper DC clamp, read by cmovg in dequant_intra_3dne
169 :			int_2048 dd -2048 ; lower DC clamp, read by cmovl in dequant_intra_3dne
170 :
171 :			section .text
172 :
173 :
174 :			;===========================================================================
175 :			;
176 :			; void quant_intra_3dne(int16_t * coeff,
177 :			; const int16_t const * data,
178 :			; const uint32_t quant,
179 :			; const uint32_t dcscalar);
180 :			;
181 :			;===========================================================================
182 :			;This is Athlon-optimized code (ca 70 clk per call)
183 :			;Optimized by Jaan, 30 Nov 2002
184 :
185 :			%macro quant_intra1 1 ; quant==1 path: 32-byte slice %1 (0..3) as qwords A-D, out = sign(x) * (|x| >> 1); expects ecx=data, edx=coeff, ebx->mmzero
186 :			psubw mm1,mm0 ;A3 mm1 = 0 - x
187 :			psubw mm3,mm2 ;B3
188 :			%if (%1)
189 :			psubw mm5, mm4 ;C8 finish C/D of the previous slice
190 :			psubw mm7, mm6 ;D8
191 :			%endif
192 :
193 :			align 8
194 :			db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;hand-encoded (SIB + disp8) movq mm4, [ecx + %1 * 32 +16] ;C1
195 :			pmaxsw mm1,mm0 ;A4 mm1 = max(-x, x) = |x|
196 :			db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24);hand-encoded (SIB + disp8) movq mm6, [ecx + %1 * 32 +24] ;D1
197 :			pmaxsw mm3,mm2 ;B4
198 :
199 :
200 :			psraw mm0,15 ;A5 mm0 = sign mask of x (0 or -1 per word)
201 :			psraw mm2,15 ;B5
202 :			%if (%1)
203 :			movq [edx + %1 * 32 + 16-32], mm5 ;C9 store C/D of the previous slice
204 :			movq [edx + %1 * 32 + 24-32], mm7 ;D9
205 :			%endif
206 :
207 :			psrlw mm1, 1 ;A6 |x| >> 1 (division by 2Q with Q = 1)
208 :			psrlw mm3, 1 ;B6
209 :			movq mm5, [ebx] ;C2 mm5 = 0
210 :			movq mm7, [ebx] ;D2 mm7 = 0
211 :
212 :			pxor mm1, mm0 ;A7 with A8: (v ^ s) - s negates v where x was negative
213 :			pxor mm3, mm2 ;B7
214 :
215 :			psubw mm5,mm4 ;C3
216 :			psubw mm7,mm6 ;D3
217 :			psubw mm1, mm0 ;A8
218 :			psubw mm3, mm2 ;B8
219 :
220 :			%if (%1 == 0)
221 :			push ebp ; ebp is saved here; the calling function reloads it from [esp] before returning
222 :			movq mm0, [ecx + %1 * 32 +32] ;A1 of the next slice
223 :			%elif (%1 < 3)
224 :			db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;hand-encoded movq mm0, [ecx + %1 * 32 +32] ;A1 of the next slice
225 :			%endif
226 :			pmaxsw mm5,mm4 ;C4
227 :			%if (%1 < 3)
228 :			db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;hand-encoded movq mm2, [ecx + %1 * 32 +8+32] ;B1 of the next slice
229 :			%else
230 :			cmp esp,esp ; only changes flags; presumably filler standing in for the omitted load
231 :			%endif
232 :			pmaxsw mm7,mm6 ;D4
233 :
234 :			psraw mm4,15 ;C5
235 :			psraw mm6,15 ;D5
236 :			movq [byte edx + %1 * 32], mm1 ;A9 ('byte' forces a disp8 encoding)
237 :			movq [edx + %1 * 32+8], mm3 ;B9
238 :
239 :
240 :			psrlw mm5, 1 ;C6
241 :			psrlw mm7, 1 ;D6
242 :			%if (%1 < 3)
243 :			movq mm1, [ebx] ;A2 mm1 = 0 for the next slice
244 :			movq mm3, [ebx] ;B2 mm3 = 0 for the next slice
245 :			%endif
246 :			%if (%1 == 3)
247 :			imul eax,[int_div+4*edi] ; DC term: eax *= int_div[edi] (eax and edi are prepared by the calling function)
248 :			%endif
249 :			pxor mm5, mm4 ;C7 (C8/D8 and the stores are done by the next slice or by the caller)
250 :			pxor mm7, mm6 ;D7
251 :			%endm
252 :
253 :
254 :			%macro quant_intra 1 ;rules for athlon: 1) schedule latencies, 2) add/mul and load/store in 2:1 proportion,
255 :			; 3) avoid splitting >3byte instructions over 8byte boundaries. Slice %1 (0..3): out = sign(x) * ((|x| * [esi]) >> 16); expects ecx=data, edx=coeff, ebx->mmzero, esi->mmx_div row
256 :			psubw mm1,mm0 ;A3 mm1 = 0 - x
257 :			psubw mm3,mm2 ;B3
258 :			%if (%1)
259 :			psubw mm5, mm4 ;C8 finish C/D of the previous slice
260 :			psubw mm7, mm6 ;D8
261 :			%endif
262 :
263 :			align 8
264 :			db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;hand-encoded (SIB + disp8) movq mm4, [ecx + %1 * 32 +16] ;C1
265 :			pmaxsw mm1,mm0 ;A4 mm1 = max(-x, x) = |x|
266 :			db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24);hand-encoded (SIB + disp8) movq mm6, [ecx + %1 * 32 +24] ;D1
267 :			pmaxsw mm3,mm2 ;B4
268 :
269 :
270 :			psraw mm0,15 ;A5 mm0 = sign mask of x (0 or -1 per word)
271 :			psraw mm2,15 ;B5
272 :			%if (%1)
273 :			movq [edx + %1 * 32 + 16-32], mm5 ;C9 store C/D of the previous slice
274 :			movq [edx + %1 * 32 + 24-32], mm7 ;D9
275 :			%endif
276 :
277 :			pmulhw mm1, [esi] ;A6 high word of |x| * divider, i.e. |x| / 2Q
278 :			pmulhw mm3, [esi] ;B6
279 :			movq mm5, [ebx] ;C2 mm5 = 0
280 :			movq mm7, [ebx] ;D2 mm7 = 0
281 :
282 :			nop ; filler
283 :			nop ; filler
284 :			pxor mm1, mm0 ;A7 with A8: (v ^ s) - s negates v where x was negative
285 :			pxor mm3, mm2 ;B7
286 :
287 :			psubw mm5,mm4 ;C3
288 :			psubw mm7,mm6 ;D3
289 :			psubw mm1, mm0 ;A8
290 :			psubw mm3, mm2 ;B8
291 :
292 :
293 :			%if (%1 < 3)
294 :			db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;hand-encoded movq mm0, [ecx + %1 * 32 +32] ;A1 of the next slice
295 :			%endif
296 :			pmaxsw mm5,mm4 ;C4
297 :			%if (%1 < 3)
298 :			db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;hand-encoded movq mm2, [ecx + %1 * 32 +8+32] ;B1 of the next slice
299 :			%else
300 :			cmp esp,esp ; only changes flags; presumably filler standing in for the omitted load
301 :			%endif
302 :			pmaxsw mm7,mm6 ;D4
303 :
304 :			psraw mm4,15 ;C5
305 :			psraw mm6,15 ;D5
306 :			movq [byte edx + %1 * 32], mm1 ;A9 ('byte' forces a disp8 encoding)
307 :			movq [edx + %1 * 32+8], mm3 ;B9
308 :
309 :
310 :			pmulhw mm5, [esi] ;C6
311 :			pmulhw mm7, [esi] ;D6
312 :			%if (%1 < 3)
313 :			movq mm1, [ebx] ;A2 mm1 = 0 for the next slice
314 :			movq mm3, [ebx] ;B2 mm3 = 0 for the next slice
315 :			%endif
316 :			%if (%1 == 0)
317 :			push ebp ; ebp is saved here; the calling function reloads it from [esp] before returning
318 :			%elif (%1 < 3)
319 :			nop ; filler
320 :			%endif
321 :			nop ; filler
322 :			%if (%1 == 3)
323 :			imul eax,[int_div+4*edi] ; DC term: eax *= int_div[edi] (eax and edi are prepared by the calling function)
324 :			%endif
325 :			pxor mm5, mm4 ;C7 (C8/D8 and the stores are done by the next slice or by the caller)
326 :			pxor mm7, mm6 ;D7
327 :			%endmacro
328 :
329 :
330 :			align ALIGN
331 :			cglobal quant_intra_3dne
332 :			quant_intra_3dne: ; stack args: coeff, data, quant, dcscalar; saves/restores esi, ebx, edi, ebp; uses eax, ecx, edx, mm0-mm7
333 :
334 :			mov eax, [esp + 12] ; quant
335 :			mov ecx, [esp + 8] ; data
336 :			mov edx, [esp + 4] ; coeff
337 :			cmp al, 1 ; quant == 1? flags are consumed by the jz below (nothing in between modifies them)
338 :			pxor mm1,mm1 ; mm1 = mm3 = 0, the A2/B2 state expected by the first slice
339 :			pxor mm3,mm3
340 :			movq mm0, [ecx ] ; mm0 = [1st]
341 :			movq mm2, [ecx +8] ; mm2 = second qword (B1 of slice 0)
342 :			push esi
343 :			lea esi, [mmx_div + eax * 8 - 8] ; esi -> mmx_div row for this quant
344 :
345 :			push ebx
346 :			mov ebx,mmzero ; ebx -> zero qword
347 :			push edi
348 :			jz near .q1loop ; quant == 1 takes the shift-based path
349 :			quant_intra 0
350 :			mov ebp, [esp + 16 + 16] ; dcscalar (esi, ebx, edi and ebp pushed = 16 bytes; slice 0 pushed ebp)
351 :			movsx eax, word [byte ecx] ;x = data[0], sign-extended
352 :			quant_intra 1
353 :			mov edi,eax
354 :			sar edi,31 ;sign(x)
355 :			shr ebp,byte 1 ; ebp = dcscalar /2
356 :			quant_intra 2
357 :			sub eax,edi ; x (+1)
358 :			xor ebp,edi ;sign(x) dcscalar /2 (-1)
359 :			mov edi,[esp + 16 + 16] ; edi = dcscalar, used as the int_div index in slice 3
360 :			lea eax,[byte eax+ebp] ;x + sign(x) dcscalar /2
361 :			mov ebp,[byte esp] ; restore ebp
362 :			quant_intra 3
363 :			psubw mm5, mm4 ;C8
364 :			mov esi,[esp+12] ; restore the remaining saved registers
365 :			mov edi,[esp+4]
366 :			mov ebx,[esp+8]
367 :			add esp,byte 16 ; drop the four saved slots
368 :			sar eax,16 ; keep the integer part of the reciprocal multiply done in slice 3
369 :			mov [edx], ax ; coeff[0] = ax
370 :			psubw mm7, mm6 ;D8
371 :			movq [edx + 3 * 32 + 16], mm5 ;C9
372 :			movq [edx + 3 * 32 + 24], mm7 ;D9
373 :			ret
374 :			align 16
375 :			.q1loop ; quant == 1: same flow as above using quant_intra1 (shift instead of pmulhw)
376 :			quant_intra1 0
377 :			mov ebp, [esp + 16 + 16] ; dcscalar
378 :			movsx eax, word [byte ecx] ;x = data[0], sign-extended
379 :			quant_intra1 1
380 :			mov edi,eax
381 :			sar edi,31 ;sign(x)
382 :			shr ebp,byte 1 ; ebp = dcscalar /2
383 :			quant_intra1 2
384 :			sub eax,edi ; x (+1)
385 :			xor ebp,edi ;sign(x) dcscalar /2 (-1)
386 :			mov edi,[esp + 16 + 16] ; edi = dcscalar, used as the int_div index in slice 3
387 :			lea eax,[byte eax+ebp] ;x + sign(x) dcscalar /2
388 :			mov ebp,[byte esp] ; restore ebp
389 :			quant_intra1 3
390 :			psubw mm5, mm4 ;C8
391 :			mov esi,[dword esp+12] ; restore saved registers ('dword' forces a 32-bit displacement encoding)
392 :			mov edi,[esp+4]
393 :			mov ebx,[esp+8]
394 :			add esp,byte 16 ; drop the four saved slots
395 :			sar eax,16 ; keep the integer part of the reciprocal multiply done in slice 3
396 :			mov [edx], ax ; coeff[0] = ax
397 :			psubw mm7, mm6 ;D8
398 :			movq [edx + 3 * 32 + 16], mm5 ;C9
399 :			movq [edx + 3 * 32 + 24], mm7 ;D9
400 :			ret
401 :
402 :
403 :
404 :
405 :			;===========================================================================
406 :			;
407 :			; uint32_t quant_inter_3dne(int16_t * coeff,
408 :			; const int16_t const * data,
409 :			; const uint32_t quant);
410 :			;
411 :			;===========================================================================
412 :			;This is Athlon-optimized code (ca 90 clk per call)
413 :			;Optimized by Jaan, 30 Nov 2002
414 :
415 :
416 :			%macro quantinter 1 ; 24-byte step %1 (qwords A,B,C at %1*24, +8, +16); expects ecx=data, edx=coeff, eax->mmzero, ebx->mmx_sub row, mm7=divider, mm5=running sum
417 :			movq mm1, [eax] ;A2 mm1 = 0
418 :			psraw mm3,15 ;B6 sign mask of the B qword loaded earlier
419 :			%if (%1)
420 :			psubw mm2, mm6 ;C10 finish C of the previous step
421 :			%endif
422 :			psubw mm1,mm0 ;A3 mm1 = 0 - x
423 :			pmulhw mm4, mm7 ; B7
424 :			movq mm6, [ecx + %1*24+16] ; C1
425 :			pmaxsw mm1,mm0 ;A4 mm1 = |x|
426 :			paddw mm5, mm4 ;B8 sum += quantized magnitude
427 :			%if (%1)
428 :			movq [edx + %1*24+16-24], mm2 ;C11 store C of the previous step
429 :			%endif
430 :			psubusw mm1, [ebx] ; A5 mm1 -= sub (unsigned, dont go < 0)
431 :			pxor mm4, mm3 ;B9
432 :			movq mm2, [eax] ;C2 mm2 = 0
433 :			psraw mm0,15 ;A6 sign mask
434 :			psubw mm4, mm3 ;B10
435 :			psubw mm2,mm6 ;C3
436 :			pmulhw mm1, mm7 ; A7 mm1 = (mm1 * divider) >> 16, i.e. / 2Q
437 :			movq mm3, [ecx + %1*24+8] ; B1
438 :			pmaxsw mm2,mm6 ;C4
439 :			paddw mm5, mm1 ; A8 sum += mm1
440 :			%if (%1)
441 :			movq [edx + %1*24+8-24], mm4 ;B11 store B of the previous step
442 :			%else
443 :			movq [edx + 120], mm4 ;B11 step 0 stores the qword the caller preloaded from [ecx + 120]
444 :			%endif
445 :			psubusw mm2, [ebx] ;C5
446 :			pxor mm1, mm0 ; A9 with A10: restore the sign of x
447 :			movq mm4, [eax] ;B2 mm4 = 0
448 :			psraw mm6,15 ;C6
449 :			psubw mm1, mm0 ;A10 undisplace
450 :			psubw mm4,mm3 ;B3
451 :			pmulhw mm2, mm7 ; C7
452 :			movq mm0, [ecx + %1*24+24] ;A1 mm0 = A qword of the next step
453 :			pmaxsw mm4,mm3 ;B4
454 :			paddw mm5, mm2 ;C8
455 :			movq [byte edx + %1*24], mm1 ;A11
456 :			psubusw mm4, [ebx] ;B5
457 :			pxor mm2, mm6 ;C9 (C10/C11 and B6-B11 are completed by the next step or by the caller)
458 :			%endmacro
459 :
460 :			%macro quantinter1 1 ; quant==1 path: 16-byte step %1 (two qwords); expects ecx=data, edx=coeff, eax->mmzero, mm6=sub value, mm5=running sum
461 :			movq mm0, [byte ecx + %1*16] ; mm0 = [1st]
462 :			movq mm3, [ecx + %1*16+8] ; mm3 = [2nd]
463 :			movq mm1, [eax] ; mm1 = 0
464 :			movq mm4, [eax] ; mm4 = 0
465 :			psubw mm1,mm0 ; mm1 = 0 - x
466 :			psubw mm4,mm3
467 :			pmaxsw mm1,mm0 ; mm1 = |x|
468 :			pmaxsw mm4,mm3
469 :			psubusw mm1, mm6 ; mm1 -= sub (unsigned, dont go < 0)
470 :			psubusw mm4, mm6 ;
471 :			psraw mm0,15 ; sign mask
472 :			psraw mm3,15
473 :			psrlw mm1, 1 ; mm1 >>= 1 (division by 2Q with Q = 1)
474 :			psrlw mm4, 1 ;
475 :			paddw mm5, mm1 ; sum += mm1
476 :			pxor mm1, mm0 ; with the psubw below: restore the sign of x
477 :			paddw mm5, mm4
478 :			pxor mm4, mm3 ;
479 :			psubw mm1, mm0 ; undisplace
480 :			psubw mm4, mm3
481 :			cmp esp,esp ; only changes flags; presumably filler
482 :			movq [byte edx + %1*16], mm1
483 :			movq [edx + %1*16+8], mm4
484 :			%endmacro
485 :
486 :			align ALIGN
487 :			cglobal quant_inter_3dne
488 :			quant_inter_3dne ; stack args: coeff, data, quant; returns in eax the sum of the quantized magnitudes; saves/restores ebx
489 :
490 :			mov edx, [esp + 4] ; coeff
491 :			mov ecx, [esp + 8] ; data
492 :			mov eax, [esp + 12] ; quant
493 :			push ebx
494 :
495 :			pxor mm5, mm5 ; sum
496 :			nop ; filler
497 :			lea ebx,[mmx_sub + eax * 8 - 8] ; sub
498 :			movq mm7, [mmx_div + eax * 8 - 8] ; divider
499 :
500 :			cmp al, 1 ; quant == 1? (lea below does not modify flags)
501 :			lea eax,[mmzero] ; eax -> zero qword from here on
502 :			jz near .q1loop
503 :			cmp esp,esp ; only changes flags; presumably filler
504 :			align 8
505 :			movq mm3, [ecx + 120] ; B1 prime the pipeline with the last qword (offset 120)
506 :			pxor mm4,mm4 ;B2
507 :			psubw mm4,mm3 ;B3
508 :			movq mm0, [ecx] ;A1 mm0 = [1st]
509 :			pmaxsw mm4,mm3 ;B4
510 :			psubusw mm4, [ebx] ;B5
511 :
512 :			quantinter 0
513 :			quantinter 1
514 :			quantinter 2
515 :			quantinter 3
516 :			quantinter 4
517 :			psraw mm3,15 ;B6 drain the stages left pending by step 4
518 :			psubw mm2, mm6 ;C10
519 :			pmulhw mm4, mm7 ; B7
520 :			paddw mm5, mm4 ;B8
521 :			pxor mm4, mm3 ;B9
522 :			psubw mm4, mm3 ;B10
523 :			movq [edx + 4*24+16], mm2 ;C11
524 :			pop ebx
525 :			movq [edx + 4*24+8], mm4 ;B11
526 :			pmaddwd mm5, [plus_one] ; add adjacent words: four word sums -> two dword sums
527 :			movq mm0, mm5
528 :			punpckhdq mm5, mm5 ; replicate the high dword into the low dword
529 :			paddd mm0, mm5 ; low dword = total
530 :			movd eax, mm0 ; return sum
531 :			ret
532 :
533 :			align ALIGN
534 :			.q1loop ; quant == 1: eight 16-byte steps using shifts instead of pmulhw
535 :			movq mm6,[byte ebx] ; mm6 = sub value for this quant
536 :			quantinter1 0
537 :			quantinter1 1
538 :			quantinter1 2
539 :			quantinter1 3
540 :			quantinter1 4
541 :			quantinter1 5
542 :			quantinter1 6
543 :			quantinter1 7
544 :
545 :			pmaddwd mm5, [plus_one] ; add adjacent words: four word sums -> two dword sums
546 :			movq mm0, mm5
547 :			psrlq mm5, 32 ; move the high dword down
548 :			paddd mm0, mm5 ; low dword = total
549 :			movd eax, mm0 ; return sum
550 :
551 :			pop ebx
552 :
553 :			ret
554 :
555 :			;===========================================================================
556 :			;
557 :			; void dequant_intra_3dne(int16_t *data,
558 :			; const int16_t const *coeff,
559 :			; const uint32_t quant,
560 :			; const uint32_t dcscalar);
561 :			;
562 :			;===========================================================================
563 :
564 :			; this is the same as dequant_inter_3dne, except that we're
565 :			; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
566 :
567 :			;This is Athlon-optimized code (ca 106 clk per call)
568 :
569 :			%macro dequant 1 ; 24-byte step %1 (qwords A,B,C at %1*24, +8, +16): out = sign(c) * (|c|*2Q + add), 0 for c == 0, limited to [-2048, 2047]; expects ecx=coeff, edx=data, esi->mmzero, edi->mmx_mul row, eax->mmx_add row, ebx->mmx_2047
570 :			movq mm1, [ecx+%1*24] ;A2 ; c = coeff[i]
571 :			psubw mm0,mm1 ;-c ;A3 (1st dep)
572 :			%if (%1)
573 :			paddw mm4,mm6 ; C11 mm6 free (4th+)
574 :			%endif
575 :			pmaxsw mm0,mm1 ;|c| ;A4 (2nd)
576 :			%if (%1)
577 :			mov ebp,ebp ; no-op filler
578 :			pminsw mm4,[ebx] ; C12 saturates to +2047 (5th+) later
579 :			%endif
580 :			movq mm6,[esi] ;0 ;A5 mm6 in use
581 :			pandn mm7,[eax] ; B9 offset = isZero ? 0 : quant_add (2nd)
582 :			%if (%1)
583 :			pxor mm5, mm4 ; C13 (6th+) 1 later
584 :			%endif
585 :			movq mm4,[esi] ; C1 ;0
586 :			mov esp,esp ; no-op filler
587 :			pcmpeqw mm6, [ecx+%1*24] ;A6 (c ==0) ? -1 : 0 (1st)
588 :			align 4
589 :			psraw mm1,15 ; sign(c) ;A7 (2nd)
590 :			%if (%1)
591 :			movq [edx+%1*24+16-24], mm5 ; C14 (7th) 2 later; stores C of the previous step
592 :			%endif
593 :			paddw mm7,mm3 ; B10 offset +negate back (3rd)
594 :			pmullw mm0, [edi] ;*= 2Q ;A8 (3rd+)
595 :			paddw mm2,mm7 ; B11 mm7 free (4th+)
596 :			lea ebp,[byte ebp] ; no-op filler
597 :			movq mm5, [ecx+%1*24+16] ;C2 ; c = coeff[i]
598 :			psubw mm4,mm5 ;-c ;C3 (1st dep)
599 :			pandn mm6,[eax] ; A9 offset = isZero ? 0 : quant_add (2nd)
600 :			pminsw mm2,[ebx] ; B12 saturates to +2047 (5th+)
601 :			pxor mm3, mm2 ; B13 (6th+) applies the sign: (v + s) ^ s
602 :			movq mm2,[byte esi] ; B1 ;0
603 :			%if (%1)
604 :			movq [edx+%1*24+8-24], mm3 ; B14 (7th) stores B of the previous step
605 :			%else
606 :			movq [edx+120], mm3 ; B14: step 0 stores the qword the caller preloaded from [ecx+120]
607 :			%endif
608 :			pmaxsw mm4,mm5 ;|c| ;C4 (2nd)
609 :			paddw mm6,mm1 ; A10 offset +negate back (3rd)
610 :			movq mm3, [ecx+%1*24 + 8] ;B2 ; c = coeff[i]
611 :			psubw mm2,mm3 ;-c ;B3 (1st dep)
612 :			paddw mm0,mm6 ; A11 mm6 free (4th+)
613 :			movq mm6,[byte esi] ;0 ;C5 mm6 in use
614 :			pcmpeqw mm6, [ecx+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st)
615 :			pminsw mm0,[ebx] ; A12 saturates to +2047 (5th+)
616 :			pmaxsw mm2,mm3 ;|c| ;B4 (2nd)
617 :			pxor mm1, mm0 ; A13 (6th+)
618 :			pmullw mm4, [edi] ;*= 2Q ;C8 (3rd+)
619 :			psraw mm5,15 ; sign(c) ;C7 (2nd)
620 :			movq mm7,[byte esi] ;0 ;B5 mm7 in use
621 :			pcmpeqw mm7, [ecx+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st)
622 :			%if (%1 < 4)
623 :			movq mm0,[byte esi] ; A1 ;0 (for the next step)
624 :			%endif
625 :			pandn mm6,[byte eax] ; C9 offset = isZero ? 0 : quant_add (2nd)
626 :			psraw mm3,15 ; sign(c) ;B7 (2nd)
627 :			movq [byte edx+%1*24], mm1 ; A14 (7th)
628 :			paddw mm6,mm5 ; C10 offset +negate back (3rd)
629 :			pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
630 :			mov esp,esp ; no-op filler (B9-B14 and C11-C14 are completed by the next step or by the caller)
631 :			%endmacro
632 :
633 :
634 :			align ALIGN
635 :			cglobal dequant_intra_3dne
636 :			dequant_intra_3dne: ; stack args: data, coeff, quant, dcscalar; saves/restores edi, ebx, ebp, esi; uses eax, ecx, edx, mm0-mm7
637 :			mov ecx, [esp+ 8] ; coeff
638 :			mov eax, [esp+12] ; quant
639 :			pxor mm0,mm0 ; mm0 = mm2 = 0, the A1/B1 state expected by the first dequant step
640 :			pxor mm2,mm2
641 :			push edi
642 :			push ebx
643 :			lea edi,[mmx_mul + eax*8 - 8] ; 2*quant (row of mmx_mul for this quant; was 'eax8', an undefined symbol)
644 :			push ebp
645 :			mov ebx,mmx_2047 ; ebx -> upper clamp used by pminsw
646 :			movsx ebp,word [ecx] ; ebp = coeff[0] (DC), sign-extended
647 :			lea eax,[mmx_add + eax*8 - 8] ; quant or quant-1
648 :			push esi
649 :			mov esi,mmzero ; esi -> zero qword
650 :			pxor mm7,mm7
651 :			movq mm3, [ecx+120] ;B2 ; c = coeff[i]
652 :			pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
653 :
654 :			imul ebp,[esp+16+16] ; dcscalar: ebp = coeff[0] * dcscalar (four pushes = 16 bytes)
655 :			psubw mm2,mm3 ;-c ;B3 (1st dep)
656 :			pmaxsw mm2,mm3 ;|c| ;B4 (2nd)
657 :			pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
658 :			psraw mm3,15 ; sign(c) ;B7 (2nd)
659 :			mov edx, [esp+ 4+16] ; data
660 :			align 8
661 :			dequant 0
662 :			cmp ebp,-2048 ; flags stay live across the next dequant step (it contains no flag-writing instructions)
663 :			mov esp,esp ; no-op filler
664 :			dequant 1
665 :			cmovl ebp,[int_2048] ; clamp DC below at -2048
666 :			nop
667 :			dequant 2
668 :			cmp ebp,2047 ; flags stay live across the next dequant step
669 :			mov esp,esp ; no-op filler
670 :			dequant 3
671 :			cmovg ebp,[int2047] ; clamp DC above at 2047
672 :			nop
673 :			dequant 4
674 :
675 :			paddw mm4,mm6 ; C11 mm6 free (4th+)
676 :			pminsw mm4,[ebx] ; C12 saturates to +2047 (5th+)
677 :			pandn mm7,[eax] ; B9 offset = isZero ? 0 : quant_add (2nd)
678 :			mov eax,ebp ; eax = clamped DC value
679 :			mov esi,[esp] ; restore saved registers
680 :			mov ebp,[esp+4]
681 :			pxor mm5, mm4 ; C13 (6th+)
682 :			paddw mm7,mm3 ; B10 offset +negate back (3rd)
683 :			movq [edx+4*24+16], mm5 ; C14 (7th)
684 :			paddw mm2,mm7 ; B11 mm7 free (4th+)
685 :			pminsw mm2,[ebx] ; B12 saturates to +2047 (5th+)
686 :			mov ebx,[esp+8]
687 :			mov edi,[esp+12]
688 :			add esp,byte 16 ; drop the four saved slots
689 :			pxor mm3, mm2 ; B13 (6th+)
690 :			movq [edx+4*24+8], mm3 ; B14 (7th)
691 :			mov [edx], ax ; data[0] = DC, overwriting the value stored by the vector path
692 :			ret
693 :
694 :			;===========================================================================
695 :			;
696 :			; void dequant_inter_3dne(int16_t * data,
697 :			; const int16_t * const coeff,
698 :			; const uint32_t quant);
699 :			;
700 :			;===========================================================================
701 :
702 :			; this uses the same 'dequant' macro as dequant_intra_3dne (without the DC handling);
703 :			; saturation is done with 'pminsw' (saves 2 cycles/loop)
704 :			;This is Athlon-optimized code (ca 100 clk per call)
705 :			;Optimized by Jaan, 30 Nov 2002
706 :
707 :			align ALIGN
708 :			cglobal dequant_inter_3dne
709 :			dequant_inter_3dne: ; stack args: data, coeff, quant; saves/restores edi, ebx, esi; uses eax, ecx, edx, mm0-mm7
710 :
711 :			mov ecx, [esp+ 8] ; coeff
712 :			mov eax, [esp+12] ; quant
713 :			pxor mm0,mm0 ; mm0 = mm2 = 0, the A1/B1 state expected by the first dequant step
714 :			pxor mm2,mm2
715 :			push edi
716 :			push ebx
717 :			push esi
718 :			lea edi,[mmx_mul + eax*8 - 8] ; 2*quant (row of mmx_mul for this quant; was 'eax8', an undefined symbol)
719 :			mov ebx,mmx_2047 ; ebx -> upper clamp used by pminsw
720 :			pxor mm7,mm7
721 :			movq mm3, [ecx+120] ;B2 ; c = coeff[i]
722 :			pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
723 :			lea eax,[mmx_add + eax*8 - 8] ; quant or quant-1
724 :			psubw mm2,mm3 ;-c ;B3 (1st dep)
725 :			mov esi,mmzero ; esi -> zero qword
726 :			pmaxsw mm2,mm3 ;|c| ;B4 (2nd)
727 :			pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
728 :			psraw mm3,15 ; sign(c) ;B7 (2nd)
729 :			mov edx, [dword esp+ 4+12] ; data (three pushes = 12 bytes; 'dword' forces a 32-bit displacement encoding)
730 :			align 8
731 :			dequant 0
732 :			dequant 1
733 :			dequant 2
734 :			dequant 3
735 :			dequant 4
736 :
737 :			paddw mm4,mm6 ; C11 mm6 free (4th+)
738 :			pminsw mm4,[ebx] ; C12 saturates to +2047 (5th+)
739 :			pandn mm7,[eax] ; B9 offset = isZero ? 0 : quant_add (2nd)
740 :			mov esi,[esp] ; restore saved registers
741 :			pxor mm5, mm4 ; C13 (6th+)
742 :			paddw mm7,mm3 ; B10 offset +negate back (3rd)
743 :			movq [edx+4*24+16], mm5 ; C14 (7th)
744 :			paddw mm2,mm7 ; B11 mm7 free (4th+)
745 :			pminsw mm2,[ebx] ; B12 saturates to +2047 (5th+)
746 :			mov ebx,[esp+4]
747 :			mov edi,[esp+8]
748 :			add esp,byte 12 ; drop the three saved slots
749 :			pxor mm3, mm2 ; B13 (6th+)
750 :			movq [edx+4*24+8], mm3 ; B14 (7th)
751 :			ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4