ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/svn/tags/branch-release-1-0/xvidcore/src/quant/x86_asm/quantize_mmx.asm
Revision: 908
Committed: Thu Mar 6 21:12:04 2003 UTC (21 years, 7 months ago)
File size: 24013 byte(s)
Log Message:
This commit was manufactured by cvs2svn to create tag 'branch-release-1-0'.

File Contents

# Content
1 ;/**************************************************************************
2 ; *
3 ; * XVID MPEG-4 VIDEO CODEC
4 ; * mmx quantization/dequantization
5 ; *
6 ; * This program is an implementation of a part of one or more MPEG-4
7 ; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
8 ; * to use this software module in hardware or software products are
9 ; * advised that its use may infringe existing patents or copyrights, and
10 ; * any such use would be at such party's own risk. The original
11 ; * developer of this software module and his/her company, and subsequent
12 ; * editors and their companies, will have no liability for use of this
13 ; * software or modifications or derivatives thereof.
14 ; *
15 ; * This program is free software; you can redistribute it and/or modify
16 ; * it under the terms of the GNU General Public License as published by
17 ; * the Free Software Foundation; either version 2 of the License, or
18 ; * (at your option) any later version.
19 ; *
20 ; * This program is distributed in the hope that it will be useful,
21 ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 ; * GNU General Public License for more details.
24 ; *
25 ; * You should have received a copy of the GNU General Public License
26 ; * along with this program; if not, write to the Free Software
27 ; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28 ; *
29 ; *************************************************************************/
30
31 ;/**************************************************************************
32 ; *
33 ; * History:
34 ; *
35 ; * 09.08.2002 sse2 dequant funcs revamped
36 ; * 14.06.2002 mmx+xmm dequant_* funcs revamped -Skal-
37 ; * 24.02.2002 sse2 quant_intra / dequant_intra (have to use movdqu ???)
38 ; * 17.04.2002 sse2 quant_inter / dequant_inter
39 ; * 26.12.2001 minor bug fixes, dequant saturate, further optimization
40 ; * 19.11.2001 quant_inter_mmx now returns sum of abs. coefficient values
41 ; * 04.11.2001 nasm version; (c)2001 peter ross <pross@cs.rmit.edu.au>
42 ; *
43 ; *************************************************************************/
44
45 ; enable dequant saturate [-2048,2047], test purposes only.
46 %define SATURATE
47 ; NOTE(review): nothing visible in this listing tests SATURATE; the dequant_* routines below clamp unconditionally.
48 ; data/text alignment
49 %define ALIGN 8
50 ; (byte alignment used by the "align ALIGN" directives in the text section)
51 bits 32
52
53 section .data
54 ; cglobal <name>: export <name>. When PREFIX is defined the symbol is exported as
55 ; _<name> and the plain name is aliased to the underscored one for the rest of the file.
56 %macro cglobal 1
57 %ifdef PREFIX
58 global _%1
59 %define %1 _%1
60 %else
61 global %1
62 %endif
63 %endmacro
64
65 align 16
66 ; eight words of 1: pmaddwd against this adds adjacent word pairs (sum reduction in quant_inter_*)
67 plus_one times 8 dw 1
68
69 ;===========================================================================
70 ;
71 ; subtract by Q/2 table
72 ; one 8-byte entry (4 identical words of q/2, integer division) per q = 1..31; entry q is at mmx_sub + q*8 - 8
73 ;===========================================================================
74 ; MMX_SUB q: emit the table entry for quantizer q
75 %macro MMX_SUB 1
76 times 4 dw %1 / 2
77 %endmacro
78
79 align 16
80 mmx_sub
81 MMX_SUB 1
82 MMX_SUB 2
83 MMX_SUB 3
84 MMX_SUB 4
85 MMX_SUB 5
86 MMX_SUB 6
87 MMX_SUB 7
88 MMX_SUB 8
89 MMX_SUB 9
90 MMX_SUB 10
91 MMX_SUB 11
92 MMX_SUB 12
93 MMX_SUB 13
94 MMX_SUB 14
95 MMX_SUB 15
96 MMX_SUB 16
97 MMX_SUB 17
98 MMX_SUB 18
99 MMX_SUB 19
100 MMX_SUB 20
101 MMX_SUB 21
102 MMX_SUB 22
103 MMX_SUB 23
104 MMX_SUB 24
105 MMX_SUB 25
106 MMX_SUB 26
107 MMX_SUB 27
108 MMX_SUB 28
109 MMX_SUB 29
110 MMX_SUB 30
111 MMX_SUB 31
112
113
114
115 ;===========================================================================
116 ;
117 ; divide by 2Q table
118 ; one 8-byte entry (4 identical words of 65536/(2q) + 1) per q = 1..31; entry q is at mmx_div + q*8 - 8
119 ; use a shift of 16 to take full advantage of _pmulhw_
120 ; for q=1 the entry is 32769, which is negative as a signed word, so _pmulhw_ would overflow; q=1 is treated separately
121 ; (3dnow2 provides _pmulhuw_ which won't cause overflow)
122 ;
123 ;===========================================================================
124 ; MMX_DIV q: emit the table entry for quantizer q
125 %macro MMX_DIV 1
126 times 4 dw (1 << 16) / (%1 * 2) + 1
127 %endmacro
128
129 align 16
130 mmx_div
131 MMX_DIV 1
132 MMX_DIV 2
133 MMX_DIV 3
134 MMX_DIV 4
135 MMX_DIV 5
136 MMX_DIV 6
137 MMX_DIV 7
138 MMX_DIV 8
139 MMX_DIV 9
140 MMX_DIV 10
141 MMX_DIV 11
142 MMX_DIV 12
143 MMX_DIV 13
144 MMX_DIV 14
145 MMX_DIV 15
146 MMX_DIV 16
147 MMX_DIV 17
148 MMX_DIV 18
149 MMX_DIV 19
150 MMX_DIV 20
151 MMX_DIV 21
152 MMX_DIV 22
153 MMX_DIV 23
154 MMX_DIV 24
155 MMX_DIV 25
156 MMX_DIV 26
157 MMX_DIV 27
158 MMX_DIV 28
159 MMX_DIV 29
160 MMX_DIV 30
161 MMX_DIV 31
162
163
164
165 ;===========================================================================
166 ;
167 ; add by (odd(Q) ? Q : Q - 1) table
168 ; one 8-byte entry (4 identical words) per q = 1..31; entry q is at mmx_add + q*8 - 8
169 ;===========================================================================
170 ; MMX_ADD q: emit q when q is odd, q - 1 when q is even
171 %macro MMX_ADD 1
172 %if %1 % 2 != 0
173 times 4 dw %1
174 %else
175 times 4 dw %1 - 1
176 %endif
177 %endmacro
178
179 align 16
180 mmx_add
181 MMX_ADD 1
182 MMX_ADD 2
183 MMX_ADD 3
184 MMX_ADD 4
185 MMX_ADD 5
186 MMX_ADD 6
187 MMX_ADD 7
188 MMX_ADD 8
189 MMX_ADD 9
190 MMX_ADD 10
191 MMX_ADD 11
192 MMX_ADD 12
193 MMX_ADD 13
194 MMX_ADD 14
195 MMX_ADD 15
196 MMX_ADD 16
197 MMX_ADD 17
198 MMX_ADD 18
199 MMX_ADD 19
200 MMX_ADD 20
201 MMX_ADD 21
202 MMX_ADD 22
203 MMX_ADD 23
204 MMX_ADD 24
205 MMX_ADD 25
206 MMX_ADD 26
207 MMX_ADD 27
208 MMX_ADD 28
209 MMX_ADD 29
210 MMX_ADD 30
211 MMX_ADD 31
212
213
214 ;===========================================================================
215 ;
216 ; multiply by 2Q table
217 ; one 8-byte entry (4 identical words of 2*q) per q = 1..31; entry q is at mmx_mul + q*8 - 8
218 ;===========================================================================
219 ; MMX_MUL q: emit the table entry for quantizer q
220 %macro MMX_MUL 1
221 times 4 dw %1 * 2
222 %endmacro
223
224 align 16
225 mmx_mul
226 MMX_MUL 1
227 MMX_MUL 2
228 MMX_MUL 3
229 MMX_MUL 4
230 MMX_MUL 5
231 MMX_MUL 6
232 MMX_MUL 7
233 MMX_MUL 8
234 MMX_MUL 9
235 MMX_MUL 10
236 MMX_MUL 11
237 MMX_MUL 12
238 MMX_MUL 13
239 MMX_MUL 14
240 MMX_MUL 15
241 MMX_MUL 16
242 MMX_MUL 17
243 MMX_MUL 18
244 MMX_MUL 19
245 MMX_MUL 20
246 MMX_MUL 21
247 MMX_MUL 22
248 MMX_MUL 23
249 MMX_MUL 24
250 MMX_MUL 25
251 MMX_MUL 26
252 MMX_MUL 27
253 MMX_MUL 28
254 MMX_MUL 29
255 MMX_MUL 30
256 MMX_MUL 31
257
258
259 ;===========================================================================
260 ;
261 ; saturation limits
262 ; constants used by the dequant_* routines to clamp results to [-2048, 2047]
263 ;===========================================================================
264
265 align 16
266 sse2_2047 times 8 dw 2047
267 ; upper bound for pminsw in the *_sse2 (above) and *_xmm (below) dequant routines
268 align 16
269 mmx_2047 times 4 dw 2047
270 ; biases for clamping with saturating add/subtract pairs (plain MMX has no pminsw):
271 align 8 ; paddsw then psubsw with 32767-2047 clamps to <= 2047; psubsw then paddsw with 32768-2048 clamps to >= -2048
272 mmx_32768_minus_2048 times 4 dw (32768-2048)
273 mmx_32767_minus_2047 times 4 dw (32767-2047)
274
275
276 section .text
277
278
279 ;===========================================================================
280 ; Intra quantization of one block of 64 int16 values (8 loop passes x 8 words).
281 ; void quant_intra_mmx(int16_t * coeff,
282 ; const int16_t const * data,
283 ; const uint32_t quant,
284 ; const uint32_t dcscalar);
285 ; coeff[i] = sign(data[i]) * (|data[i]| / (2*quant)) via reciprocal multiply; coeff[0] is then replaced by data[0] / dcscalar rounded to nearest.
286 ;===========================================================================
287 ; Arguments are read from the stack. ecx/esi/edi are preserved; eax and edx are clobbered. No emms is executed here.
288 align ALIGN
289 cglobal quant_intra_mmx
290 quant_intra_mmx
291 ; register roles: edi = coeff, esi = data, ecx = qword index (0..16 step 2), mm7 = reciprocal of 2*quant
292 push ecx
293 push esi
294 push edi
295
296 mov edi, [esp + 12 + 4] ; coeff
297 mov esi, [esp + 12 + 8] ; data
298 mov eax, [esp + 12 + 12] ; quant
299
300 xor ecx, ecx
301 cmp al, 1 ; quant == 1? (only the low byte is tested)
302 jz .q1loop ; yes: the q=1 reciprocal overflows a signed word, so shift instead
303
304 movq mm7, [mmx_div + eax * 8 - 8] ; mm7 = 4 x (65536/(2*quant) + 1)
305 align ALIGN
306 .loop
307 movq mm0, [esi + 8*ecx] ; mm0 = [1st]
308 movq mm3, [esi + 8*ecx + 8] ; mm3 = [2nd]
309 pxor mm1, mm1 ; mm1 = 0
310 pxor mm4, mm4 ;
311 pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
312 pcmpgtw mm4, mm3 ;
313 pxor mm0, mm1 ; flip bits of negative lanes ...
314 pxor mm3, mm4 ;
315 psubw mm0, mm1 ; ... and add 1 to them: mm0 = |mm0|
316 psubw mm3, mm4 ;
317 pmulhw mm0, mm7 ; mm0 = (mm0 * mm7) >> 16, i.e. ~ mm0 / 2Q
318 pmulhw mm3, mm7 ;
319 pxor mm0, mm1 ; mm0 *= sign(mm0)
320 pxor mm3, mm4 ;
321 psubw mm0, mm1 ; (second half of the conditional negate)
322 psubw mm3, mm4 ;
323 movq [edi + 8*ecx], mm0
324 movq [edi + 8*ecx + 8], mm3
325
326 add ecx,2
327 cmp ecx,16
328 jnz .loop
329
330 .done
331 ; calculate data[0] // (int32_t)dcscalar) -- overwrites the coeff[0] written by the loop
332
333 mov ecx, [esp + 12 + 16] ; dcscalar
334 mov edx, ecx
335 movsx eax, word [esi] ; data[0]
336 shr edx, 1 ; edx = dcscalar /2
337 cmp eax, 0 ; round to nearest: bias by dcscalar/2 away from zero
338 jg .gtzero
339
340 sub eax, edx
341 jmp short .mul
342 .gtzero
343 add eax, edx
344 .mul
345 cdq ; expand eax -> edx:eax
346 idiv ecx ; eax = edx:eax / dcscalar (signed; faults if dcscalar is 0)
347
348 mov [edi], ax ; coeff[0] = ax
349
350 pop edi
351 pop esi
352 pop ecx
353
354 ret
355 ; quant == 1 path: same as .loop but divides by 2 with a logical shift
356 align ALIGN
357 .q1loop
358 movq mm0, [esi + 8*ecx] ; mm0 = [1st]
359 movq mm3, [esi + 8*ecx + 8] ; mm3 = [2nd]
360 pxor mm1, mm1 ; mm1 = 0
361 pxor mm4, mm4 ;
362 pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
363 pcmpgtw mm4, mm3 ;
364 pxor mm0, mm1 ; flip bits of negative lanes ...
365 pxor mm3, mm4 ;
366 psubw mm0, mm1 ; ... and add 1 to them: mm0 = |mm0|
367 psubw mm3, mm4 ;
368 psrlw mm0, 1 ; mm0 >>= 1 (/2)
369 psrlw mm3, 1 ;
370 pxor mm0, mm1 ; mm0 *= sign(mm0)
371 pxor mm3, mm4 ;
372 psubw mm0, mm1 ; (second half of the conditional negate)
373 psubw mm3, mm4 ;
374 movq [edi + 8*ecx], mm0
375 movq [edi + 8*ecx + 8], mm3
376
377 add ecx,2
378 cmp ecx,16
379 jnz .q1loop
380 jmp short .done
381
382
383
384 ;===========================================================================
385 ; SSE2 variant of quant_intra_mmx: 4 loop passes x 16 words, same per-word arithmetic.
386 ; void quant_intra_sse2(int16_t * coeff,
387 ; const int16_t const * data,
388 ; const uint32_t quant,
389 ; const uint32_t dcscalar);
390 ; coeff and data are accessed with movdqa, which requires 16-byte aligned addresses.
391 ;===========================================================================
392 ; esi/edi are preserved; eax, ecx and edx are clobbered. mm7 is used as a staging register and no emms is executed here.
393 align ALIGN
394 cglobal quant_intra_sse2
395 quant_intra_sse2
396 ; register roles: edi = coeff, esi = data, ecx = qword index (0..16 step 4), xmm7 = reciprocal of 2*quant
397 push esi
398 push edi
399
400 mov edi, [esp + 8 + 4] ; coeff
401 mov esi, [esp + 8 + 8] ; data
402 mov eax, [esp + 8 + 12] ; quant
403
404 xor ecx, ecx
405 cmp al, 1 ; quant == 1? (only the low byte is tested)
406 jz near .qas2_q1loop ; yes: use the shift-based loop
407
408 .qas2_not1
409 movq mm7, [mmx_div + eax*8 - 8] ; 4 x (65536/(2*quant) + 1)
410 movq2dq xmm7, mm7 ; low 8 bytes of xmm7
411 movlhps xmm7, xmm7 ; duplicate into high 8 bytes
412
413 align 16
414 .qas2_loop
415 movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st]
416 movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd]
417 pxor xmm1, xmm1
418 pxor xmm4, xmm4
419 pcmpgtw xmm1, xmm0 ; xmm1 = (0 > xmm0), per-word sign mask
420 pcmpgtw xmm4, xmm3
421 pxor xmm0, xmm1 ; xor + subtract of the mask: xmm0 = |xmm0|
422 pxor xmm3, xmm4
423 psubw xmm0, xmm1
424 psubw xmm3, xmm4
425 pmulhw xmm0, xmm7 ; (xmm0 * xmm7) >> 16, i.e. ~ xmm0 / 2Q
426 pmulhw xmm3, xmm7
427 pxor xmm0, xmm1 ; restore the sign
428 pxor xmm3, xmm4
429 psubw xmm0, xmm1
430 psubw xmm3, xmm4
431 movdqa [edi + ecx*8], xmm0
432 movdqa [edi + ecx*8 + 16], xmm3
433
434 add ecx, 4
435 cmp ecx, 16
436 jnz .qas2_loop
437 ; DC: coeff[0] = (data[0] +/- dcscalar/2) / dcscalar, overwriting the value stored by the loop
438 .qas2_done
439 mov ecx, [esp + 8 + 16] ; dcscalar
440 mov edx, ecx
441 movsx eax, word [esi] ; data[0]
442 shr edx, 1 ; edx = dcscalar / 2
443 cmp eax, 0 ; round to nearest: bias by dcscalar/2 away from zero
444 jg .qas2_gtzero
445
446 sub eax, edx
447 jmp short .qas2_mul
448 .qas2_gtzero
449 add eax, edx
450 .qas2_mul
451 cdq ; sign-extend eax into edx:eax
452 idiv ecx ; signed divide by dcscalar (faults if dcscalar is 0)
453
454 mov [edi], ax ; coeff[0]
455
456 pop edi
457 pop esi
458
459 ret
460 ; quant == 1 path: same as .qas2_loop but divides by 2 with a logical shift
461 align 16
462 .qas2_q1loop
463 movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st]
464 movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd]
465 pxor xmm1, xmm1
466 pxor xmm4, xmm4
467 pcmpgtw xmm1, xmm0 ; per-word sign mask
468 pcmpgtw xmm4, xmm3
469 pxor xmm0, xmm1 ; xor + subtract of the mask: xmm0 = |xmm0|
470 pxor xmm3, xmm4
471 psubw xmm0, xmm1
472 psubw xmm3, xmm4
473 psrlw xmm0, 1 ; >>= 1 (/2)
474 psrlw xmm3, 1
475 pxor xmm0, xmm1 ; restore the sign
476 pxor xmm3, xmm4
477 psubw xmm0, xmm1
478 psubw xmm3, xmm4
479 movdqa [edi + ecx*8], xmm0
480 movdqa [edi + ecx*8 + 16], xmm3
481
482 add ecx, 4
483 cmp ecx, 16
484 jnz .qas2_q1loop
485 jmp near .qas2_done
486
487
488
489 ;===========================================================================
490 ; Inter quantization of one block of 64 int16 values (8 loop passes x 8 words).
491 ; uint32_t quant_inter_mmx(int16_t * coeff,
492 ; const int16_t const * data,
493 ; const uint32_t quant);
494 ; coeff[i] = sign(data[i]) * ((|data[i]| - quant/2, floored at 0) / (2*quant)); returns the sum of the |coeff[i]| in eax.
495 ;===========================================================================
496 ; Arguments are read from the stack. ecx/esi/edi are preserved. No emms is executed here.
497 align ALIGN
498 cglobal quant_inter_mmx
499 quant_inter_mmx
500 ; register roles: edi = coeff, esi = data, ecx = qword index, mm5 = per-word running sums, mm6 = quant/2, mm7 = reciprocal of 2*quant
501 push ecx
502 push esi
503 push edi
504
505 mov edi, [esp + 12 + 4] ; coeff
506 mov esi, [esp + 12 + 8] ; data
507 mov eax, [esp + 12 + 12] ; quant
508
509 xor ecx, ecx
510
511 pxor mm5, mm5 ; sum
512 movq mm6, [mmx_sub + eax * 8 - 8] ; sub
513
514 cmp al, 1 ; quant == 1? (only the low byte is tested)
515 jz .q1loop ; yes: use the shift-based loop
516
517 movq mm7, [mmx_div + eax * 8 - 8] ; divider
518
519 align ALIGN
520 .loop
521 movq mm0, [esi + 8*ecx] ; mm0 = [1st]
522 movq mm3, [esi + 8*ecx + 8] ; mm3 = [2nd]
523 pxor mm1, mm1 ; mm1 = 0
524 pxor mm4, mm4 ;
525 pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
526 pcmpgtw mm4, mm3 ;
527 pxor mm0, mm1 ; flip bits of negative lanes ...
528 pxor mm3, mm4 ;
529 psubw mm0, mm1 ; ... and add 1 to them: mm0 = |mm0|
530 psubw mm3, mm4 ;
531 psubusw mm0, mm6 ; mm0 -= sub (unsigned, dont go < 0)
532 psubusw mm3, mm6 ;
533 pmulhw mm0, mm7 ; mm0 = (mm0 * mm7) >> 16, i.e. ~ mm0 / 2Q
534 pmulhw mm3, mm7 ;
535 paddw mm5, mm0 ; sum += mm0
536 pxor mm0, mm1 ; mm0 *= sign(mm0)
537 paddw mm5, mm3 ;
538 pxor mm3, mm4 ;
539 psubw mm0, mm1 ; (second half of the conditional negate)
540 psubw mm3, mm4
541 movq [edi + 8*ecx], mm0
542 movq [edi + 8*ecx + 8], mm3
543
544 add ecx, 2
545 cmp ecx, 16
546 jnz .loop
547 ; horizontal reduction of the four word sums in mm5
548 .done
549 pmaddwd mm5, [plus_one] ; multiply by 1 and add word pairs -> two dword sums
550 movq mm0, mm5
551 psrlq mm5, 32 ; bring the high dword down
552 paddd mm0, mm5 ; low dword = total
553 movd eax, mm0 ; return sum
554
555 pop edi
556 pop esi
557 pop ecx
558
559 ret
560 ; quant == 1 path: same as .loop but divides by 2 with a logical shift
561 align ALIGN
562 .q1loop
563 movq mm0, [esi + 8*ecx] ; mm0 = [1st]
564 movq mm3, [esi + 8*ecx+ 8] ; mm3 = [2nd]
565 pxor mm1, mm1 ; mm1 = 0
566 pxor mm4, mm4 ;
567 pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
568 pcmpgtw mm4, mm3 ;
569 pxor mm0, mm1 ; flip bits of negative lanes ...
570 pxor mm3, mm4 ;
571 psubw mm0, mm1 ; ... and add 1 to them: mm0 = |mm0|
572 psubw mm3, mm4 ;
573 psubusw mm0, mm6 ; mm0 -= sub (unsigned, dont go < 0)
574 psubusw mm3, mm6 ;
575 psrlw mm0, 1 ; mm0 >>= 1 (/2)
576 psrlw mm3, 1 ;
577 paddw mm5, mm0 ; sum += mm0
578 pxor mm0, mm1 ; mm0 *= sign(mm0)
579 paddw mm5, mm3 ;
580 pxor mm3, mm4 ;
581 psubw mm0, mm1 ; (second half of the conditional negate)
582 psubw mm3, mm4
583 movq [edi + 8*ecx], mm0
584 movq [edi + 8*ecx + 8], mm3
585
586 add ecx,2
587 cmp ecx,16
588 jnz .q1loop
589
590 jmp .done
591
592
593
594 ;===========================================================================
595 ; SSE2 variant of quant_inter_mmx: 4 loop passes x 16 words, same per-word arithmetic.
596 ; uint32_t quant_inter_sse2(int16_t * coeff,
597 ; const int16_t const * data,
598 ; const uint32_t quant);
599 ; Returns the sum of the |coeff[i]| in eax. coeff and data are accessed with movdqa (16-byte alignment required).
600 ;===========================================================================
601 ; esi/edi are preserved; ecx is clobbered. mm0/mm5 are also used and no emms is executed here.
602 align 16
603 cglobal quant_inter_sse2
604 quant_inter_sse2
605 ; register roles: edi = coeff, esi = data, ecx = qword index, xmm5 = per-word running sums, xmm6 = quant/2, xmm7 = reciprocal of 2*quant
606 push esi
607 push edi
608
609 mov edi, [esp + 8 + 4] ; coeff
610 mov esi, [esp + 8 + 8] ; data
611 mov eax, [esp + 8 + 12] ; quant
612
613 xor ecx, ecx
614
615 pxor xmm5, xmm5 ; sum
616
617 movq mm0, [mmx_sub + eax*8 - 8] ; sub
618 movq2dq xmm6, mm0 ; load into low 8 bytes
619 movlhps xmm6, xmm6 ; duplicate into high 8 bytes
620
621 cmp al, 1 ; quant == 1? (only the low byte is tested)
622 jz near .qes2_q1loop ; yes: use the shift-based loop
623
624 .qes2_not1
625 movq mm0, [mmx_div + eax*8 - 8] ; divider
626 movq2dq xmm7, mm0 ; load into low 8 bytes
627 movlhps xmm7, xmm7 ; duplicate into high 8 bytes
628
629 align 16
630 .qes2_loop
631 movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st]
632 movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd]
633 pxor xmm1, xmm1
634 pxor xmm4, xmm4
635 pcmpgtw xmm1, xmm0 ; per-word sign mask
636 pcmpgtw xmm4, xmm3
637 pxor xmm0, xmm1 ; xor + subtract of the mask: xmm0 = |xmm0|
638 pxor xmm3, xmm4
639 psubw xmm0, xmm1
640 psubw xmm3, xmm4
641 psubusw xmm0, xmm6 ; -= quant/2, saturating at 0
642 psubusw xmm3, xmm6
643 pmulhw xmm0, xmm7 ; (xmm0 * xmm7) >> 16, i.e. ~ xmm0 / 2Q
644 pmulhw xmm3, xmm7
645 paddw xmm5, xmm0 ; sum += magnitude
646 pxor xmm0, xmm1 ; restore the sign
647 paddw xmm5, xmm3
648 pxor xmm3, xmm4
649 psubw xmm0, xmm1
650 psubw xmm3, xmm4
651 movdqa [edi + ecx*8], xmm0
652 movdqa [edi + ecx*8 + 16], xmm3
653
654 add ecx, 4
655 cmp ecx, 16
656 jnz .qes2_loop
657 ; horizontal reduction of the eight word sums in xmm5
658 .qes2_done
659 movdqu xmm6, [plus_one]
660 pmaddwd xmm5, xmm6 ; multiply by 1 and add word pairs -> four dword sums
661 movhlps xmm6, xmm5 ; low half of xmm6 = high two dword sums
662 paddd xmm5, xmm6 ; low two dwords now hold the partial sums
663 movdq2q mm0, xmm5 ; finish the reduction in MMX registers
664
665 movq mm5, mm0
666 psrlq mm5, 32 ; bring the high dword down
667 paddd mm0, mm5 ; low dword = total
668 movd eax, mm0 ; return sum
669
670 pop edi
671 pop esi
672
673 ret
674 ; quant == 1 path: same as .qes2_loop but divides by 2 with a logical shift
675 align 16
676 .qes2_q1loop
677 movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st]
678 movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd]
679 pxor xmm1, xmm1
680 pxor xmm4, xmm4
681 pcmpgtw xmm1, xmm0 ; per-word sign mask
682 pcmpgtw xmm4, xmm3
683 pxor xmm0, xmm1 ; xor + subtract of the mask: xmm0 = |xmm0|
684 pxor xmm3, xmm4
685 psubw xmm0, xmm1
686 psubw xmm3, xmm4
687 psubusw xmm0, xmm6 ; -= quant/2, saturating at 0
688 psubusw xmm3, xmm6
689 psrlw xmm0, 1 ; >>= 1 (/2)
690 psrlw xmm3, 1
691 paddw xmm5, xmm0 ; sum += magnitude
692 pxor xmm0, xmm1 ; restore the sign
693 paddw xmm5, xmm3
694 pxor xmm3, xmm4
695 psubw xmm0, xmm1
696 psubw xmm3, xmm4
697 movdqa [edi + ecx*8], xmm0
698 movdqa [edi + ecx*8 + 16], xmm3
699
700 add ecx,4
701 cmp ecx,16
702 jnz .qes2_q1loop
703 jmp .qes2_done
704
705
706 ;===========================================================================
707 ; Intra dequantization of one block of 64 int16 values (8 loop passes x 8 words).
708 ; void dequant_intra_mmx(int16_t *data,
709 ; const int16_t const *coeff,
710 ; const uint32_t quant,
711 ; const uint32_t dcscalar);
712 ; data[i] = sign(c) * (|c|*2*quant + add) for c = coeff[i] != 0, else 0, clamped to [-2048,2047]; data[0] = coeff[0]*dcscalar (low 16 bits), clamped likewise.
713 ;===========================================================================
714 ; add = quant if quant is odd, quant-1 otherwise (mmx_add table). Clobbers eax, ecx, edx; no emms is executed here.
715 ; note: we only saturate to +2047 *before* restoring the sign.
716 ; Hence, final clamp really is [-2048,2047]
717 ; (negative lanes hold magnitude-1 when clamped; the final xor maps that to -magnitude, so -2048 is reachable)
718 align ALIGN
719 cglobal dequant_intra_mmx
720 dequant_intra_mmx:
721 ; register roles: edx = data, ecx = coeff, eax = qword index counting -16..0, mm6 = add, mm7 = 2*quant
722 mov edx, [esp+ 4] ; data
723 mov ecx, [esp+ 8] ; coeff
724 mov eax, [esp+12] ; quant
725 movq mm6, [mmx_add + eax*8 - 8] ; quant or quant-1
726 movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant
727 mov eax, -16 ; loop ends when eax reaches 0
728
729 align ALIGN
730 .loop
731 movq mm0, [ecx+8*eax+8*16] ; c = coeff[i]
732 movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1]
733 pxor mm1, mm1
734 pxor mm4, mm4
735 pcmpgtw mm1, mm0 ; sign(c)
736 pcmpgtw mm4, mm3 ; sign(c')
737 pxor mm2, mm2
738 pxor mm5, mm5
739 pcmpeqw mm2, mm0 ; c is zero
740 pcmpeqw mm5, mm3 ; c' is zero
741 pandn mm2, mm6 ; offset = isZero ? 0 : quant_add
742 pandn mm5, mm6
743 pxor mm0, mm1 ; negate if negative
744 pxor mm3, mm4 ; negate if negative
745 psubw mm0, mm1
746 psubw mm3, mm4
747 pmullw mm0, mm7 ; *= 2Q
748 pmullw mm3, mm7 ; *= 2Q
749 paddw mm0, mm2 ; + offset
750 paddw mm3, mm5 ; + offset
751 paddw mm0, mm1 ; start restoring sign (subtracts 1 in negative lanes)
752 paddw mm3, mm4 ; start restoring sign
753
754 ; saturates to +2047
755 movq mm2, [mmx_32767_minus_2047]
756 add eax, 2 ; advance index; sets the ZF tested by the jnz below
757 paddsw mm0, mm2 ; saturating add then subtract of 32767-2047 clamps to <= 2047
758 paddsw mm3, mm2
759 psubsw mm0, mm2
760 psubsw mm3, mm2
761
762 pxor mm0, mm1 ; finish restoring sign
763 pxor mm3, mm4
764 movq [edx + 8*eax + 8*16 - 2*8], mm0 ; - 2*8 because eax was already advanced
765 movq [edx + 8*eax + 8*16+8 - 2*8], mm3
766 jnz near .loop ; flags from "add eax, 2" (the MMX instructions in between do not modify EFLAGS)
767
768 ; deal with DC
769
770 movd mm0, [ecx] ; coeff[0] (and coeff[1], unused)
771 pmullw mm0, [esp+16] ; dcscalar (8-byte read from the stack; only the low word of the result is stored)
772 movq mm2, [mmx_32767_minus_2047]
773 paddsw mm0, mm2 ; clamp to <= 2047
774 psubsw mm0, mm2
775 movq mm3, [mmx_32768_minus_2048]
776 psubsw mm0, mm3 ; clamp to >= -2048
777 paddsw mm0, mm3
778 movd eax, mm0
779 mov [edx], ax ; data[0]
780
781 ret
782
783 ;===========================================================================
784 ; Intra dequantization of one block of 64 int16 values, MMX registers plus pminsw.
785 ; void dequant_intra_xmm(int16_t *data,
786 ; const int16_t const *coeff,
787 ; const uint32_t quant,
788 ; const uint32_t dcscalar);
789 ; Same results as dequant_intra_mmx. Clobbers eax, ecx, edx; no emms is executed here.
790 ;===========================================================================
791
792 ; this is the same as dequant_intra_mmx, except that we're
793 ; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
794 ; register roles: edx = data, ecx = coeff, eax = qword index counting -16..0, mm6 = add, mm7 = 2*quant
795 align ALIGN
796 cglobal dequant_intra_xmm
797 dequant_intra_xmm:
798
799 mov edx, [esp+ 4] ; data
800 mov ecx, [esp+ 8] ; coeff
801 mov eax, [esp+12] ; quant
802 movq mm6, [mmx_add + eax*8 - 8] ; quant or quant-1
803 movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant
804 mov eax, -16 ; loop ends when eax reaches 0
805
806 align ALIGN
807 .loop
808 movq mm0, [ecx+8*eax+8*16] ; c = coeff[i]
809 movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1]
810 pxor mm1, mm1
811 pxor mm4, mm4
812 pcmpgtw mm1, mm0 ; sign(c)
813 pcmpgtw mm4, mm3 ; sign(c')
814 pxor mm2, mm2
815 pxor mm5, mm5
816 pcmpeqw mm2, mm0 ; c is zero
817 pcmpeqw mm5, mm3 ; c' is zero
818 pandn mm2, mm6 ; offset = isZero ? 0 : quant_add
819 pandn mm5, mm6
820 pxor mm0, mm1 ; negate if negative
821 pxor mm3, mm4 ; negate if negative
822 psubw mm0, mm1
823 psubw mm3, mm4
824 pmullw mm0, mm7 ; *= 2Q
825 pmullw mm3, mm7 ; *= 2Q
826 paddw mm0, mm2 ; + offset
827 paddw mm3, mm5 ; + offset
828 paddw mm0, mm1 ; start restoring sign (subtracts 1 in negative lanes)
829 paddw mm3, mm4 ; start restoring sign
830
831 ; saturates to +2047
832 movq mm2, [mmx_2047]
833 pminsw mm0, mm2
834 add eax, 2 ; advance index; sets the ZF tested by the jnz below
835 pminsw mm3, mm2
836
837 pxor mm0, mm1 ; finish restoring sign
838 pxor mm3, mm4
839 movq [edx + 8*eax + 8*16 - 2*8], mm0 ; - 2*8 because eax was already advanced
840 movq [edx + 8*eax + 8*16+8 - 2*8], mm3
841 jnz near .loop ; flags from "add eax, 2" (the MMX instructions in between do not modify EFLAGS)
842
843 ; deal with DC
844
845 movd mm0, [ecx] ; coeff[0] (and coeff[1], unused)
846 pmullw mm0, [esp+16] ; dcscalar (8-byte read from the stack; only the low word of the result is stored)
847 movq mm2, [mmx_32767_minus_2047]
848 paddsw mm0, mm2 ; clamp to <= 2047
849 psubsw mm0, mm2
850 movq mm2, [mmx_32768_minus_2048]
851 psubsw mm0, mm2 ; clamp to >= -2048
852 paddsw mm0, mm2
853 movd eax, mm0
854 mov [edx], ax ; data[0]
855
856 ret
857
858
859 ;===========================================================================
860 ; SSE2 intra dequantization of one block of 64 int16 values (4 loop passes x 16 words).
861 ; void dequant_intra_sse2(int16_t *data,
862 ; const int16_t const *coeff,
863 ; const uint32_t quant,
864 ; const uint32_t dcscalar);
865 ; Same arithmetic as dequant_intra_mmx; data and coeff are accessed with movdqa (16-byte alignment required). No emms is executed here.
866 ;===========================================================================
867 align ALIGN
868 cglobal dequant_intra_sse2
869 dequant_intra_sse2:
870 mov edx, [esp+ 4] ; data
871 mov ecx, [esp+ 8] ; coeff
872 mov eax, [esp+12] ; quant
873 movq mm6, [mmx_add + eax * 8 - 8] ; quant or quant-1
874 movq mm7, [mmx_mul + eax * 8 - 8] ; 2*quant
875 movq2dq xmm6, mm6 ; broadcast both 4-word constants to 8 words
876 movq2dq xmm7, mm7
877 movlhps xmm6, xmm6
878 movlhps xmm7, xmm7
879 mov eax, -16 ; qword index; loop ends when eax reaches 0
880
881 align ALIGN
882 .loop
883 movdqa xmm0, [ecx + 8*16 + 8*eax] ; c = coeff[i]
884 movdqa xmm3, [ecx + 8*16 + 8*eax+ 16]
885 pxor xmm1, xmm1
886 pxor xmm4, xmm4
887 pcmpgtw xmm1, xmm0 ; sign(c)
888 pcmpgtw xmm4, xmm3
889 pxor xmm2, xmm2
890 pxor xmm5, xmm5
891 pcmpeqw xmm2, xmm0 ; c is zero
892 pcmpeqw xmm5, xmm3
893 pandn xmm2, xmm6 ; offset = isZero ? 0 : quant_add
894 pandn xmm5, xmm6
895 pxor xmm0, xmm1 ; negate if negative
896 pxor xmm3, xmm4
897 psubw xmm0, xmm1
898 psubw xmm3, xmm4
899 pmullw xmm0, xmm7 ; *= 2Q
900 pmullw xmm3, xmm7
901 paddw xmm0, xmm2 ; + offset
902 paddw xmm3, xmm5
903 paddw xmm0, xmm1 ; start restoring sign (subtracts 1 in negative lanes)
904 paddw xmm3, xmm4
905
906 ; saturates to +2047
907 movdqa xmm2, [sse2_2047]
908 pminsw xmm0, xmm2
909 add eax, 4 ; advance index; sets the ZF tested by the jnz below
910 pminsw xmm3, xmm2
911
912 pxor xmm0, xmm1 ; finish restoring sign
913 pxor xmm3, xmm4
914 movdqa [edx + 8*16 - 8*4 + 8*eax], xmm0 ; - 8*4 because eax was already advanced
915 movdqa [edx + 8*16 - 8*4 + 8*eax + 16], xmm3
916 jnz near .loop ; flags from "add eax, 4" (the SIMD instructions in between do not modify EFLAGS)
917
918 ; deal with DC
919 movd mm0, [ecx] ; coeff[0] (and coeff[1], unused)
920 pmullw mm0, [esp+16] ; dcscalar (8-byte read from the stack; only the low word of the result is stored)
921 movq mm2, [mmx_32767_minus_2047]
922 paddsw mm0, mm2 ; clamp to <= 2047
923 psubsw mm0, mm2
924 movq mm2, [mmx_32768_minus_2048]
925 psubsw mm0, mm2 ; clamp to >= -2048
926 paddsw mm0, mm2
927 movd eax, mm0
928 mov [edx], ax ; data[0]
929
930 ret
931
932
933
934
935 ;===========================================================================
936 ; Inter dequantization of one block of 64 int16 values (8 loop passes x 8 words).
937 ; void dequant_inter_mmx(int16_t * data,
938 ; const int16_t * const coeff,
939 ; const uint32_t quant);
940 ; data[i] = sign(c) * (|c|*2*quant + add) for c = coeff[i] != 0, else 0, clamped to [-2048,2047]; add = quant if odd, quant-1 otherwise.
941 ;===========================================================================
942 ; Clobbers eax, ecx, edx; no emms is executed here.
943 align ALIGN
944 cglobal dequant_inter_mmx
945 dequant_inter_mmx:
946 ; register roles: edx = data, ecx = coeff, eax = qword index counting -16..0, mm6 = add, mm7 = 2*quant
947 mov edx, [esp+ 4] ; data
948 mov ecx, [esp+ 8] ; coeff
949 mov eax, [esp+12] ; quant
950 movq mm6, [mmx_add + eax*8 - 8] ; quant or quant-1
951 movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant
952 mov eax, -16 ; loop ends when eax reaches 0
953
954 align ALIGN
955 .loop
956 movq mm0, [ecx+8*eax+8*16] ; c = coeff[i]
957 movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1]
958 pxor mm1, mm1
959 pxor mm4, mm4
960 pcmpgtw mm1, mm0 ; sign(c)
961 pcmpgtw mm4, mm3 ; sign(c')
962 pxor mm2, mm2
963 pxor mm5, mm5
964 pcmpeqw mm2, mm0 ; c is zero
965 pcmpeqw mm5, mm3 ; c' is zero
966 pandn mm2, mm6 ; offset = isZero ? 0 : quant_add
967 pandn mm5, mm6
968 pxor mm0, mm1 ; negate if negative
969 pxor mm3, mm4 ; negate if negative
970 psubw mm0, mm1
971 psubw mm3, mm4
972 pmullw mm0, mm7 ; *= 2Q
973 pmullw mm3, mm7 ; *= 2Q
974 paddw mm0, mm2 ; + offset
975 paddw mm3, mm5 ; + offset
976 paddw mm0, mm1 ; start restoring sign (subtracts 1 in negative lanes)
977 paddw mm3, mm4 ; start restoring sign
978
979 ; saturates to +2047
980 movq mm2, [mmx_32767_minus_2047]
981 add eax, 2 ; advance index; sets the ZF tested by the jnz below
982 paddsw mm0, mm2 ; saturating add then subtract of 32767-2047 clamps to <= 2047
983 paddsw mm3, mm2
984 psubsw mm0, mm2
985 psubsw mm3, mm2
986
987 pxor mm0, mm1 ; finish restoring sign (negative lanes can reach -2048)
988 pxor mm3, mm4
989 movq [edx + 8*eax + 8*16 - 2*8], mm0 ; - 2*8 because eax was already advanced
990 movq [edx + 8*eax + 8*16+8 - 2*8], mm3
991 jnz near .loop ; flags from "add eax, 2" (the MMX instructions in between do not modify EFLAGS)
992
993 ret
994
995 ;===========================================================================
996 ; Inter dequantization of one block of 64 int16 values, MMX registers plus pminsw.
997 ; void dequant_inter_xmm(int16_t * data,
998 ; const int16_t * const coeff,
999 ; const uint32_t quant);
1000 ; Clobbers eax, ecx, edx; no emms is executed here.
1001 ;===========================================================================
1002
1003 ; this is the same as dequant_inter_mmx,
1004 ; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
1005 ; register roles: edx = data, ecx = coeff, eax = qword index counting -16..0, mm6 = add, mm7 = 2*quant
1006 align ALIGN
1007 cglobal dequant_inter_xmm
1008 dequant_inter_xmm:
1009
1010 mov edx, [esp+ 4] ; data
1011 mov ecx, [esp+ 8] ; coeff
1012 mov eax, [esp+12] ; quant
1013 movq mm6, [mmx_add + eax*8 - 8] ; quant or quant-1
1014 movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant
1015 mov eax, -16 ; loop ends when eax reaches 0
1016
1017 align ALIGN
1018 .loop
1019 movq mm0, [ecx+8*eax+8*16] ; c = coeff[i]
1020 movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1]
1021 pxor mm1, mm1
1022 pxor mm4, mm4
1023 pcmpgtw mm1, mm0 ; sign(c)
1024 pcmpgtw mm4, mm3 ; sign(c')
1025 pxor mm2, mm2
1026 pxor mm5, mm5
1027 pcmpeqw mm2, mm0 ; c is zero
1028 pcmpeqw mm5, mm3 ; c' is zero
1029 pandn mm2, mm6 ; offset = isZero ? 0 : quant_add
1030 pandn mm5, mm6
1031 pxor mm0, mm1 ; negate if negative
1032 pxor mm3, mm4 ; negate if negative
1033 psubw mm0, mm1
1034 psubw mm3, mm4
1035 pmullw mm0, mm7 ; *= 2Q
1036 pmullw mm3, mm7 ; *= 2Q
1037 paddw mm0, mm2 ; + offset
1038 paddw mm3, mm5 ; + offset
1039 paddw mm0, mm1 ; start restoring sign
1040 paddw mm3, mm4 ; start restoring sign
1041
1042 ; saturates to +2047
1043 movq mm2, [mmx_2047]
1044 pminsw mm0, mm2
1045 add eax, 2 ; advance index; sets the ZF tested by the jnz below
1046 pminsw mm3, mm2
1047
1048 pxor mm0, mm1 ; finish restoring sign
1049 pxor mm3, mm4 ; finish restoring sign
1050 movq [edx + 8*eax + 8*16 - 2*8], mm0 ; - 2*8 because eax was already advanced
1051 movq [edx + 8*eax + 8*16+8 - 2*8], mm3
1052 jnz near .loop ; flags from "add eax, 2" (the MMX instructions in between do not modify EFLAGS)
1053
1054 ret
1055
1056 ;===========================================================================
1057 ; SSE2 inter dequantization of one block of 64 int16 values (4 loop passes x 16 words).
1058 ; void dequant_inter_sse2(int16_t * data,
1059 ; const int16_t * const coeff,
1060 ; const uint32_t quant);
1061 ; Same arithmetic as dequant_inter_mmx; data and coeff are accessed with movdqa (16-byte alignment required). No emms is executed here.
1062 ;===========================================================================
1063 align ALIGN
1064 cglobal dequant_inter_sse2
1065 dequant_inter_sse2
1066 mov edx, [esp + 4] ; data
1067 mov ecx, [esp + 8] ; coeff
1068 mov eax, [esp + 12] ; quant
1069 movq mm6, [mmx_add + eax * 8 - 8] ; quant or quant-1
1070 movq mm7, [mmx_mul + eax * 8 - 8] ; 2*quant
1071 movq2dq xmm6, mm6 ; broadcast both 4-word constants to 8 words
1072 movq2dq xmm7, mm7
1073 movlhps xmm6, xmm6
1074 movlhps xmm7, xmm7
1075 mov eax, -16 ; qword index; loop ends when eax reaches 0
1076
1077 align ALIGN
1078 .loop
1079 movdqa xmm0, [ecx + 8*16 + 8*eax] ; c = coeff[i]
1080 movdqa xmm3, [ecx + 8*16 + 8*eax + 16]
1081
1082 pxor xmm1, xmm1
1083 pxor xmm4, xmm4
1084 pcmpgtw xmm1, xmm0 ; sign(c)
1085 pcmpgtw xmm4, xmm3
1086 pxor xmm2, xmm2
1087 pxor xmm5, xmm5
1088 pcmpeqw xmm2, xmm0 ; c is zero
1089 pcmpeqw xmm5, xmm3
1090 pandn xmm2, xmm6 ; offset = isZero ? 0 : quant_add
1091 pandn xmm5, xmm6
1092 pxor xmm0, xmm1 ; negate if negative
1093 pxor xmm3, xmm4
1094 psubw xmm0, xmm1
1095 psubw xmm3, xmm4
1096 pmullw xmm0, xmm7 ; *= 2Q
1097 pmullw xmm3, xmm7
1098 paddw xmm0, xmm2 ; + offset
1099 paddw xmm3, xmm5
1100
1101 paddw xmm0, xmm1 ; start restoring sign
1102 paddw xmm3, xmm4
1103
1104 ; saturates to +2047
1105 movdqa xmm2, [sse2_2047]
1106 pminsw xmm0, xmm2
1107 add eax, 4 ; advance index; sets the ZF tested by the jnz below
1108 pminsw xmm3, xmm2
1109
1110 pxor xmm0, xmm1 ; finish restoring sign
1111 pxor xmm3, xmm4
1112 movdqa [edx + 8*16 - 8*4 + 8*eax], xmm0 ; - 8*4 because eax was already advanced
1113 movdqa [edx + 8*16 - 8*4 + 8*eax + 16], xmm3
1114 jnz near .loop ; flags from "add eax, 4" (the SIMD instructions in between do not modify EFLAGS)
1115
1116 ret