ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/svn/tags/branch-release-1-0/xvidcore/src/quant/x86_asm/quantize4_xmm.asm
Revision: 908
Committed: Thu Mar 6 21:12:04 2003 UTC (21 years, 7 months ago)
File size: 22794 byte(s)
Log Message:
This commit was manufactured by cvs2svn to create tag 'branch-release-1-0'.

File Contents

# Content
1 ;/******************************************************************************
2 ; * *
3 ; * This file is part of XviD, a free MPEG-4 video encoder/decoder *
4 ; * *
5 ; * XviD is an implementation of a part of one or more MPEG-4 Video tools *
6 ; * as specified in ISO/IEC 14496-2 standard. Those intending to use this *
7 ; * software module in hardware or software products are advised that its *
8 ; * use may infringe existing patents or copyrights, and any such use *
9 ; * would be at such party's own risk. The original developer of this *
10 ; * software module and his/her company, and subsequent editors and their *
11 ; * companies, will have no liability for use of this software or *
12 ; * modifications or derivatives thereof. *
13 ; * *
14 ; * XviD is free software; you can redistribute it and/or modify it *
15 ; * under the terms of the GNU General Public License as published by *
16 ; * the Free Software Foundation; either version 2 of the License, or *
17 ; * (at your option) any later version. *
18 ; * *
19 ; * XviD is distributed in the hope that it will be useful, but *
20 ; * WITHOUT ANY WARRANTY; without even the implied warranty of *
21 ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
22 ; * GNU General Public License for more details. *
23 ; * *
24 ; * You should have received a copy of the GNU General Public License *
25 ; * along with this program; if not, write to the Free Software *
26 ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
27 ; * *
28 ; ******************************************************************************/
29 ;
30 ;/******************************************************************************
31 ; * quant4 bugs have been fixed: (a) the overflow bug for matrix elements *
32 ; * equal to 1 or 2 is fixed by substituting pmulhw with pmulhuw (iSSE) *
33 ; * and using multiplier 0ffffh instead of 10001h for matrix element = 1 *
34 ; * (in that case, 1 is added before multiplying; that additional 1 comes *
35 ; * from intra_matrix1); (b) the rounding error for large coefficients and *
36 ; * matrix elements is fixed by a two-step approach: a first approximation *
37 ; * (rounded down) is found as usual; the result is multiplied by the matrix *
38 ; * element and the mismatch is used to calculate the correction. *
39 ; ******************************************************************************/
40 ;
41 ; _3dne functions are compatible with iSSE, but are optimized specifically for
42 ; K7 pipelines
43 ;
44 ;------------------------------------------------------------------------------
45 ; 09.12.2002 Athlon optimizations contributed by Jaan Kalda
46 ;------------------------------------------------------------------------------
47
48
49 ; data/text alignment: ALIGN is the byte boundary passed to the "align ALIGN" directives in this file
50 %define ALIGN 8
51 %define SATURATE
52 ; SATURATE is defined above but is not referenced in the visible part of this file
53 bits 32 ; assemble as 32-bit code
54 ; data section; the non-COFF form additionally requests 8-byte section alignment
55 %ifdef FORMAT_COFF
56 SECTION .data data
57 %else
58 SECTION .data data align=8
59 %endif
60 ; cglobal <name>: export <name>. When PREFIX is defined the exported symbol is _<name> and <name> is aliased to it.
61 %macro cglobal 1
62 %ifdef PREFIX
63 global _%1 ; underscore-prefixed symbol
64 %define %1 _%1
65 %else
66 global %1 ; plain symbol
67 %endif
68 %endmacro
69 ; cextern <name>: import <name>. When PREFIX is defined the imported symbol is _<name> and <name> is aliased to it.
70 %macro cextern 1
71 %ifdef PREFIX
72 extern _%1 ; underscore-prefixed symbol
73 %define %1 _%1
74 %else
75 extern %1 ; plain symbol
76 %endif
77 %endmacro
78 align 8
79 mmzero dd 0,0 ; 8 zero bytes; the functions below keep its address in a register and movq from it to clear MMX registers
80
81 mmx_one times 4 dw 1 ; four words of 1; pmaddwd operand used by quant4_inter_xmm to add word pairs into dwords
82
83 ;===========================================================================
84 ;
85 ; divide by 2Q tables: 16-bit reciprocal multipliers for quant = 1..31,
86 ; four identical words per entry, read as [table + quant*8 - 8]
87 ;===========================================================================
88
89 align ALIGN
90 mmx_divs ; (1<<15)/quant + 1; read by the .loop paths (quant != 1 and quant <= 19)
91 %assign i 1
92 %rep 31
93 times 4 dw ((1 << 15) / i + 1)
94 %assign i i+1
95 %endrep
96
97 align ALIGN
98 mmx_div ; (1<<16)/quant + 1; read by the .lloop paths (quant > 19); the i = 1 value (65537) does not fit in a word
99 %assign i 1
100 %rep 31
101 times 4 dw ((1 << 16) / i + 1)
102 %assign i i+1
103 %endrep
104
105
106 ;===========================================================================
107 ;
108 ; intra matrix
109 ;
110 ;===========================================================================
111 ; FIXX <d>: emit one word holding (1 << 16) / d + 1. Not invoked in the visible part of this file.
112 %macro FIXX 1
113 dw (1 << 16) / (%1) + 1
114 %endmacro
115
116 cextern intra_matrix_fixl ; per-coefficient multipliers for the first (rounded-down) division approximation
117 cextern intra_matrix_fix ; per-coefficient multipliers applied to the mismatch (correction step)
118 cextern intra_matrix1 ; per-coefficient words added to (|level| << 4) before the multiplication
119 cextern intra_matrix ; per-coefficient matrix words (multiplied back in the quantizer, scaled by quant in the dequantizer)
120
121 ;===========================================================================
122 ;
123 ; inter matrix (same four tables as above, for the inter functions; contents are defined outside this file)
124 ;
125 ;===========================================================================
126
127 cextern inter_matrix1
128 cextern inter_matrix
129 cextern inter_matrix_fix
130 cextern inter_matrix_fixl
131 ; VM18P / VM18Q: numerator and denominator of the rounding factor used to build the quantd table below
132 ; nopN: padding instructions that leave register values unchanged (nop4 / nop7 are raw byte encodings; nop3 and nop7 are not used in the visible code)
133 %define VM18P 3
134 %define VM18Q 4
135 %define nop4 DB 08Dh,074h,026h,0
136 %define nop3 add esp,byte 0
137 %define nop2 mov esp,esp
138 %define nop7 db 08dh,02ch,02dh,0,0,0,0
139 %define nop6 add ebp,dword 0
140 ;===========================================================================
141 ;
142 ; quantd table
143 ;
144 ;===========================================================================
145 ; quantd entry for quant = i (1..31): (VM18P*i + VM18Q/2) / VM18Q with integer division, replicated in four words
146 ; read by quant4_intra_xmm as [quantd + quant*8 - 8]
147 quantd
148 %assign i 1
149 %rep 31
150 times 4 dw (((VM18P*i) + (VM18Q/2)) / VM18Q)
151 %assign i i+1
152 %endrep
153
154 ;===========================================================================
155 ;
156 ; multiply by quant table (entries hold quant itself; the dequantizers shift it after loading)
157 ;
158 ;===========================================================================
159 ; mmx_mul_quant entry for quant = i (1..31): i replicated in four words
160 ; read by the dequantizers as [mmx_mul_quant + quant*8 - 8]
161 mmx_mul_quant
162 %assign i 1
163 %rep 31
164 times 4 dw i
165 %assign i i+1
166 %endrep
167
168 ;===========================================================================
169 ;
170 ; saturation limits
171 ;
172 ;===========================================================================
173
174 align 16
175 ; the five constants below are not referenced in the visible part of this file
176 mmx_32767_minus_2047 times 4 dw (32767-2047)
177 mmx_32768_minus_2048 times 4 dw (32768-2048)
178 mmx_2047 times 4 dw 2047
179 mmx_minus_2048 times 4 dw (-2048)
180 zero times 4 dw 0
181 ; int_div: 256 dwords; entry 0 is 0, entry d (1..255) is (1<<17)/d + 1. quant4_intra_xmm indexes it with dcscalar.
182 int_div
183 dd 0
184 %assign i 1
185 %rep 255
186 dd (1 << 17) / ( i) + 1
187 %assign i i+1
188 %endrep
189
190 section .text
191
192 ;===========================================================================
193 ; Quantizes 64 words from data into coeff, then overwrites coeff[0] with the DC term divided by dcscalar.
194 ; void quant4_intra_xmm(int16_t * coeff,
195 ; const int16_t const * data,
196 ; const uint32_t quant,
197 ; const uint32_t dcscalar);
198 ; Arguments are read from the stack (first argument at [esp+4] on entry); esi, edi and ebx are saved and restored.
199 ;===========================================================================
200 ; Three loop variants: .q1loop for quant == 1, .lloop for quant > 19, .loop otherwise. Each runs 8 times, 8 words per pass.
201 align ALIGN
202 cglobal quant4_intra_xmm
203 quant4_intra_xmm
204
205 mov eax, [esp + 8] ; data
206 mov ecx, [esp + 12] ; quant
207 mov edx, [esp + 4] ; coeff
208 push esi ; saved registers: 12 bytes, so arguments now start at [esp + 16]
209 push edi
210 push ebx
211 nop
212 mov edi,mmzero ; edi -> 8 zero bytes, used to re-zero mm0/mm3 inside the loops
213 mov esi,-14 ; loop counter in qword units: -14, -12, ..., 0
214 pxor mm0,mm0 ; mm0 = mm3 = 0 so the first psubw yields the negated input
215 pxor mm3,mm3
216 cmp ecx,byte 1
217 je near .q1loop ; quant == 1
218 cmp ecx,byte 19
219 jg near .lloop ; quant > 19
220 nop6 ; padding
221
222
223 align ALIGN
224 .loop
225 movq mm1, [eax + 8*esi+112] ; mm1 = 4 input words
226 psubw mm0,mm1 ; mm0 = -mm1 (mm0 is zero here)
227 movq mm4, [eax + 8*esi + 120] ; mm4 = next 4 input words
228 psubw mm3,mm4 ; mm3 = -mm4
229 pmaxsw mm0,mm1 ; mm0 = max(-x, x) = |x|
230 pmaxsw mm3,mm4
231 nop2 ; padding
232 psraw mm1,15 ; mm1 = sign mask of the input (0 or -1 per word)
233 psraw mm4,15
234 psllw mm0, 4 ; level << 4
235 psllw mm3, 4 ;
236 paddw mm0, [intra_matrix1 + 8*esi+112] ; add the intra_matrix1 term: mm0 is the value to be divided
237 paddw mm3, [intra_matrix1 + 8*esi+120]
238 movq mm5,[intra_matrix_fixl + 8*esi+112] ; multipliers for the first approximation
239 movq mm7,[intra_matrix_fixl + 8*esi+120]
240 pmulhuw mm5,mm0 ; mm5 = first approximation of the division (high word of the product)
241 pmulhuw mm7,mm3
242 mov esp,esp ; padding
243 movq mm2,[intra_matrix + 8*esi+112] ; matrix entries
244 movq mm6,[intra_matrix + 8*esi+120]
245 pmullw mm2,mm5 ; test value = approximation * matrix entry
246 pmullw mm6,mm7
247 psubw mm0,mm2 ; mismatch = dividend - test value
248 psubw mm3,mm6
249 nop4 ; padding
250 movq mm2,[quantd + ecx * 8 - 8] ; rounding term for this quant
251 movq mm6,[mmx_divs + ecx * 8 - 8] ; (1<<15)/quant + 1
252 paddw mm5,mm2 ; first approximation + quantd
253 paddw mm7,mm2
254 mov esp,esp ; padding
255 pmulhuw mm0,[intra_matrix_fix + 8*esi+112] ; correction derived from the mismatch
256 pmulhuw mm3,[intra_matrix_fix + 8*esi+120]
257 paddw mm5,mm0 ; corrected quotient + quantd
258 paddw mm7,mm3
259 movq mm0,[edi] ; re-zero mm0/mm3 for the next pass
260 movq mm3,[edi]
261 pmulhuw mm5, mm6 ; high word of the product with (1<<15)/quant + 1
262 pmulhuw mm7, mm6 ; (level + quantd) / quant (0<quant<32)
263 pxor mm5, mm1 ; restore the sign: (v ^ s) - s
264 pxor mm7, mm4 ;
265 psubw mm5, mm1 ; undisplace
266 psubw mm7, mm4 ;
267 movq [edx + 8*esi+112], mm5 ; store 8 quantized words
268 movq [edx + 8*esi +120], mm7
269 add esi,byte 2 ; advance 16 bytes
270 jng near .loop ; repeat while esi <= 0
271 .done
272 ; DC term: coeff[0] = (data[0] +/- dcscalar/2) * int_div[dcscalar] >> 17
273
274
275 mov esi, [esp + 12 + 16] ; esi = dcscalar
276 movsx ecx, word [eax] ; ecx = data[0], sign-extended
277 mov edi,ecx
278 mov edx,[esp + 12 + 16] ; edx = dcscalar
279 shr edx, 1 ; edx = dcscalar / 2
280 sar edi,31 ; edi = sign mask of data[0] (cdq is vectorpath)
281 xor edx,edi ; edx = dcscalar/2, or -(dcscalar/2) - 1 when data[0] is negative
282 sub ecx,edi ; ecx = data[0], plus 1 when negative (cancels the -1 above)
283 add ecx,edx ; ecx = data[0] + dcscalar/2, or data[0] - dcscalar/2 when negative
284 mov edx,[dword esp + 12 + 4] ; edx = coeff
285 mov esi,[int_div+4*esi] ; esi = (1<<17)/dcscalar + 1
286 imul ecx,esi
287 sar ecx,17 ; drop the 17 fraction bits of the reciprocal
288 ; idiv cx ; ecx = edi:ecx / dcscalar
289
290 mov ebx,[esp] ; restore saved registers
291 mov edi,[esp+4]
292 mov esi,[esp+8]
293 add esp,byte 12
294 mov [edx], cx ; coeff[0] = low 16 bits of ecx
295
296 ret
297
298 align ALIGN
299 .q1loop ; quant == 1: same steps as .loop, but the final divide by 2*quant is a shift right by 1
300 movq mm1, [eax + 8*esi+112] ; mm1 = 4 input words
301 psubw mm0,mm1 ; mm0 = -mm1 (mm0 is zero here)
302 movq mm4, [eax + 8*esi+120] ; mm4 = next 4 input words
303 psubw mm3,mm4 ; mm3 = -mm4
304 pmaxsw mm0,mm1 ; mm0 = |x|
305 pmaxsw mm3,mm4
306 nop2 ; padding
307 psraw mm1,15 ; sign mask of the input
308 psraw mm4,15
309 psllw mm0, 4 ; level << 4
310 psllw mm3, 4
311 paddw mm0, [intra_matrix1 + 8*esi+112] ;mm0 is to be divided
312 paddw mm3, [intra_matrix1 + 8*esi+120] ; intra1 contains fix for division by 1
313 movq mm5,[intra_matrix_fixl + 8*esi+112] ;with rounding down
314 movq mm7,[intra_matrix_fixl + 8*esi+120]
315 pmulhuw mm5,mm0
316 pmulhuw mm7,mm3 ;mm7: first approx of division
317 mov esp,esp ; padding
318 movq mm2,[intra_matrix + 8*esi+112]
319 movq mm6,[intra_matrix + 8*esi+120] ; matrix entries
320 pmullw mm2,mm5 ;test value <= original
321 pmullw mm6,mm7
322 psubw mm0,mm2 ;mismatch
323 psubw mm3,mm6
324 nop4 ; padding
325 movq mm2,[quantd + ecx * 8 - 8] ; rounding term for this quant
326 paddw mm5,mm2 ;first approx with quantd
327 paddw mm7,mm2
328 mov esp,esp ; padding
329 pmulhuw mm0,[intra_matrix_fix + 8*esi+112] ;correction
330 pmulhuw mm3,[intra_matrix_fix + 8*esi+120]
331 paddw mm5,mm0 ;final result with quantd
332 paddw mm7,mm3
333 movq mm0,[edi] ; re-zero mm0/mm3 for the next pass
334 movq mm3,[edi]
335 mov esp,esp ; padding
336 psrlw mm5, 1 ; (level + quantd) /2 (quant = 1)
337 psrlw mm7, 1
338 pxor mm5, mm1 ; restore the sign: (v ^ s) - s
339 pxor mm7, mm4 ;
340 psubw mm5, mm1 ; undisplace
341 psubw mm7, mm4 ;
342 movq [edx + 8*esi+112], mm5 ; store 8 quantized words
343 movq [edx + 8*esi +120], mm7
344 add esi,byte 2 ; advance 16 bytes
345 jng near .q1loop ; repeat while esi <= 0
346 jmp near .done
347
348 align 8
349 .lloop ; quant > 19: multiplies by the mmx_div entry, then shifts right by 1
350 movq mm1, [eax + 8*esi+112] ; mm1 = 4 input words
351 psubw mm0,mm1 ; mm0 = -mm1 (mm0 is zero here)
352 movq mm4, [eax + 8*esi+120] ; mm4 = next 4 input words
353 psubw mm3,mm4 ; mm3 = -mm4
354 pmaxsw mm0,mm1 ; mm0 = |x|
355 pmaxsw mm3,mm4
356 nop2 ; padding
357 psraw mm1,15 ; sign mask of the input
358 psraw mm4,15
359 psllw mm0, 4 ; level << 4
360 psllw mm3, 4 ;
361 paddw mm0, [intra_matrix1 + 8*esi+112] ;mm0 is to be divided intra1 contains fix for division by 1
362 paddw mm3, [intra_matrix1 + 8*esi+120]
363 movq mm5,[intra_matrix_fixl + 8*esi+112]
364 movq mm7,[intra_matrix_fixl + 8*esi+120]
365 pmulhuw mm5,mm0
366 pmulhuw mm7,mm3 ;mm7: first approx of division
367 mov esp,esp ; padding
368 movq mm2,[intra_matrix + 8*esi+112]
369 movq mm6,[intra_matrix + 8*esi+120]
370 pmullw mm2,mm5 ;test value <= original
371 pmullw mm6,mm7
372 psubw mm0,mm2 ;mismatch
373 psubw mm3,mm6
374 nop4 ; padding
375 movq mm2,[quantd + ecx * 8 - 8] ; rounding term for this quant
376 movq mm6,[mmx_div + ecx * 8 - 8] ; (1<<16)/quant + 1
377 paddw mm5,mm2 ;first approx with quantd
378 paddw mm7,mm2
379 mov esp,esp ; padding
380 pmulhuw mm0,[intra_matrix_fix + 8*esi+112] ;correction
381 pmulhuw mm3,[intra_matrix_fix + 8*esi+120]
382 paddw mm5,mm0 ;final result with quantd
383 paddw mm7,mm3
384 movq mm0,[edi] ; re-zero mm0/mm3 for the next pass
385 movq mm3,[edi]
386 mov esp,esp ; padding
387 pmulhuw mm5, mm6 ; high word of the product with (1<<16)/quant + 1
388 pmulhuw mm7, mm6 ; (level + quantd) / quant (0<quant<32)
389 psrlw mm5, 1 ; (level + quantd) / (2*quant)
390 psrlw mm7, 1
391 pxor mm5, mm1 ; restore the sign: (v ^ s) - s
392 pxor mm7, mm4 ;
393 psubw mm5, mm1 ; undisplace
394 psubw mm7, mm4 ;
395 movq [edx + 8*esi+112], mm5 ; store 8 quantized words
396 movq [edx + 8*esi +120], mm7
397 add esi,byte 2 ; advance 16 bytes
398 jng near .lloop ; repeat while esi <= 0
399 jmp near .done
400
401 ;===========================================================================
402 ; Quantizes 64 words from data into coeff and returns in eax the sum of the quantized magnitudes.
403 ; uint32_t quant4_inter_xmm(int16_t * coeff,
404 ; const int16_t const * data,
405 ; const uint32_t quant);
406 ; Uses a qword-aligned 16-byte stack scratch at ebx: [ebx] = second-half magnitudes of the last pass, [ebx+8] = running sum.
407 ;===========================================================================
408
409 align ALIGN
410 cglobal quant4_inter_xmm
411 quant4_inter_xmm
412
413 mov eax, [esp + 8] ; data
414 mov ecx, [esp + 12] ; quant
415 mov edx, [esp + 4] ; coeff
416 push esi ; saved registers, restored at .done
417 push edi
418 push ebx
419 nop
420 mov edi,mmzero ; edi -> 8 zero bytes, used to re-zero mm0/mm3 inside the loops
421 mov esi,-14 ; loop counter in qword units: -14, -12, ..., 0
422 mov ebx,esp ; (overwritten by the lea two lines below)
423 sub esp,byte 24 ; reserve room for the 16-byte scratch plus alignment slack
424 lea ebx,[esp+8]
425 and ebx,byte -8 ;align 8
426 pxor mm0,mm0
427 pxor mm3,mm3
428 movq [byte ebx],mm0 ; scratch[0] = 0
429 db 0Fh, 7Fh, 44h, 23h, 8 ;movq [ebx+8],mm0 (hand-encoded): running sum = 0
430 cmp ecx,byte 1
431 je near .q1loop ; quant == 1
432 cmp ecx,byte 19
433 jg near .lloop ; quant > 19
434 nop
435
436
437 align ALIGN
438 .loop
439 movq mm1, [eax + 8*esi+112] ; mm1 = 4 input words
440 psubw mm0,mm1 ; mm0 = -mm1 (mm0 is zero here)
441 movq mm4, [eax + 8*esi + 120] ; mm4 = next 4 input words
442 psubw mm3,mm4 ; mm3 = -mm4
443 pmaxsw mm0,mm1 ; mm0 = max(-x, x) = |x|
444 pmaxsw mm3,mm4
445 nop2 ; padding
446 psraw mm1,15 ; sign mask of the input (0 or -1 per word)
447 psraw mm4,15
448 psllw mm0, 4 ; level << 4
449 psllw mm3, 4 ;
450 paddw mm0, [inter_matrix1 + 8*esi+112] ; add the inter_matrix1 term: mm0 is the value to be divided
451 paddw mm3, [inter_matrix1 + 8*esi+120]
452 movq mm5,[inter_matrix_fixl + 8*esi+112] ; multipliers for the first approximation
453 movq mm7,[inter_matrix_fixl + 8*esi+120]
454 pmulhuw mm5,mm0 ; first approximation of the division
455 pmulhuw mm7,mm3
456 mov esp,esp ; padding
457 movq mm2,[inter_matrix + 8*esi+112] ; matrix entries
458 movq mm6,[inter_matrix + 8*esi+120]
459 pmullw mm2,mm5 ; test value = approximation * matrix entry
460 pmullw mm6,mm7
461 psubw mm0,mm2 ; mismatch = dividend - test value
462 psubw mm3,mm6
463 movq mm2,[byte ebx] ; mm2 = second-half magnitudes saved by the previous pass (zero on the first)
464 movq mm6,[mmx_divs + ecx * 8 - 8] ; (1<<15)/quant + 1
465 pmulhuw mm0,[inter_matrix_fix + 8*esi+112] ; correction derived from the mismatch
466 pmulhuw mm3,[inter_matrix_fix + 8*esi+120]
467 paddw mm2,[ebx+8] ;sum
468 paddw mm5,mm0 ; corrected quotient
469 paddw mm7,mm3
470 movq mm0,[edi] ; re-zero mm0/mm3 for the next pass
471 movq mm3,[edi]
472 pmulhuw mm5, mm6 ; high word of the product with (1<<15)/quant + 1
473 pmulhuw mm7, mm6 ; (level ) / quant (0<quant<32)
474 add esi,byte 2 ; advance early; the stores below compensate with -16, and jng uses these flags
475 paddw mm2,mm5 ;sum += x1
476 movq [ebx],mm7 ;store x2
477 pxor mm5, mm1 ; restore the sign: (v ^ s) - s
478 pxor mm7, mm4 ;
479 psubw mm5, mm1 ; undisplace
480 psubw mm7, mm4 ;
481 db 0Fh, 7Fh, 54h, 23h, 08 ;movq [ebx+8],mm2 ;store sum
482 movq [edx + 8*esi+112-16], mm5 ; store 8 quantized words
483 movq [edx + 8*esi +120-16], mm7
484 jng near .loop ; repeat while esi <= 0 (flags from the add above; only MMX instructions in between)
485 .done
486 ; add the last saved magnitudes, restore registers, then reduce the four word sums to one total in eax
487
488 paddw mm2,[ebx] ; include the second-half magnitudes of the final pass
489 mov ebx,[esp+24] ; restore saved registers (they sit above the 24-byte scratch area)
490 mov edi,[esp+4+24]
491 mov esi,[esp+8+24]
492 add esp,byte 12+24
493 pmaddwd mm2, [mmx_one] ; add adjacent words into two dwords
494 punpckldq mm0,mm2 ;get low dw to mm0:high
495 paddd mm0,mm2 ; high dword = low sum + high sum
496 punpckhdq mm0,mm0 ;get result to low
497 movd eax, mm0 ; return value
498 ret
499
500 align ALIGN
501 .q1loop ; quant == 1: same steps as .loop, but the final divide is a shift right by 1
502 movq mm1, [eax + 8*esi+112] ; mm1 = 4 input words
503 psubw mm0,mm1 ; mm0 = -mm1 (mm0 is zero here)
504 movq mm4, [eax + 8*esi+120] ; mm4 = next 4 input words
505 psubw mm3,mm4 ; mm3 = -mm4
506 pmaxsw mm0,mm1 ; mm0 = |x|
507 pmaxsw mm3,mm4
508 nop2 ; padding
509 psraw mm1,15 ; sign mask of the input
510 psraw mm4,15
511 psllw mm0, 4 ; level << 4
512 psllw mm3, 4
513 paddw mm0, [inter_matrix1 + 8*esi+112] ;mm0 is to be divided
514 paddw mm3, [inter_matrix1 + 8*esi+120] ; inter1 contains fix for division by 1
515 movq mm5,[inter_matrix_fixl + 8*esi+112] ;with rounding down
516 movq mm7,[inter_matrix_fixl + 8*esi+120]
517 pmulhuw mm5,mm0
518 pmulhuw mm7,mm3 ;mm7: first approx of division
519 mov esp,esp ; padding
520 movq mm2,[inter_matrix + 8*esi+112]
521 movq mm6,[inter_matrix + 8*esi+120] ; matrix entries
522 pmullw mm2,mm5 ;test value <= original
523 pmullw mm6,mm7
524 psubw mm0,mm2 ;mismatch
525 psubw mm3,mm6
526 movq mm2,[byte ebx] ; second-half magnitudes saved by the previous pass
527 pmulhuw mm0,[inter_matrix_fix + 8*esi+112] ;correction
528 pmulhuw mm3,[inter_matrix_fix + 8*esi+120]
529 paddw mm2,[ebx+8] ;sum
530 paddw mm5,mm0 ;final result
531 paddw mm7,mm3
532 movq mm0,[edi] ; re-zero mm0/mm3 for the next pass
533 movq mm3,[edi]
534 psrlw mm5, 1 ; (level ) /2 (quant = 1)
535 psrlw mm7, 1
536 add esi,byte 2 ; advance early; the stores below compensate with -16
537 paddw mm2,mm5 ;sum += x1
538 movq [ebx],mm7 ;store x2
539 pxor mm5, mm1 ; restore the sign: (v ^ s) - s
540 pxor mm7, mm4 ;
541 psubw mm5, mm1 ; undisplace
542 psubw mm7, mm4 ;
543 movq [ebx+8],mm2 ;store sum
544 movq [edx + 8*esi+112-16], mm5 ; store 8 quantized words
545 movq [edx + 8*esi +120-16], mm7
546 jng near .q1loop ; repeat while esi <= 0 (flags from the add above)
547 jmp near .done
548
549 align 8
550 .lloop ; quant > 19: multiplies by the mmx_div entry, then shifts right by 1
551 movq mm1, [eax + 8*esi+112] ; mm1 = 4 input words
552 psubw mm0,mm1 ; mm0 = -mm1 (mm0 is zero here)
553 movq mm4, [eax + 8*esi+120] ; mm4 = next 4 input words
554 psubw mm3,mm4 ; mm3 = -mm4
555 pmaxsw mm0,mm1 ; mm0 = |x|
556 pmaxsw mm3,mm4
557 nop2 ; padding
558 psraw mm1,15 ; sign mask of the input
559 psraw mm4,15
560 psllw mm0, 4 ; level << 4
561 psllw mm3, 4 ;
562 paddw mm0, [inter_matrix1 + 8*esi+112] ;mm0 is to be divided inter1 contains fix for division by 1
563 paddw mm3, [inter_matrix1 + 8*esi+120]
564 movq mm5,[inter_matrix_fixl + 8*esi+112]
565 movq mm7,[inter_matrix_fixl + 8*esi+120]
566 pmulhuw mm5,mm0
567 pmulhuw mm7,mm3 ;mm7: first approx of division
568 mov esp,esp ; padding
569 movq mm2,[inter_matrix + 8*esi+112]
570 movq mm6,[inter_matrix + 8*esi+120]
571 pmullw mm2,mm5 ;test value <= original
572 pmullw mm6,mm7
573 psubw mm0,mm2 ;mismatch
574 psubw mm3,mm6
575 movq mm2,[byte ebx] ; second-half magnitudes saved by the previous pass
576 movq mm6,[mmx_div + ecx * 8 - 8] ; (1<<16)/quant + 1
577 pmulhuw mm0,[inter_matrix_fix + 8*esi+112] ;correction
578 pmulhuw mm3,[inter_matrix_fix + 8*esi+120]
579 paddw mm2,[ebx+8] ;sum
580 paddw mm5,mm0 ;final result
581 paddw mm7,mm3
582 movq mm0,[edi] ; re-zero mm0/mm3 for the next pass
583 movq mm3,[edi]
584 pmulhuw mm5, mm6 ; high word of the product with (1<<16)/quant + 1
585 pmulhuw mm7, mm6 ; (level ) / quant (0<quant<32)
586 add esi,byte 2 ; advance early; the stores below compensate with -16
587 psrlw mm5, 1 ; (level ) / (2*quant)
588 paddw mm2,mm5 ;sum += x1
589 psrlw mm7, 1
590 movq [ebx],mm7 ;store x2
591 pxor mm5, mm1 ; restore the sign: (v ^ s) - s
592 pxor mm7, mm4 ;
593 psubw mm5, mm1 ; undisplace
594 psubw mm7, mm4 ;
595 db 0Fh, 7Fh, 54h, 23h, 08 ;movq [ebx+8],mm2 ;store sum
596 movq [edx + 8*esi+112-16], mm5 ; store 8 quantized words
597 movq [edx + 8*esi +120-16], mm7
598 jng near .lloop ; repeat while esi <= 0 (flags from the add above)
599 jmp near .done
600
601
602 ;===========================================================================
603 ;
604 ; void dequant4_intra_3dne(int16_t *data,
605 ; const int16_t const *coeff,
606 ; const uint32_t quant,
607 ; const uint32_t dcscalar);
608 ;
609 ;===========================================================================
610
611 ; Note: in order to saturate 'easily', we pre-shift the quantifier
612 ; (dequant4_intra_3dne shifts it left by 2). Then, the high-word of (coeff[]*matrix[i]*quant) are used to
613 ; build a saturating mask. It is non-zero only when an overflow occurred.
614 ; We thus avoid packing/unpacking toward double-word.
615 ; Moreover, we perform the mult (matrix[i]*quant) first, instead of, e.g.,
616 ; (coeff[i]*matrix[i]). This is less prone to overflow if coeff[] are not
617 ; checked. Input ranges are: coeff in [-127,127], inter_matrix in [1..255],
618 ; and quant in [1..31].
619 ; DEQUANT4INTRAMMX n: dequantize the 8 words at byte offset 16*n. Expects ecx = coeff, edx = data, eax -> {zero qword, pre-shifted quant qword}, mm0 = mm3 = 0.
620 ;********************************************************************
621 %macro DEQUANT4INTRAMMX 1
622 movq mm1, [byte ecx+ 16 * %1] ; mm1 = c = 4 coefficient words
623 movq mm4, [ecx+ 16 * %1 +8]; mm4 = c' = next 4 coefficient words
624 psubw mm0,mm1 ; mm0 = -c (mm0 is zero here)
625 psubw mm3,mm4
626 pmaxsw mm0,mm1 ; mm0 = |c|
627 pmaxsw mm3,mm4
628 psraw mm1,15 ; mm1 = sign mask of c
629 psraw mm4,15
630 %if %1
631 movq mm2,[eax+8] ;preshifted quant
632 movq mm7,[eax+8]
633 %endif
634 pmullw mm2, [intra_matrix + 16 * %1 ] ; matrix[i]*quant
635 pmullw mm7, [intra_matrix + 16 * %1 +8] ; matrix[i+1]*quant
636 movq mm5,mm0 ; keep |c| for the low-word multiply
637 movq mm6,mm3
638 pmulhw mm0, mm2 ; high of coeff*(matrix*quant)
639 pmulhw mm3, mm7 ; high of coeff*(matrix*quant)
640 pmullw mm2, mm5 ; low of coeff*(matrix*quant)
641 pmullw mm7, mm6 ; low of coeff*(matrix*quant)
642 pcmpgtw mm0, [eax] ; overflow mask: -1 where the high word is > 0
643 pcmpgtw mm3, [eax]
644 paddusw mm2, mm0 ; saturate the low word to 0FFFFh where overflow occurred
645 paddusw mm7, mm3
646 psrlw mm2, 5 ; undo the pre-shift and scale
647 psrlw mm7, 5
648 pxor mm2, mm1 ; start negating back
649 pxor mm7, mm4 ; start negating back
650 psubusw mm1, mm0 ; clear the sign mask where overflow occurred
651 psubusw mm4, mm3
652 movq mm0,[eax] ;zero
653 movq mm3,[eax] ;zero
654 psubw mm2, mm1 ; finish negating back
655 psubw mm7, mm4 ; finish negating back
656 movq [byte edx + 16 * %1], mm2 ; data[i]
657 movq [edx + 16 * %1 +8], mm7 ; data[i+1]
658 %endmacro
659
660 align 16
661 cglobal dequant4_intra_3dne
662 dequant4_intra_3dne:
663 ; Dequantizes 64 words with eight macro expansions; the DC term coeff[0]*dcscalar is clamped to [-2048, 2047] in between and stored to data[0].
664 mov eax, [esp+12] ; quant
665 mov ecx, [esp+8] ; coeff
666 movq mm7, [mmx_mul_quant + eax*8 - 8] ; mm7 = quant in four words
667 psllw mm7, 2 ; << 2. See comment.
668 mov edx, [esp+4] ; data
669 push ebx ; saved, restored before ret
670 movsx ebx,word [ecx] ; ebx = coeff[0], sign-extended
671 pxor mm0, mm0 ; mm0 = mm3 = 0, as the macro expects
672 pxor mm3, mm3
673 push esi ; saved, restored before ret
674 lea eax,[esp-28] ; candidate scratch address inside the area reserved on the next line
675 sub esp,byte 32 ; reserve 32 bytes of stack
676 and eax,byte -8 ;points to qword aligned space on stack
677 movq [eax],mm0 ; [eax] = zero qword
678 movq [eax+8],mm7 ; [eax+8] = pre-shifted quant
679 imul ebx,[esp+16+8+32] ; dcscalar
680 movq mm2,mm7 ; expansion 0 takes the quant from mm2/mm7 instead of reloading it
681
682
683 align 4
684 DEQUANT4INTRAMMX 0
685 mov esi,-2048 ; lower clamp bound
686 nop
687 cmp ebx,esi ; flags consumed by the cmovl after the next expansion (the macro holds only MMX instructions)
688 DEQUANT4INTRAMMX 1
689 cmovl ebx,esi ; ebx = max(ebx, -2048)
690 neg esi
691 sub esi,byte 1 ;2047
692 DEQUANT4INTRAMMX 2
693 cmp ebx,esi
694 cmovg ebx,esi ; ebx = min(ebx, 2047)
695 lea ebp,[byte ebp] ; padding: ebp is unchanged
696 DEQUANT4INTRAMMX 3
697 mov esi,[esp+32] ; restore esi
698 mov [byte edx], bx ; data[0] = clamped DC, replacing the value written by expansion 0
699 mov ebx,[esp+32+4] ; restore ebx
700 DEQUANT4INTRAMMX 4
701 DEQUANT4INTRAMMX 5
702 DEQUANT4INTRAMMX 6
703 DEQUANT4INTRAMMX 7
704 add esp,byte 32+8 ; release the scratch area and the two saved registers
705
706 ret
707
708 ;===========================================================================
709 ;
710 ; void dequant4_inter_3dne(int16_t * data,
711 ; const int16_t * const coeff,
712 ; const uint32_t quant);
713 ;
714 ;===========================================================================
715
716 ; Note: We use (2*c + sgn(c) - sgn(-c)) as multiplier
717 ; so we handle the 3 cases: c<0, c==0, and c>0 in one shot.
718 ; sgn(x) is the result of 'pcmpgtw 0,x': 0 if x>=0, -1 if x<0.
719 ; It's mixed with the extraction of the absolute value.
720
721 align 16
722 cglobal dequant4_inter_3dne
723 dequant4_inter_3dne:
724 ; Dequantizes 64 words (8 per pass), XOR-accumulates the results in mm6, then toggles bit 0 of data[63] when the accumulated parity bit is 0.
725 mov edx, [esp+ 4] ; data
726 mov ecx, [esp+ 8] ; coeff
727 mov eax, [esp+12] ; quant
728 movq mm7, [mmx_mul_quant + eax*8 - 8] ; mm7 = quant in four words
729 mov eax, -14 ; loop counter in qword units: -14, -12, ..., 0
730 paddw mm7, mm7 ; << 1
731 pxor mm6, mm6 ; mismatch sum
732 push esi ; saved, restored before ret
733 mov esi,mmzero ; esi -> 8 zero bytes
734 pxor mm1,mm1 ; mm1 = mm3 = 0 for the first pcmpgtw
735 pxor mm3,mm3
736 nop
737 nop4 ; padding
738
739 align 16
740 .loop
741 movq mm0, [ecx+8*eax + 7*16 ] ; mm0 = coeff[i]
742 pcmpgtw mm1, mm0 ; mm1 = sgn(c) (preserved)
743 movq mm2, [ecx+8*eax + 7*16 +8] ; mm2 = coeff[i+1]
744 pcmpgtw mm3, mm2 ; mm3 = sgn(c') (preserved)
745 paddsw mm0, mm1 ; c += sgn(c)
746 paddsw mm2, mm3 ; c += sgn(c')
747 paddw mm0, mm0 ; c *= 2
748 paddw mm2, mm2 ; c'*= 2
749
750 movq mm4, [esi] ; mm4 = mm5 = 0
751 movq mm5, [esi]
752 psubw mm4, mm0 ; -c
753 psubw mm5, mm2 ; -c'
754
755 psraw mm4, 16 ; mm4 = sgn(-c)
756 psraw mm5, 16 ; mm5 = sgn(-c')
757 psubsw mm0, mm4 ; c -= sgn(-c)
758 psubsw mm2, mm5 ; c' -= sgn(-c')
759 pxor mm0, mm1 ; finish changing sign if needed: mm0 = 2*|c| + 1, or 0 when c == 0
760 pxor mm2, mm3 ; finish changing sign if needed
761
762 ; we're short on register, here. Poor pairing...
763
764 movq mm4, mm7 ; (matrix*quant)
765 nop
766 pmullw mm4, [inter_matrix + 8*eax + 7*16] ; mm4 = matrix * (2*quant)
767 movq mm5, mm4
768 pmulhw mm5, mm0 ; high of c*(matrix*quant)
769 pmullw mm0, mm4 ; low of c*(matrix*quant)
770
771 movq mm4, mm7 ; (matrix*quant)
772 pmullw mm4, [inter_matrix + 8*eax + 7*16 + 8]
773 add eax,byte 2 ; advance early; the stores below compensate with -2*8, and jng uses these flags
774
775 pcmpgtw mm5, [esi] ; overflow mask: -1 where the high word is > 0
776 paddusw mm0, mm5 ; saturate the low word where overflow occurred
777 psrlw mm0, 5 ; undo the pre-shift and scale
778 pxor mm0, mm1 ; start restoring sign
779 psubusw mm1, mm5 ; clear the sign mask where overflow occurred
780
781 movq mm5, mm4
782 pmulhw mm5, mm2 ; high of c*(matrix*quant)
783 pmullw mm2, mm4 ; low of c*(matrix*quant)
784 psubw mm0, mm1 ; finish restoring sign
785
786 pcmpgtw mm5, [esi] ; overflow mask for the second group
787 paddusw mm2, mm5
788 psrlw mm2, 5
789 pxor mm2, mm3 ; start restoring sign
790 psubusw mm3, mm5
791 psubw mm2, mm3 ; finish restoring sign
792 movq mm1, [esi] ; re-zero mm1/mm3 for the next pass
793 movq mm3, [byte esi]
794 pxor mm6, mm0 ; mismatch control
795 movq [edx + 8*eax + 7*16 -2*8 ], mm0 ; data[i]
796 pxor mm6, mm2 ; mismatch control
797 movq [edx + 8*eax + 7*16 -2*8 +8], mm2 ; data[i+1]
798
799 jng .loop ; repeat while eax <= 0 (flags from the add above; only MMX instructions in between)
800 nop
801
802 ; mismatch control: fold the four words of mm6 into its low word by XOR
803
804 pshufw mm0,mm6,01010101b ; broadcast word 1
805 pshufw mm1,mm6,10101010b ; broadcast word 2
806 pshufw mm2,mm6,11111111b ; broadcast word 3
807 pxor mm6, mm0
808 pxor mm1, mm2
809 pxor mm6, mm1 ; low word = word0 ^ word1 ^ word2 ^ word3
810 movd eax, mm6
811 and eax,byte 1 ; keep the parity bit
812 xor eax,byte 1 ; eax = 1 when the parity bit is 0
813 mov esi,[esp] ; restore esi
814 add esp,byte 4
815 xor word [edx + 2*63], ax ; toggle bit 0 of data[63] when eax == 1
816
817 ret
818