Annotation of /trunk/xvidcore/src/quant/x86_asm/quantize4_mmx.asm

Revision 463 - (view) (download)

1 :	chl	463	;/*****************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * mmx optimized MPEG quantization/dequantization
5 :			; *
6 :			; * Copyright(C) 2002 Peter Ross <pross@xvid.org>
7 :			; * Copyright(C) 2002 Michael Militzer <michael@xvid.org>
8 :			; * Copyright(C) 2002 Pascal Massimino <skal@planet-d.net>
9 :			; *
10 :			; * This program is an implementation of a part of one or more MPEG-4
11 :			; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
12 :			; * to use this software module in hardware or software products are
13 :			; * advised that its use may infringe existing patents or copyrights, and
14 :			; * any such use would be at such party's own risk. The original
15 :			; * developer of this software module and his/her company, and subsequent
16 :			; * editors and their companies, will have no liability for use of this
17 :			; * software or modifications or derivatives thereof.
18 :			; *
19 :			; * This program is free software; you can redistribute it and/or modify
20 :			; * it under the terms of the GNU General Public License as published by
21 :			; * the Free Software Foundation; either version 2 of the License, or
22 :			; * (at your option) any later version.
23 :			; *
24 :			; * This program is distributed in the hope that it will be useful,
25 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
26 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 :			; * GNU General Public License for more details.
28 :			; *
29 :			; * You should have received a copy of the GNU General Public License
30 :			; * along with this program; if not, write to the Free Software
31 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32 :			; *
33 :			; *************************************************************************/
34 :	Isibaar	3
35 :			; data/text alignment in bytes; consumed by the `align ALIGN` directives below
36 :			%define ALIGN 8
37 :
			; SATURATE is defined here, but no conditional in this listing tests it.
38 :			%define SATURATE
39 :
			; assemble as 32-bit code; the constant tables that follow go into .data
40 :			bits 32
41 :
42 :			section .data
43 :
44 :			%macro cglobal 1
			; Export the symbol named by the macro argument.
			; With PREFIX defined, the exported name gets a leading underscore and
			; the bare name is redefined to the underscored form, so later uses of
			; the bare name in this file resolve to the exported symbol.
45 :			%ifdef PREFIX
46 :			global _%1
47 :			%define %1 _%1
48 :			%else
49 :			global %1
50 :			%endif
51 :			%endmacro
52 :
53 :	Isibaar	4	%macro cextern 1
			; Import the symbol named by the macro argument.
			; Mirrors cglobal: with PREFIX defined, the underscored name is declared
			; extern and the bare name is redefined to it.
54 :			%ifdef PREFIX
55 :			extern _%1
56 :			%define %1 _%1
57 :			%else
58 :			extern %1
59 :			%endif
60 :			%endmacro
61 :
62 :	Isibaar	3	mmx_one times 4 dw 1 ; four 16-bit ones; pmaddwd operand used by quant4_inter_mmx to sum word lanes
63 :
64 :			;===========================================================================
65 :			;
66 :			; divide by 2Q table
67 :			;
68 :			;===========================================================================
69 :
70 :			%macro MMX_DIV 1
			; Emits one 8-byte row: four copies of the 16-bit value
			; (1 << 17) / (2 * arg) + 1, i.e. a reciprocal of 2*arg scaled by 2^17.
			; For arg = 1 the value (65537) does not fit in 16 bits, and for arg = 2
			; (32769) it does not fit a signed word; the quant routines branch to
			; dedicated loops for quant 1 and 2 before loading a row from this table.
71 :			times 4 dw (1 << 17) / (%1 * 2) + 1
72 :			%endmacro
73 :
74 :			align ALIGN
75 :			mmx_div
			; 31 rows, one per argument 1..31; addressed as [mmx_div + quant*8 - 8]
76 :			MMX_DIV 1
77 :			MMX_DIV 2
78 :			MMX_DIV 3
79 :			MMX_DIV 4
80 :			MMX_DIV 5
81 :			MMX_DIV 6
82 :			MMX_DIV 7
83 :			MMX_DIV 8
84 :			MMX_DIV 9
85 :			MMX_DIV 10
86 :			MMX_DIV 11
87 :			MMX_DIV 12
88 :			MMX_DIV 13
89 :			MMX_DIV 14
90 :			MMX_DIV 15
91 :			MMX_DIV 16
92 :			MMX_DIV 17
93 :			MMX_DIV 18
94 :			MMX_DIV 19
95 :			MMX_DIV 20
96 :			MMX_DIV 21
97 :			MMX_DIV 22
98 :			MMX_DIV 23
99 :			MMX_DIV 24
100 :			MMX_DIV 25
101 :			MMX_DIV 26
102 :			MMX_DIV 27
103 :			MMX_DIV 28
104 :			MMX_DIV 29
105 :			MMX_DIV 30
106 :			MMX_DIV 31
107 :
108 :
109 :			;===========================================================================
110 :			;
111 :	Isibaar	4	; intra matrix
112 :	Isibaar	3	;
113 :			;===========================================================================
114 :
115 :	Isibaar	4	cextern intra_matrix
			; Both symbols are defined outside this file. The code below reads them
			; 8 bytes (four 16-bit words) at a time, indexed like the coefficient
			; block. intra_matrix_fix is used as a pmulhw multiplier in place of a
			; division by intra_matrix[i] -- presumably a fixed-point reciprocal
			; table; confirm against its definition.
116 :			cextern intra_matrix_fix
117 :	Isibaar	3
118 :			;===========================================================================
119 :			;
120 :	Isibaar	4	; inter matrix
121 :	Isibaar	3	;
122 :			;===========================================================================
123 :
124 :	Isibaar	4	cextern inter_matrix
			; Both symbols are defined outside this file and are read 8 bytes (four
			; 16-bit words) at a time. inter_matrix_fix is used as a pmulhw
			; multiplier in place of a division by inter_matrix[i] -- presumably a
			; fixed-point reciprocal table; confirm against its definition.
125 :			cextern inter_matrix_fix
126 :	Isibaar	3
127 :
128 :			%define VM18P 3
			; VM18P / VM18Q form the 3/4 factor applied by MMX_QUANTD:
			; quantd[q] = (VM18P*q + VM18Q/2) / VM18Q
129 :			%define VM18Q 4
130 :
131 :	Isibaar	4
132 :	Isibaar	3	;===========================================================================
133 :			;
134 :			; quantd table
135 :			;
136 :			;===========================================================================
137 :
138 :			%macro MMX_QUANTD 1
			; Emits one 8-byte row: four copies of the 16-bit value
			; (VM18P*arg + VM18Q/2) / VM18Q, using integer division.
139 :			times 4 dw ((VM18P*%1) + (VM18Q/2)) / VM18Q
140 :			%endmacro
141 :
142 :			quantd
			; 31 rows, one per argument 1..31; quant4_intra_mmx loads
			; [quantd + quant*8 - 8] and adds it to every scaled level before the
			; final division by 2*quant.
143 :			MMX_QUANTD 1
144 :			MMX_QUANTD 2
145 :			MMX_QUANTD 3
146 :			MMX_QUANTD 4
147 :			MMX_QUANTD 5
148 :			MMX_QUANTD 6
149 :			MMX_QUANTD 7
150 :			MMX_QUANTD 8
151 :			MMX_QUANTD 9
152 :			MMX_QUANTD 10
153 :			MMX_QUANTD 11
154 :			MMX_QUANTD 12
155 :			MMX_QUANTD 13
156 :			MMX_QUANTD 14
157 :			MMX_QUANTD 15
158 :			MMX_QUANTD 16
159 :			MMX_QUANTD 17
160 :			MMX_QUANTD 18
161 :			MMX_QUANTD 19
162 :			MMX_QUANTD 20
163 :			MMX_QUANTD 21
164 :			MMX_QUANTD 22
165 :			MMX_QUANTD 23
166 :			MMX_QUANTD 24
167 :			MMX_QUANTD 25
168 :			MMX_QUANTD 26
169 :			MMX_QUANTD 27
170 :			MMX_QUANTD 28
171 :			MMX_QUANTD 29
172 :			MMX_QUANTD 30
173 :			MMX_QUANTD 31
174 :
175 :
176 :			;===========================================================================
177 :			;
178 :			; multiply by quant table (each row is the quantizer value replicated in four words)
179 :			;
180 :			;===========================================================================
181 :
182 :			%macro MMX_MUL_QUANT 1
			; Emits one 8-byte row: four copies of the argument as 16-bit words.
183 :			times 4 dw %1
184 :			%endmacro
185 :
186 :			mmx_mul_quant
			; 31 rows, one per quantizer 1..31; the dequant routines load
			; [mmx_mul_quant + quant*8 - 8] and scale it themselves (<< 2 for
			; intra, << 1 for inter).
187 :			MMX_MUL_QUANT 1
188 :			MMX_MUL_QUANT 2
189 :			MMX_MUL_QUANT 3
190 :			MMX_MUL_QUANT 4
191 :			MMX_MUL_QUANT 5
192 :			MMX_MUL_QUANT 6
193 :			MMX_MUL_QUANT 7
194 :			MMX_MUL_QUANT 8
195 :			MMX_MUL_QUANT 9
196 :			MMX_MUL_QUANT 10
197 :			MMX_MUL_QUANT 11
198 :			MMX_MUL_QUANT 12
199 :			MMX_MUL_QUANT 13
200 :			MMX_MUL_QUANT 14
201 :			MMX_MUL_QUANT 15
202 :			MMX_MUL_QUANT 16
203 :			MMX_MUL_QUANT 17
204 :			MMX_MUL_QUANT 18
205 :			MMX_MUL_QUANT 19
206 :			MMX_MUL_QUANT 20
207 :			MMX_MUL_QUANT 21
208 :			MMX_MUL_QUANT 22
209 :			MMX_MUL_QUANT 23
210 :			MMX_MUL_QUANT 24
211 :			MMX_MUL_QUANT 25
212 :			MMX_MUL_QUANT 26
213 :			MMX_MUL_QUANT 27
214 :			MMX_MUL_QUANT 28
215 :			MMX_MUL_QUANT 29
216 :			MMX_MUL_QUANT 30
217 :			MMX_MUL_QUANT 31
218 :
219 :			;===========================================================================
220 :			;
221 :			; saturation limits
222 :			;
223 :			;===========================================================================
224 :
225 :			align 16
226 :
			; The two *_minus_* constants are both 30720 and drive the saturating
			; add/subtract pairs that clamp the DC value in dequant4_intra_mmx.
			; `zero` is the pcmpgtw operand used for overflow detection in both
			; dequant routines. mmx_2047 and mmx_minus_2048 are not referenced by
			; any code visible in this listing.
227 :	Isibaar	262	mmx_32767_minus_2047 times 4 dw (32767-2047)
228 :			mmx_32768_minus_2048 times 4 dw (32768-2048)
229 :			mmx_2047 times 4 dw 2047
230 :			mmx_minus_2048 times 4 dw (-2048)
231 :			zero times 4 dw 0
232 :
233 :	Isibaar	3	section .text
234 :
235 :			;===========================================================================
236 :			;
237 :			; void quant4_intra_mmx(int16_t * coeff,
238 :			; const int16_t const * data,
239 :			; const uint32_t quant,
240 :			; const uint32_t dcscalar);
241 :			;
			; Quantizes 64 coefficients from data[] into coeff[] using intra_matrix.
			; Arguments are read from the stack (offsets +4..+16 above the return
			; address; the code adds 12 for its three pushes).
			; Per coefficient, on the absolute value:
			;   t = ((|data[i]| << 4) + (intra_matrix[i] >> 1)), scaled by
			;       intra_matrix_fix[i] via pmulhw
			;   t = (t + quantd[quant-1]) / (2*quant)
			; and the original sign is restored afterwards. The division is a plain
			; shift for quant 1 and 2, and a multiply by the mmx_div row plus a total
			; shift of 17 otherwise. Finally coeff[0] is overwritten with
			; data[0] / dcscalar, biased by dcscalar/2 before the signed division.
			; Registers: edi = coeff, esi = data, ecx = quadword index (restored on
			; exit together with esi/edi); eax, edx, mm0-mm5 and mm7 are overwritten.
			; No emms is executed here -- presumably left to the caller; confirm.
242 :			;===========================================================================
243 :
244 :			align ALIGN
245 :			cglobal quant4_intra_mmx
246 :			quant4_intra_mmx
247 :
248 :			push ecx
249 :			push esi
250 :			push edi
251 :
252 :			mov edi, [esp + 12 + 4] ; coeff
253 :			mov esi, [esp + 12 + 8] ; data
254 :			mov eax, [esp + 12 + 12] ; quant
255 :
256 :			movq mm5, [quantd + eax * 8 - 8] ; quantd -> mm5
257 :
258 :			xor ecx, ecx ; ecx = quadword index, 0..15
259 :			cmp al, 1 ; quant == 1: divide by 2 with a shift
260 :			jz near .q1loop
261 :
262 :			cmp al, 2 ; quant == 2: divide by 4 with a shift
263 :			jz near .q2loop
264 :
265 :			movq mm7, [mmx_div + eax * 8 - 8] ; multipliers[quant] -> mm7
266 :
267 :			align ALIGN
268 :			.loop
269 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st]
270 :			movq mm3, [esi + 8*ecx + 8] ; mm3 = [2nd]
271 :
272 :			pxor mm1, mm1 ; mm1 = 0
273 :			pxor mm4, mm4
274 :
275 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
276 :			pcmpgtw mm4, mm3
277 :
278 :			pxor mm0, mm1 ; mm0 = |mm0|
279 :			pxor mm3, mm4 ;
280 :			psubw mm0, mm1 ; displace
281 :			psubw mm3, mm4 ;
282 :
283 :			psllw mm0, 4 ; level << 4
284 :			psllw mm3, 4 ;
285 :
286 :	Isibaar	4	movq mm2, [intra_matrix + 8*ecx]
287 :	Isibaar	3	psrlw mm2, 1 ; intra_matrix[i]>>1
288 :			paddw mm0, mm2
289 :
290 :	Isibaar	4	movq mm2, [intra_matrix_fix + ecx*8]
291 :	Isibaar	3	pmulhw mm0, mm2 ; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]
292 :
293 :	Isibaar	4	movq mm2, [intra_matrix + 8*ecx + 8]
294 :	Isibaar	3	psrlw mm2, 1
295 :			paddw mm3, mm2
296 :
297 :	Isibaar	4	movq mm2, [intra_matrix_fix + ecx*8 + 8]
298 :	Isibaar	3	pmulhw mm3, mm2
299 :
300 :			paddw mm0, mm5 ; + quantd
301 :			paddw mm3, mm5
302 :
303 :			pmulhw mm0, mm7 ; mm0 = (mm0 / 2Q) >> 16
304 :			pmulhw mm3, mm7 ;
305 :			psrlw mm0, 1 ; additional shift by 1 => 16 + 1 = 17
306 :			psrlw mm3, 1
307 :
308 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
309 :			pxor mm3, mm4 ;
310 :			psubw mm0, mm1 ; undisplace
311 :			psubw mm3, mm4 ;
312 :
313 :			movq [edi + 8*ecx], mm0
314 :			movq [edi + 8*ecx + 8], mm3
315 :
316 :			add ecx,2 ; two quadwords (8 coefficients) per iteration
317 :			cmp ecx,16 ; 16 quadwords = 64 coefficients
318 :			jnz near .loop
319 :
320 :			.done
321 :			; calculate coeff[0] = data[0] / (int32_t)dcscalar, biased by dcscalar/2
322 :
323 :			mov ecx, [esp + 12 + 16] ; dcscalar
324 :			mov edx, ecx
325 :			movsx eax, word [esi] ; data[0]
326 :			shr edx, 1 ; edx = dcscalar /2
327 :			cmp eax, 0
328 :			jg .gtzero
329 :
330 :			sub eax, edx ; data[0] <= 0: bias toward negative
331 :			jmp short .mul
332 :			.gtzero
333 :			add eax, edx ; data[0] > 0: bias toward positive
334 :			.mul
335 :			cdq ; expand eax -> edx:eax
336 :			idiv ecx ; eax = edx:eax / dcscalar
337 :
338 :			mov [edi], ax ; coeff[0] = ax
339 :
340 :			pop edi
341 :			pop esi
342 :			pop ecx
343 :
344 :			ret
345 :
346 :			align ALIGN
347 :			.q1loop
348 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st]
349 :			movq mm3, [esi + 8*ecx + 8] ;
350 :
351 :			pxor mm1, mm1 ; mm1 = 0
352 :			pxor mm4, mm4 ;
353 :
354 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
355 :			pcmpgtw mm4, mm3 ;
356 :
357 :			pxor mm0, mm1 ; mm0 = |mm0|
358 :			pxor mm3, mm4 ;
359 :			psubw mm0, mm1 ; displace
360 :			psubw mm3, mm4 ;
361 :
362 :			psllw mm0, 4
363 :			psllw mm3, 4
364 :
365 :	Isibaar	4	movq mm2, [intra_matrix + 8*ecx]
366 :	Isibaar	3	psrlw mm2, 1
367 :			paddw mm0, mm2
368 :
369 :	Isibaar	4	movq mm2, [intra_matrix_fix + ecx*8]
370 :	Isibaar	3	pmulhw mm0, mm2 ; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]
371 :
372 :	Isibaar	4	movq mm2, [intra_matrix + 8*ecx + 8]
373 :	Isibaar	3	psrlw mm2, 1
374 :			paddw mm3, mm2
375 :
376 :	Isibaar	4	movq mm2, [intra_matrix_fix + ecx*8 + 8]
377 :	Isibaar	3	pmulhw mm3, mm2
378 :
379 :			paddw mm0, mm5
380 :			paddw mm3, mm5
381 :
382 :			psrlw mm0, 1 ; mm0 >>= 1 (/2)
383 :			psrlw mm3, 1 ;
384 :
385 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
386 :			pxor mm3, mm4 ;
387 :			psubw mm0, mm1 ; undisplace
388 :			psubw mm3, mm4 ;
389 :
390 :			movq [edi + 8*ecx], mm0
391 :			movq [edi + 8*ecx + 8], mm3
392 :
393 :			add ecx,2
394 :			cmp ecx,16
395 :			jnz near .q1loop
396 :			jmp near .done
397 :
398 :
399 :			align ALIGN
400 :			.q2loop
401 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st]
402 :			movq mm3, [esi + 8*ecx + 8] ;
403 :
404 :			pxor mm1, mm1 ; mm1 = 0
405 :			pxor mm4, mm4 ;
406 :
407 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
408 :			pcmpgtw mm4, mm3 ;
409 :
410 :			pxor mm0, mm1 ; mm0 = |mm0|
411 :			pxor mm3, mm4 ;
412 :			psubw mm0, mm1 ; displace
413 :			psubw mm3, mm4 ;
414 :
415 :			psllw mm0, 4
416 :			psllw mm3, 4
417 :
418 :	Isibaar	4	movq mm2, [intra_matrix + 8*ecx]
419 :	Isibaar	3	psrlw mm2, 1
420 :			paddw mm0, mm2
421 :
422 :	Isibaar	4	movq mm2, [intra_matrix_fix + ecx*8]
423 :	Isibaar	3	pmulhw mm0, mm2 ; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]
424 :
425 :	Isibaar	4	movq mm2, [intra_matrix + 8*ecx + 8]
426 :	Isibaar	3	psrlw mm2, 1
427 :			paddw mm3, mm2
428 :
429 :	Isibaar	4	movq mm2, [intra_matrix_fix + ecx*8 + 8]
430 :	Isibaar	3	pmulhw mm3, mm2
431 :
432 :			paddw mm0, mm5
433 :			paddw mm3, mm5
434 :
435 :			psrlw mm0, 2 ; mm0 >>= 2 (/4)
436 :			psrlw mm3, 2 ;
437 :
438 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
439 :			pxor mm3, mm4 ;
440 :			psubw mm0, mm1 ; undisplace
441 :			psubw mm3, mm4 ;
442 :
443 :			movq [edi + 8*ecx], mm0
444 :			movq [edi + 8*ecx + 8], mm3
445 :
446 :			add ecx,2
447 :			cmp ecx,16
448 :			jnz near .q2loop
449 :			jmp near .done
450 :
451 :
452 :			;===========================================================================
453 :			;
454 :			; uint32_t quant4_inter_mmx(int16_t * coeff,
455 :			; const int16_t const * data,
456 :			; const uint32_t quant);
457 :			;
			; Quantizes 64 coefficients from data[] into coeff[] using inter_matrix
			; and returns in eax the sum of the absolute quantized values.
			; Per coefficient, on the absolute value:
			;   t = ((|data[i]| << 4) + (inter_matrix[i] >> 1)), scaled by
			;       inter_matrix_fix[i] via pmulhw
			;   t = t / (2*quant)
			; and the original sign is restored afterwards. The division is a plain
			; shift for quant 1 and 2, and a multiply by the mmx_div row plus a total
			; shift of 17 otherwise. The running sum is kept in four 16-bit lanes of
			; mm5 (paddw) and folded to a single value at .done.
			; Registers: edi = coeff, esi = data, ecx = quadword index (restored on
			; exit together with esi/edi); eax, mm0-mm5 and mm7 are overwritten.
			; No emms is executed here -- presumably left to the caller; confirm.
458 :			;===========================================================================
459 :
460 :			align ALIGN
461 :			cglobal quant4_inter_mmx
462 :			quant4_inter_mmx
463 :
464 :			push ecx
465 :			push esi
466 :			push edi
467 :
468 :			mov edi, [esp + 12 + 4] ; coeff
469 :			mov esi, [esp + 12 + 8] ; data
470 :			mov eax, [esp + 12 + 12] ; quant
471 :
472 :			xor ecx, ecx ; ecx = quadword index, 0..15
473 :
474 :			pxor mm5, mm5 ; sum
475 :
476 :			cmp al, 1 ; quant == 1: divide by 2 with a shift
477 :			jz near .q1loop
478 :
479 :			cmp al, 2 ; quant == 2: divide by 4 with a shift
480 :			jz near .q2loop
481 :
482 :			movq mm7, [mmx_div + eax * 8 - 8] ; divider
483 :
484 :			align ALIGN
485 :			.loop
486 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st]
487 :			movq mm3, [esi + 8*ecx + 8] ;
488 :			pxor mm1, mm1 ; mm1 = 0
489 :			pxor mm4, mm4 ;
490 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
491 :			pcmpgtw mm4, mm3 ;
492 :			pxor mm0, mm1 ; mm0 = |mm0|
493 :			pxor mm3, mm4 ;
494 :			psubw mm0, mm1 ; displace
495 :			psubw mm3, mm4 ;
496 :
497 :			psllw mm0, 4
498 :			psllw mm3, 4
499 :
500 :	Isibaar	4	movq mm2, [inter_matrix + 8*ecx]
501 :	Isibaar	3	psrlw mm2, 1
502 :			paddw mm0, mm2
503 :
504 :	Isibaar	4	movq mm2, [inter_matrix_fix + ecx*8]
505 :	Isibaar	3	pmulhw mm0, mm2 ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]
506 :
507 :	Isibaar	4	movq mm2, [inter_matrix + 8*ecx + 8]
508 :	Isibaar	3	psrlw mm2, 1
509 :			paddw mm3, mm2
510 :
511 :	Isibaar	4	movq mm2, [inter_matrix_fix + ecx*8 + 8]
512 :	Isibaar	3	pmulhw mm3, mm2
513 :
514 :			pmulhw mm0, mm7 ; mm0 = (mm0 / 2Q) >> 16
515 :			pmulhw mm3, mm7 ;
516 :			psrlw mm0, 1 ; additional shift by 1 => 16 + 1 = 17
517 :			psrlw mm3, 1
518 :
519 :			paddw mm5, mm0 ; sum += mm0
520 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
521 :			paddw mm5, mm3 ;
522 :			pxor mm3, mm4 ;
523 :			psubw mm0, mm1 ; undisplace
524 :			psubw mm3, mm4
525 :			movq [edi + 8*ecx], mm0
526 :			movq [edi + 8*ecx + 8], mm3
527 :
528 :			add ecx, 2 ; two quadwords (8 coefficients) per iteration
529 :			cmp ecx, 16 ; 16 quadwords = 64 coefficients
530 :			jnz near .loop
531 :
532 :			.done
533 :			pmaddwd mm5, [mmx_one] ; pairwise add the four word sums into two dwords
534 :			movq mm0, mm5
535 :			psrlq mm5, 32
536 :			paddd mm0, mm5 ; low dword = total of all four lanes
537 :			movd eax, mm0 ; return sum
538 :
539 :			pop edi
540 :			pop esi
541 :			pop ecx
542 :
543 :			ret
544 :
545 :			align ALIGN
546 :			.q1loop
547 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st]
548 :			movq mm3, [esi + 8*ecx+ 8]
549 :			;
550 :			pxor mm1, mm1 ; mm1 = 0
551 :			pxor mm4, mm4 ;
552 :
553 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
554 :			pcmpgtw mm4, mm3 ;
555 :
556 :			pxor mm0, mm1 ; mm0 = |mm0|
557 :			pxor mm3, mm4 ;
558 :			psubw mm0, mm1 ; displace
559 :			psubw mm3, mm4 ;
560 :
561 :			psllw mm0, 4
562 :			psllw mm3, 4
563 :
564 :	Isibaar	4	movq mm2, [inter_matrix + 8*ecx]
565 :	Isibaar	3	psrlw mm2, 1
566 :			paddw mm0, mm2
567 :
568 :	Isibaar	4	movq mm2, [inter_matrix_fix + ecx*8]
569 :	Isibaar	3	pmulhw mm0, mm2 ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]
570 :
571 :	Isibaar	4	movq mm2, [inter_matrix + 8*ecx + 8]
572 :	Isibaar	3	psrlw mm2, 1
573 :			paddw mm3, mm2
574 :
575 :	Isibaar	4	movq mm2, [inter_matrix_fix + ecx*8 + 8]
576 :	Isibaar	3	pmulhw mm3, mm2
577 :
578 :			psrlw mm0, 1 ; mm0 >>= 1 (/2)
579 :			psrlw mm3, 1 ;
580 :
581 :			paddw mm5, mm0 ; sum += mm0
582 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
583 :			paddw mm5, mm3 ;
584 :			pxor mm3, mm4 ;
585 :			psubw mm0, mm1 ; undisplace
586 :			psubw mm3, mm4
587 :
588 :			movq [edi + 8*ecx], mm0
589 :			movq [edi + 8*ecx + 8], mm3
590 :
591 :			add ecx,2
592 :			cmp ecx,16
593 :			jnz near .q1loop
594 :
595 :			jmp .done
596 :
597 :
598 :			align ALIGN
599 :			.q2loop
600 :			movq mm0, [esi + 8*ecx] ; mm0 = [1st]
601 :			movq mm3, [esi + 8*ecx+ 8]
602 :			;
603 :			pxor mm1, mm1 ; mm1 = 0
604 :			pxor mm4, mm4 ;
605 :
606 :			pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
607 :			pcmpgtw mm4, mm3 ;
608 :
609 :			pxor mm0, mm1 ; mm0 = |mm0|
610 :			pxor mm3, mm4 ;
611 :			psubw mm0, mm1 ; displace
612 :			psubw mm3, mm4 ;
613 :
614 :			psllw mm0, 4
615 :			psllw mm3, 4
616 :
617 :	Isibaar	4	movq mm2, [inter_matrix + 8*ecx]
618 :	Isibaar	3	psrlw mm2, 1
619 :			paddw mm0, mm2
620 :
621 :	Isibaar	4	movq mm2, [inter_matrix_fix + ecx*8]
622 :	Isibaar	3	pmulhw mm0, mm2 ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]
623 :
624 :	Isibaar	4	movq mm2, [inter_matrix + 8*ecx + 8]
625 :	Isibaar	3	psrlw mm2, 1
626 :			paddw mm3, mm2
627 :
628 :	Isibaar	4	movq mm2, [inter_matrix_fix + ecx*8 + 8]
629 :	Isibaar	3	pmulhw mm3, mm2
630 :
631 :			psrlw mm0, 2 ; mm0 >>= 2 (/4)
632 :			psrlw mm3, 2 ;
633 :
634 :			paddw mm5, mm0 ; sum += mm0
635 :			pxor mm0, mm1 ; mm0 *= sign(mm0)
636 :			paddw mm5, mm3 ;
637 :			pxor mm3, mm4 ;
638 :			psubw mm0, mm1 ; undisplace
639 :			psubw mm3, mm4
640 :
641 :			movq [edi + 8*ecx], mm0
642 :			movq [edi + 8*ecx + 8], mm3
643 :
644 :			add ecx,2
645 :			cmp ecx,16
646 :			jnz near .q2loop
647 :
648 :			jmp .done
649 :
650 :
651 :			;===========================================================================
652 :			;
653 :			; void dequant4_intra_mmx(int16_t *data,
654 :			; const int16_t const *coeff,
655 :			; const uint32_t quant,
656 :			; const uint32_t dcscalar);
657 :			;
658 :			;===========================================================================
659 :
660 :	Isibaar	262	; Note: in order to saturate 'easily', we pre-multiply the quantizer
661 :			; by 4 (the `psllw mm7, 2` below). Then, the high-word of
			; (coeff[]*matrix[i]*quant) is used to
662 :			; build a saturating mask. It is non-zero only when an overflow occurred.
663 :			; We thus avoid packing/unpacking toward double-word.
664 :			; Moreover, we perform the mult (matrix[i]*quant) first, instead of, e.g.,
665 :			; (coeff[i]*matrix[i]). This is less prone to overflow if coeff[] are not
666 :			; checked. Input ranges are: coeff in [-127,127], inter_matrix in [1..255],
667 :			; and quant in [1..31].
668 :			;
669 :			; The original loop is:
670 :			;
671 :			%if 0
672 :			movq mm0, [ecx+8*eax + 8*16] ; mm0 = coeff[i]
673 :			pxor mm1, mm1
674 :			pcmpgtw mm1, mm0
675 :			pxor mm0, mm1 ; change sign if negative
676 :			psubw mm0, mm1 ; -> mm0 = abs(coeff[i]), mm1 = sign of coeff[i]
677 :
678 :			movq mm2, mm7 ; mm2 = quant
679 :			pmullw mm2, [intra_matrix + 8*eax + 8*16 ] ; matrix[i]*quant.
680 :
681 :			movq mm6, mm2
682 :			pmulhw mm2, mm0 ; high of coeff*(matrix*quant) (should be 0 if no overflow)
683 :			pmullw mm0, mm6 ; low of coeff*(matrix*quant)
684 :
685 :			pxor mm5, mm5
686 :			pcmpgtw mm2, mm5 ; overflow?
687 :			psrlw mm2, 5 ; =0 if no clamp, 2047 otherwise
688 :			psrlw mm0, 5
689 :			paddw mm0, mm1 ; start restoring sign
690 :			por mm0, mm2 ; saturate to 2047 if needed
691 :			pxor mm0, mm1 ; finish negating back
692 :
693 :			movq [edx + 8*eax + 8*16], mm0 ; data[i]
694 :			add eax, 1
695 :			%endif
696 :
697 :			;********************************************************************
698 :
699 :	Isibaar	3	align 16
			; void dequant4_intra_mmx(int16_t *data, const int16_t *coeff,
			;                         uint32_t quant, uint32_t dcscalar)
			; Arguments are read from the stack at [esp+4..16]; no registers are
			; pushed. For each of the 64 coefficients (8 per iteration):
			;   data[i] = sign(c) * min((|c| * matrix[i] * quant * 4) >> 5, 2047)
			; where a non-zero high word of the 16x16 product marks an overflow; a
			; saturated negative result becomes -2048.
			; data[0] is then overwritten with coeff[0] * dcscalar clamped to
			; [-2048, 2047].
			; eax counts quadwords from -16 up to 0 so the loop ends on the zero
			; flag; every address adds 8*16 to compensate.
			; Overwrites eax, ecx, edx and mm0-mm7; no emms is executed here.
700 :			cglobal dequant4_intra_mmx
701 :	Isibaar	262	dequant4_intra_mmx:
702 :	Isibaar	3
703 :	Isibaar	262	mov edx, [esp+4] ; data
704 :			mov ecx, [esp+8] ; coeff
705 :			mov eax, [esp+12] ; quant
706 :	Isibaar	3
707 :	Isibaar	262	movq mm7, [mmx_mul_quant + eax*8 - 8]
708 :			mov eax, -16 ; to keep aligned, we regularly process coeff[0]
709 :			psllw mm7, 2 ; << 2. See comment.
710 :			pxor mm6, mm6 ; this is a NOP
711 :	Isibaar	3
712 :	Isibaar	262	align 16
713 :	Isibaar	3	.loop
714 :	Isibaar	262	movq mm0, [ecx+8*eax + 8*16] ; mm0 = c = coeff[i]
715 :			movq mm3, [ecx+8*eax + 8*16 +8]; mm3 = c' = coeff[i+1]
716 :			pxor mm1, mm1
717 :			pxor mm4, mm4
718 :			pcmpgtw mm1, mm0 ; mm1 = sgn(c)
719 :			movq mm2, mm7 ; mm2 = quant
720 :
721 :			pcmpgtw mm4, mm3 ; mm4 = sgn(c')
722 :			pmullw mm2, [intra_matrix + 8*eax + 8*16 ] ; matrix[i]*quant
723 :	Isibaar	3
724 :	Isibaar	262	pxor mm0, mm1 ; negate if negative
725 :			pxor mm3, mm4 ; negate if negative
726 :
727 :			psubw mm0, mm1
728 :			psubw mm3, mm4
729 :
730 :			; we're short on register, here. Poor pairing...
731 :	Isibaar	3
732 :	Isibaar	262	movq mm5, mm2
733 :			pmullw mm2, mm0 ; low of coeff*(matrix*quant)
734 :	Isibaar	3
735 :	Isibaar	262	pmulhw mm0, mm5 ; high of coeff*(matrix*quant)
736 :			movq mm5, mm7 ; mm5 = quant
737 :	Isibaar	3
738 :	Isibaar	262	pmullw mm5, [intra_matrix + 8*eax + 8*16 +8] ; matrix[i+1]*quant
739 :	Isibaar	3
740 :	Isibaar	262	movq mm6, mm5
741 :			add eax,2 ; z-flag will be tested later (MMX ops leave EFLAGS alone)
742 :	Isibaar	3
743 :	Isibaar	262	pmullw mm6, mm3 ; low of coeff*(matrix*quant)
744 :			pmulhw mm3, mm5 ; high of coeff*(matrix*quant)
745 :	Isibaar	3
746 :	Isibaar	262	pcmpgtw mm0, [zero] ; mm0 = 0xFFFF where the high word is > 0 (overflow)
747 :			paddusw mm2, mm0 ; overflowed lanes saturate to 0xFFFF
748 :			psrlw mm2, 5 ; 0xFFFF >> 5 = 2047 for saturated lanes
749 :	Isibaar	3
750 :	Isibaar	262	pcmpgtw mm3, [zero]
751 :			paddusw mm6, mm3
752 :			psrlw mm6, 5
753 :	Isibaar	3
754 :	Isibaar	262	pxor mm2, mm1 ; start negating back
755 :			pxor mm6, mm4 ; start negating back
756 :	Isibaar	3
757 :	Isibaar	262	psubusw mm1, mm0 ; drop the +1 correction on saturated lanes (-> -2048)
758 :			psubusw mm4, mm3
759 :	Isibaar	3
760 :	Isibaar	262	psubw mm2, mm1 ; finish negating back
761 :			psubw mm6, mm4 ; finish negating back
762 :	Isibaar	3
763 :	Isibaar	262	movq [edx + 8*eax + 8*16 -2*8 ], mm2 ; data[i]
764 :			movq [edx + 8*eax + 8*16 -2*8 +8], mm6 ; data[i+1]
765 :	Isibaar	3
766 :	Isibaar	268	jnz near .loop
767 :	Isibaar	3
768 :	Isibaar	262	; deal with DC
769 :	Isibaar	3
770 :	Isibaar	262	movd mm0, [ecx]
771 :			pmullw mm0, [esp+16] ; dcscalar
772 :			movq mm2, [mmx_32767_minus_2047]
773 :			paddsw mm0, mm2 ; saturating add/sub pair clamps the top at 2047
774 :			psubsw mm0, mm2
775 :			movq mm2, [mmx_32768_minus_2048]
776 :			psubsw mm0, mm2 ; saturating sub/add pair clamps the bottom at -2048
777 :			paddsw mm0, mm2
778 :			movd eax, mm0
779 :			mov [edx], ax ; only the low word (data[0]) is stored
780 :	Isibaar	3
781 :	Isibaar	262	ret
782 :
783 :	Isibaar	3	;===========================================================================
784 :			;
785 :			; void dequant4_inter_mmx(int16_t * data,
786 :			; const int16_t * const coeff,
787 :			; const uint32_t quant);
788 :			;
789 :			;===========================================================================
790 :
791 :	Isibaar	262	; Note: We use (2*c + sgn(c) - sgn(-c)) as multiplier
792 :			; so we handle the 3 cases: c<0, c==0, and c>0 in one shot.
793 :			; sgn(x) is the result of 'pcmpgtw 0,x': 0 if x>=0, -1 if x<0.
794 :			; It's mixed with the extraction of the absolute value.
795 :
796 :	Isibaar	3	align 16
			; void dequant4_inter_mmx(int16_t *data, const int16_t *coeff,
			;                         uint32_t quant)
			; Arguments are read from the stack at [esp+4..12]; no registers are
			; pushed. For each of the 64 coefficients (8 per iteration) the
			; multiplier m = 2*|c| + 1 (0 when c == 0) is formed, then
			;   data[i] = sign(c) * min((m * matrix[i] * quant * 2) >> 5, 2047)
			; where a non-zero high word of the 16x16 product marks an overflow; a
			; saturated negative result becomes -2048.
			; All outputs are XORed into mm6; if bit 0 of the folded result is 0,
			; bit 0 of data[63] is toggled (mismatch control).
			; eax counts quadwords from -16 up to 0 and is advanced early in the
			; loop, so later addresses subtract 2*8 and the final jnz relies on the
			; flags of that add.
			; Overwrites eax, ecx, edx and mm0-mm7; no emms is executed here.
797 :			cglobal dequant4_inter_mmx
798 :	Isibaar	262	dequant4_inter_mmx:
799 :	Isibaar	3
800 :	Isibaar	262	mov edx, [esp+ 4] ; data
801 :			mov ecx, [esp+ 8] ; coeff
802 :			mov eax, [esp+12] ; quant
803 :			movq mm7, [mmx_mul_quant + eax*8 - 8]
804 :			mov eax, -16
805 :			paddw mm7, mm7 ; << 1
806 :			pxor mm6, mm6 ; mismatch sum
807 :	Isibaar	3
808 :	Isibaar	262	align 16
809 :	Isibaar	3	.loop
810 :	Isibaar	262	movq mm0, [ecx+8*eax + 8*16 ] ; mm0 = coeff[i]
811 :			movq mm2, [ecx+8*eax + 8*16 +8] ; mm2 = coeff[i+1]
812 :			add eax,2 ; z-flag is tested by the jnz at the loop end
813 :	Isibaar	3
814 :	Isibaar	262	pxor mm1, mm1
815 :			pxor mm3, mm3
816 :			pcmpgtw mm1, mm0 ; mm1 = sgn(c) (preserved)
817 :			pcmpgtw mm3, mm2 ; mm3 = sgn(c') (preserved)
818 :			paddsw mm0, mm1 ; c += sgn(c)
819 :			paddsw mm2, mm3 ; c' += sgn(c')
820 :			paddw mm0, mm0 ; c *= 2
821 :			paddw mm2, mm2 ; c'*= 2
822 :	Isibaar	3
823 :	Isibaar	262	pxor mm4, mm4
824 :			pxor mm5, mm5
825 :			psubw mm4, mm0 ; -c
826 :			psubw mm5, mm2 ; -c'
827 :			psraw mm4, 16 ; mm4 = sgn(-c)
828 :			psraw mm5, 16 ; mm5 = sgn(-c')
829 :			psubsw mm0, mm4 ; c -= sgn(-c)
830 :			psubsw mm2, mm5 ; c' -= sgn(-c')
831 :			pxor mm0, mm1 ; finish changing sign if needed
832 :			pxor mm2, mm3 ; finish changing sign if needed
833 :	Isibaar	3
834 :	Isibaar	262	; we're short on register, here. Poor pairing...
835 :	Isibaar	3
836 :	Isibaar	262	movq mm4, mm7 ; (matrix*quant)
837 :			pmullw mm4, [inter_matrix + 8*eax + 8*16 -2*8]
838 :			movq mm5, mm4
839 :			pmulhw mm5, mm0 ; high of c*(matrix*quant)
840 :			pmullw mm0, mm4 ; low of c*(matrix*quant)
841 :	Isibaar	3
842 :	Isibaar	262	movq mm4, mm7 ; (matrix*quant)
843 :			pmullw mm4, [inter_matrix + 8*eax + 8*16 -2*8 + 8]
844 :	Isibaar	3
845 :	Isibaar	262	pcmpgtw mm5, [zero] ; mm5 = 0xFFFF where the high word is > 0 (overflow)
846 :			paddusw mm0, mm5 ; overflowed lanes saturate to 0xFFFF
847 :			psrlw mm0, 5 ; 0xFFFF >> 5 = 2047 for saturated lanes
848 :			pxor mm0, mm1 ; start restoring sign
849 :			psubusw mm1, mm5 ; drop the +1 correction on saturated lanes (-> -2048)
850 :	Isibaar	3
851 :	Isibaar	262	movq mm5, mm4
852 :			pmulhw mm5, mm2 ; high of c*(matrix*quant)
853 :			pmullw mm2, mm4 ; low of c*(matrix*quant)
854 :			psubw mm0, mm1 ; finish restoring sign
855 :	Isibaar	3
856 :	Isibaar	262	pcmpgtw mm5, [zero]
857 :			paddusw mm2, mm5
858 :			psrlw mm2, 5
859 :			pxor mm2, mm3 ; start restoring sign
860 :			psubusw mm3, mm5
861 :			psubw mm2, mm3 ; finish restoring sign
862 :	Isibaar	3
863 :	Isibaar	262	pxor mm6, mm0 ; mismatch control
864 :			movq [edx + 8*eax + 8*16 -2*8 ], mm0 ; data[i]
865 :			pxor mm6, mm2 ; mismatch control
866 :			movq [edx + 8*eax + 8*16 -2*8 +8], mm2 ; data[i+1]
867 :	Isibaar	3
868 :	Isibaar	268	jnz near .loop
869 :	Isibaar	3
870 :	Isibaar	262	; mismatch control: fold the four word lanes of mm6 into bit 0
871 :	Isibaar	3
872 :	Isibaar	262	movq mm0, mm6
873 :			psrlq mm0, 48
874 :			movq mm1, mm6
875 :			movq mm2, mm6
876 :			psrlq mm1, 32
877 :			pxor mm6, mm0
878 :			psrlq mm2, 16
879 :			pxor mm6, mm1
880 :			pxor mm6, mm2
881 :			movd eax, mm6
882 :			and eax, 1 ; bit 0 = XOR of the low bits of all 64 outputs
883 :			xor eax, 1 ; eax = 1 when that bit is 0
884 :			xor word [edx + 2*63], ax ; toggle bit 0 of data[63] in that case
885 :	Isibaar	3
886 :	Isibaar	262	ret
887 :	Isibaar	3

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4