Annotation of /branches/dev-api-4/xvidcore/src/quant/x86_asm/quantize4_mmx.asm

Revision 268 - (view) (download)
Original Path: trunk/xvidcore/src/quant/x86_asm/quantize4_mmx.asm

1 :	Isibaar	3	;/******************************************************************************
2 :			; * *
3 :			; * This file is part of XviD, a free MPEG-4 video encoder/decoder *
4 :			; * *
5 :			; * XviD is an implementation of a part of one or more MPEG-4 Video tools *
6 :			; * as specified in ISO/IEC 14496-2 standard. Those intending to use this *
7 :			; * software module in hardware or software products are advised that its *
8 :			; * use may infringe existing patents or copyrights, and any such use *
9 :			; * would be at such party's own risk. The original developer of this *
10 :			; * software module and his/her company, and subsequent editors and their *
11 :			; * companies, will have no liability for use of this software or *
12 :			; * modifications or derivatives thereof. *
13 :			; * *
14 :			; * XviD is free software; you can redistribute it and/or modify it *
15 :			; * under the terms of the GNU General Public License as published by *
16 :			; * the Free Software Foundation; either version 2 of the License, or *
17 :			; * (at your option) any later version. *
18 :			; * *
19 :			; * XviD is distributed in the hope that it will be useful, but *
20 :			; * WITHOUT ANY WARRANTY; without even the implied warranty of *
21 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
22 :			; * GNU General Public License for more details. *
23 :			; * *
24 :			; * You should have received a copy of the GNU General Public License *
25 :			; * along with this program; if not, write to the Free Software *
26 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
27 :			; * *
28 :			; ******************************************************************************/
29 :			;
30 :			;/******************************************************************************
31 :			; * *
32 :			; * quantize4.asm, MMX optimized MPEG quantization/dequantization *
33 :			; * *
34 :			; * Copyright (C) 2002 - Peter Ross <pross@cs.rmit.edu.au> *
35 :			; * Copyright (C) 2002 - Michael Militzer <isibaar@xvid.org> *
36 :			; * *
37 :			; * For more information visit the XviD homepage: http://www.xvid.org *
38 :			; * *
39 :			; ******************************************************************************/
40 :			;
41 :			;/******************************************************************************
42 :			; * *
43 :			; * Revision history: *
44 :			; * *
45 :	Isibaar	262	; * 14.06.2002 mmx dequant4_* funcs revamped -Skal- *
46 :	Isibaar	3	; * 22.01.2002 initial version *
47 :			; * *
48 :			; ******************************************************************************/
49 :
; ---------------------------------------------------------------------------
; Assembler configuration
; ---------------------------------------------------------------------------

bits 32                                 ; assemble as 32-bit x86 code

%define SATURATE                        ; defined here; no %ifdef in the visible code tests it
%define ALIGN 8                         ; boundary used by the `align ALIGN` directives below

section .data
58 :
; cglobal <symbol>
;
; Export <symbol> from this object file.  When PREFIX is defined the exported
; name gets a leading underscore, and <symbol> is redefined so that every
; later use of the plain name in this file resolves to the underscored one.
%macro cglobal 1
    %ifndef PREFIX
        global %1
    %else
        global _%1
        %define %1 _%1
    %endif
%endmacro
67 :
; cextern <symbol>
;
; Import <symbol> from another object file.  When PREFIX is defined the
; imported name gets a leading underscore, and <symbol> is redefined so that
; every later use of the plain name in this file resolves to the underscored
; one.
%macro cextern 1
    %ifndef PREFIX
        extern %1
    %else
        extern _%1
        %define %1 _%1
    %endif
%endmacro
76 :
; Four words of 1.  quant4_inter_mmx feeds this to pmaddwd so that the four
; 16-bit partial sums are added pairwise into two 32-bit sums.
mmx_one:
    times 4 dw 1
78 :
;===========================================================================
;
; divide by 2Q table
;
; Entry q (q = 1..31) is four copies of the word (1 << 17) / (2*q) + 1.
; The quantisers fetch it with [mmx_div + quant*8 - 8], multiply with pmulhw
; and shift right once more, i.e. an overall shift of 17.
; The quantisers in this file branch to dedicated loops for quant == 1 and
; quant == 2, so they never read the first two entries.
;
;===========================================================================

%macro MMX_DIV 1
    times 4 dw (1 << 17) / (%1 * 2) + 1
%endmacro

align ALIGN
mmx_div:
%assign MMX_DIV_Q 1
%rep 31
    MMX_DIV MMX_DIV_Q                   ; row for quantiser MMX_DIV_Q
    %assign MMX_DIV_Q MMX_DIV_Q + 1
%endrep
%undef MMX_DIV_Q
122 :
123 :
;===========================================================================
;
; intra / inter quantisation matrices
;
; All four tables are defined outside this file.  The routines below index
; each of them as 16 quadwords (64 words).
;
;===========================================================================

; intra matrix
cextern intra_matrix
cextern intra_matrix_fix

; inter matrix
cextern inter_matrix
cextern inter_matrix_fix
141 :	Isibaar	3
142 :
;===========================================================================
;
; quantd table
;
; Entry q (q = 1..31) is four copies of the word
;     ((VM18P * q) + (VM18Q / 2)) / VM18Q
; evaluated with integer arithmetic at assembly time.  quant4_intra_mmx
; fetches it with [quantd + quant*8 - 8] and adds it before the final
; division by 2*quant.
;
;===========================================================================

%define VM18P 3
%define VM18Q 4

%macro MMX_QUANTD 1
    times 4 dw ((VM18P*%1) + (VM18Q/2)) / VM18Q
%endmacro

quantd:
%assign MMX_QUANTD_Q 1
%rep 31
    MMX_QUANTD MMX_QUANTD_Q             ; row for quantiser MMX_QUANTD_Q
    %assign MMX_QUANTD_Q MMX_QUANTD_Q + 1
%endrep
%undef MMX_QUANTD_Q
189 :
190 :
;===========================================================================
;
; multiply by quant table
;
; Entry q (q = 1..31) is four copies of the word q.  The dequantisers fetch
; it with [mmx_mul_quant + quant*8 - 8] and scale it themselves (psllw by 2
; for intra, doubling for inter).
;
;===========================================================================

%macro MMX_MUL_QUANT 1
    times 4 dw %1
%endmacro

mmx_mul_quant:
%assign MMX_MUL_QUANT_Q 1
%rep 31
    MMX_MUL_QUANT MMX_MUL_QUANT_Q       ; row for quantiser MMX_MUL_QUANT_Q
    %assign MMX_MUL_QUANT_Q MMX_MUL_QUANT_Q + 1
%endrep
%undef MMX_MUL_QUANT_Q
233 :
;===========================================================================
;
; saturation limits
;
; mmx_32767_minus_2047 / mmx_32768_minus_2048 are the offsets used by the
; saturating add/subtract pairs that clamp the DC value in
; dequant4_intra_mmx to [-2048, 2047].  `zero` is the all-zero compare
; operand of the dequantiser loops.  mmx_2047 and mmx_minus_2048 are not
; referenced by the code visible in this file.
;
;===========================================================================

align 16

mmx_32767_minus_2047:
    times 4 dw (32767-2047)
mmx_32768_minus_2048:
    times 4 dw (32768-2048)
mmx_2047:
    times 4 dw 2047
mmx_minus_2048:
    times 4 dw (-2048)
zero:
    times 4 dw 0

section .text
249 :
;===========================================================================
;
; void quant4_intra_mmx(int16_t * coeff,
;                       const int16_t const * data,
;                       const uint32_t quant,
;                       const uint32_t dcscalar);
;
; Quantises 64 intra coefficients with the intra matrix, then overwrites
; coeff[0] with data[0] divided by dcscalar (rounded away from zero by
; dcscalar/2).
;
; Arguments are read from the stack (32-bit, after three pushes):
;   [esp + 12 +  4]  coeff     -> edi   destination
;   [esp + 12 +  8]  data      -> esi   source
;   [esp + 12 + 12]  quant     -> eax   only AL is tested for 1 and 2
;   [esp + 12 + 16]  dcscalar  -> ecx   (loaded at .done)
;
; Register roles inside the loops:
;   ecx        quadword index, 0..16 in steps of 2 (8 words per iteration)
;   mm5        quantd[quant]
;   mm7        mmx_div[quant]   (general path only)
;   mm1, mm4   sign masks of the two quadwords being processed
;   mm2        scratch
;
; Three loop variants share the same front end and back end:
;   quant == 1 : (x + quantd) >> 1
;   quant == 2 : (x + quantd) >> 2
;   otherwise  : ((x + quantd) * mmx_div[quant]) >> 17
;
; Clobbers: eax, edx, mm0-mm5, mm7 (general path), flags.
; ecx, esi and edi are saved and restored.  No emms is executed here.
;
;===========================================================================

; Front end shared by the three intra loops.
; Loads two quadwords of data into mm0/mm3, replaces them by their absolute
; values (sign masks kept in mm1/mm4), then computes
;     ((|level| << 4) + (intra_matrix[i] >> 1)) pmulhw intra_matrix_fix[i]
; and finally adds quantd (mm5).  intra_matrix_fix is external; according to
; the original comment the multiply implements the division by
; intra_matrix[i].
%macro QUANT4_INTRA_SCALE 0
    movq    mm0, [esi + 8*ecx]          ; mm0 = [1st]
    movq    mm3, [esi + 8*ecx + 8]      ; mm3 = [2nd]

    pxor    mm1, mm1                    ; mm1 = 0
    pxor    mm4, mm4

    pcmpgtw mm1, mm0                    ; mm1 = (0 > mm0)
    pcmpgtw mm4, mm3

    pxor    mm0, mm1                    ; mm0 = |mm0|
    pxor    mm3, mm4
    psubw   mm0, mm1                    ; displace
    psubw   mm3, mm4

    psllw   mm0, 4                      ; level << 4
    psllw   mm3, 4

    movq    mm2, [intra_matrix + 8*ecx]
    psrlw   mm2, 1                      ; intra_matrix[i] >> 1
    paddw   mm0, mm2

    movq    mm2, [intra_matrix_fix + ecx*8]
    pmulhw  mm0, mm2                    ; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]

    movq    mm2, [intra_matrix + 8*ecx + 8]
    psrlw   mm2, 1
    paddw   mm3, mm2

    movq    mm2, [intra_matrix_fix + ecx*8 + 8]
    pmulhw  mm3, mm2

    paddw   mm0, mm5                    ; + quantd
    paddw   mm3, mm5
%endmacro

; Back end shared by the three intra loops.
; Re-applies the signs saved in mm1/mm4, stores both quadwords to coeff and
; advances ecx.  Leaves the flags of `cmp ecx, 16` for the caller's jnz.
%macro QUANT4_INTRA_STORE 0
    pxor    mm0, mm1                    ; mm0 *= sign(mm0)
    pxor    mm3, mm4
    psubw   mm0, mm1                    ; undisplace
    psubw   mm3, mm4

    movq    [edi + 8*ecx], mm0
    movq    [edi + 8*ecx + 8], mm3

    add     ecx, 2
    cmp     ecx, 16
%endmacro

align ALIGN
cglobal quant4_intra_mmx
quant4_intra_mmx:

    push    ecx
    push    esi
    push    edi

    mov     edi, [esp + 12 + 4]         ; coeff
    mov     esi, [esp + 12 + 8]         ; data
    mov     eax, [esp + 12 + 12]        ; quant

    movq    mm5, [quantd + eax * 8 - 8] ; quantd -> mm5

    xor     ecx, ecx
    cmp     al, 1
    jz      near .q1loop

    cmp     al, 2
    jz      near .q2loop

    movq    mm7, [mmx_div + eax * 8 - 8] ; multipliers[quant] -> mm7

align ALIGN
.loop:
    QUANT4_INTRA_SCALE

    pmulhw  mm0, mm7                    ; mm0 = (mm0 / 2Q) >> 16
    pmulhw  mm3, mm7
    psrlw   mm0, 1                      ; additional shift by 1 => 16 + 1 = 17
    psrlw   mm3, 1

    QUANT4_INTRA_STORE
    jnz     near .loop

.done:
    ; calculate data[0] // (int32_t)dcscalar

    mov     ecx, [esp + 12 + 16]        ; dcscalar
    mov     edx, ecx
    movsx   eax, word [esi]             ; data[0]
    shr     edx, 1                      ; edx = dcscalar / 2
    cmp     eax, 0
    jg      .gtzero

    sub     eax, edx                    ; data[0] <= 0: round towards -infinity side
    jmp     short .mul
.gtzero:
    add     eax, edx                    ; data[0] > 0: round towards +infinity side
.mul:
    cdq                                 ; expand eax -> edx:eax
    idiv    ecx                         ; eax = edx:eax / dcscalar

    mov     [edi], ax                   ; coeff[0] = ax

    pop     edi
    pop     esi
    pop     ecx

    ret

align ALIGN
.q1loop:
    QUANT4_INTRA_SCALE

    psrlw   mm0, 1                      ; mm0 >>= 1 (/2)
    psrlw   mm3, 1

    QUANT4_INTRA_STORE
    jnz     near .q1loop
    jmp     near .done


align ALIGN
.q2loop:
    QUANT4_INTRA_SCALE

    psrlw   mm0, 2                      ; mm0 >>= 2 (/4)
    psrlw   mm3, 2

    QUANT4_INTRA_STORE
    jnz     near .q2loop
    jmp     near .done
465 :
466 :
;===========================================================================
;
; uint32_t quant4_inter_mmx(int16_t * coeff,
;                           const int16_t const * data,
;                           const uint32_t quant);
;
; Quantises 64 inter coefficients with the inter matrix and returns, in eax,
; the sum of the absolute values of the quantised levels.
;
; Arguments are read from the stack (32-bit, after three pushes):
;   [esp + 12 +  4]  coeff  -> edi   destination
;   [esp + 12 +  8]  data   -> esi   source
;   [esp + 12 + 12]  quant  -> eax   only AL is tested for 1 and 2
;
; Register roles inside the loops:
;   ecx        quadword index, 0..16 in steps of 2 (8 words per iteration)
;   mm5        running sum, four 16-bit lanes (added with wrapping paddw)
;   mm7        mmx_div[quant]   (general path only)
;   mm1, mm4   sign masks of the two quadwords being processed
;   mm2        scratch
;
; Three loop variants share the same front end and back end:
;   quant == 1 : x >> 1
;   quant == 2 : x >> 2
;   otherwise  : (x * mmx_div[quant]) >> 17
;
; Clobbers: eax, mm0-mm5, mm7 (general path), flags.
; ecx, esi and edi are saved and restored.  No emms is executed here.
;
;===========================================================================

; Front end shared by the three inter loops.
; Loads two quadwords of data into mm0/mm3, replaces them by their absolute
; values (sign masks kept in mm1/mm4), then computes
;     ((|level| << 4) + (inter_matrix[i] >> 1)) pmulhw inter_matrix_fix[i]
; inter_matrix_fix is external; according to the original comment the
; multiply implements the division by inter_matrix[i].
%macro QUANT4_INTER_SCALE 0
    movq    mm0, [esi + 8*ecx]          ; mm0 = [1st]
    movq    mm3, [esi + 8*ecx + 8]      ; mm3 = [2nd]

    pxor    mm1, mm1                    ; mm1 = 0
    pxor    mm4, mm4

    pcmpgtw mm1, mm0                    ; mm1 = (0 > mm0)
    pcmpgtw mm4, mm3

    pxor    mm0, mm1                    ; mm0 = |mm0|
    pxor    mm3, mm4
    psubw   mm0, mm1                    ; displace
    psubw   mm3, mm4

    psllw   mm0, 4                      ; level << 4
    psllw   mm3, 4

    movq    mm2, [inter_matrix + 8*ecx]
    psrlw   mm2, 1                      ; inter_matrix[i] >> 1
    paddw   mm0, mm2

    movq    mm2, [inter_matrix_fix + ecx*8]
    pmulhw  mm0, mm2                    ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]

    movq    mm2, [inter_matrix + 8*ecx + 8]
    psrlw   mm2, 1
    paddw   mm3, mm2

    movq    mm2, [inter_matrix_fix + ecx*8 + 8]
    pmulhw  mm3, mm2
%endmacro

; Back end shared by the three inter loops.
; Adds the (still unsigned) levels to the running sum in mm5, re-applies the
; signs saved in mm1/mm4, stores both quadwords to coeff and advances ecx.
; Leaves the flags of `cmp ecx, 16` for the caller's jnz.
%macro QUANT4_INTER_SUM_STORE 0
    paddw   mm5, mm0                    ; sum += mm0
    pxor    mm0, mm1                    ; mm0 *= sign(mm0)
    paddw   mm5, mm3                    ; sum += mm3
    pxor    mm3, mm4
    psubw   mm0, mm1                    ; undisplace
    psubw   mm3, mm4

    movq    [edi + 8*ecx], mm0
    movq    [edi + 8*ecx + 8], mm3

    add     ecx, 2
    cmp     ecx, 16
%endmacro

align ALIGN
cglobal quant4_inter_mmx
quant4_inter_mmx:

    push    ecx
    push    esi
    push    edi

    mov     edi, [esp + 12 + 4]         ; coeff
    mov     esi, [esp + 12 + 8]         ; data
    mov     eax, [esp + 12 + 12]        ; quant

    xor     ecx, ecx

    pxor    mm5, mm5                    ; sum

    cmp     al, 1
    jz      near .q1loop

    cmp     al, 2
    jz      near .q2loop

    movq    mm7, [mmx_div + eax * 8 - 8] ; divider

align ALIGN
.loop:
    QUANT4_INTER_SCALE

    pmulhw  mm0, mm7                    ; mm0 = (mm0 / 2Q) >> 16
    pmulhw  mm3, mm7
    psrlw   mm0, 1                      ; additional shift by 1 => 16 + 1 = 17
    psrlw   mm3, 1

    QUANT4_INTER_SUM_STORE
    jnz     near .loop

.done:
    pmaddwd mm5, [mmx_one]              ; four word sums -> two dword sums
    movq    mm0, mm5
    psrlq   mm5, 32
    paddd   mm0, mm5                    ; low dword = total
    movd    eax, mm0                    ; return sum

    pop     edi
    pop     esi
    pop     ecx

    ret

align ALIGN
.q1loop:
    QUANT4_INTER_SCALE

    psrlw   mm0, 1                      ; mm0 >>= 1 (/2)
    psrlw   mm3, 1

    QUANT4_INTER_SUM_STORE
    jnz     near .q1loop

    jmp     .done


align ALIGN
.q2loop:
    QUANT4_INTER_SCALE

    psrlw   mm0, 2                      ; mm0 >>= 2 (/4)
    psrlw   mm3, 2

    QUANT4_INTER_SUM_STORE
    jnz     near .q2loop

    jmp     .done
664 :
665 :
;===========================================================================
;
; void dequant4_intra_mmx(int16_t *data,
;                         const int16_t const *coeff,
;                         const uint32_t quant,
;                         const uint32_t dcscalar);
;
; Arguments are read from the stack (32-bit, no registers are pushed):
;   [esp +  4]  data      -> edx   destination
;   [esp +  8]  coeff     -> ecx   source
;   [esp + 12]  quant     -> eax   (then reused as the loop index)
;   [esp + 16]  dcscalar           (used as a memory operand for the DC term)
;
; Clobbers: eax, ecx, edx, mm0-mm7, flags.  No emms is executed here.
;
;===========================================================================

; Note: in order to saturate 'easily', we pre-multiply the quantiser by 4
; (shift left by 2).  Then the high word of (coeff[i]*matrix[i]*quant) is
; used to build a saturating mask.  It is non-zero only when an overflow
; occurred.  We thus avoid packing/unpacking toward double-word.
; Moreover, we perform the mult (matrix[i]*quant) first, instead of, e.g.,
; (coeff[i]*matrix[i]).  This is less prone to overflow if coeff[] are not
; checked.  Input ranges are: coeff in [-127,127], inter_matrix in [1..255],
; and quant in [1..31].
;
; The original loop is:
;
%if 0
    movq    mm0, [ecx+8*eax + 8*16]     ; mm0 = coeff[i]
    pxor    mm1, mm1
    pcmpgtw mm1, mm0
    pxor    mm0, mm1                    ; change sign if negative
    psubw   mm0, mm1                    ; -> mm0 = abs(coeff[i]), mm1 = sign of coeff[i]

    movq    mm2, mm7                    ; mm2 = quant
    pmullw  mm2, [intra_matrix + 8*eax + 8*16 ] ; matrix[i]*quant.

    movq    mm6, mm2
    pmulhw  mm2, mm0                    ; high of coeff*(matrix*quant) (should be 0 if no overflow)
    pmullw  mm0, mm6                    ; low of coeff*(matrix*quant)

    pxor    mm5, mm5
    pcmpgtw mm2, mm5                    ; overflow?
    psrlw   mm2, 5                      ; =0 if no clamp, 2047 otherwise
    psrlw   mm0, 5
    paddw   mm0, mm1                    ; start restoring sign
    por     mm0, mm2                    ; saturate to 2047 if needed
    pxor    mm0, mm1                    ; finish negating back

    movq    [edx + 8*eax + 8*16], mm0   ; data[i]
    add     eax, 1
%endif

;********************************************************************

align 16
cglobal dequant4_intra_mmx
dequant4_intra_mmx:

    mov     edx, [esp+4]                ; data
    mov     ecx, [esp+8]                ; coeff
    mov     eax, [esp+12]               ; quant

    movq    mm7, [mmx_mul_quant + eax*8 - 8]
    mov     eax, -16                    ; to keep aligned, we regularly process coeff[0]
                                        ; (eax runs -16..0 in steps of 2 quadwords;
                                        ;  the DC term is overwritten after the loop)
    psllw   mm7, 2                      ; << 2. See the note above the function.
    pxor    mm6, mm6                    ; this is a NOP (mm6 is reloaded in the loop)

align 16
.loop:
    movq    mm0, [ecx+8*eax + 8*16]     ; mm0 = c  = coeff[i]
    movq    mm3, [ecx+8*eax + 8*16 +8]  ; mm3 = c' = coeff[i+1]
    pxor    mm1, mm1
    pxor    mm4, mm4
    pcmpgtw mm1, mm0                    ; mm1 = sgn(c)
    movq    mm2, mm7                    ; mm2 = quant

    pcmpgtw mm4, mm3                    ; mm4 = sgn(c')
    pmullw  mm2, [intra_matrix + 8*eax + 8*16 ] ; matrix[i]*quant

    pxor    mm0, mm1                    ; negate if negative
    pxor    mm3, mm4                    ; negate if negative

    psubw   mm0, mm1
    psubw   mm3, mm4

    ; we're short on register, here. Poor pairing...

    movq    mm5, mm2
    pmullw  mm2, mm0                    ; low of coeff*(matrix*quant)

    pmulhw  mm0, mm5                    ; high of coeff*(matrix*quant)
    movq    mm5, mm7                    ; mm5 = quant

    pmullw  mm5, [intra_matrix + 8*eax + 8*16 +8] ; matrix[i+1]*quant

    movq    mm6, mm5
    add     eax, 2                      ; z-flag will be tested later (the MMX
                                        ; instructions in between leave EFLAGS alone)

    pmullw  mm6, mm3                    ; low of coeff*(matrix*quant)
    pmulhw  mm3, mm5                    ; high of coeff*(matrix*quant)

    pcmpgtw mm0, [zero]                 ; mm0 = 0xFFFF where the high word is > 0 (overflow)
    paddusw mm2, mm0                    ; overflowed lanes saturate to 0xFFFF
    psrlw   mm2, 5                      ; 0xFFFF >> 5 = 2047 in overflowed lanes

    pcmpgtw mm3, [zero]
    paddusw mm6, mm3
    psrlw   mm6, 5

    pxor    mm2, mm1                    ; start negating back
    pxor    mm6, mm4                    ; start negating back

    psubusw mm1, mm0                    ; drop the +1 correction in overflowed lanes
    psubusw mm4, mm3

    psubw   mm2, mm1                    ; finish negating back
    psubw   mm6, mm4                    ; finish negating back

    movq    [edx + 8*eax + 8*16 -2*8 ], mm2   ; data[i]
    movq    [edx + 8*eax + 8*16 -2*8 +8], mm6 ; data[i+1]

    jnz     near .loop                  ; flags still come from `add eax, 2`

    ; deal with DC

    movd    mm0, [ecx]                  ; low word = coeff[0]
    pmullw  mm0, [esp+16]               ; dcscalar (64-bit operand; only the low word
                                        ; of the product is stored below)
    movq    mm2, [mmx_32767_minus_2047]
    paddsw  mm0, mm2                    ; saturating add/sub pair clamps to <= 2047
    psubsw  mm0, mm2
    movq    mm2, [mmx_32768_minus_2048]
    psubsw  mm0, mm2                    ; saturating sub/add pair clamps to >= -2048
    paddsw  mm0, mm2
    movd    eax, mm0
    mov     [edx], ax                   ; data[0]

    ret
797 :
;===========================================================================
;
; void dequant4_inter_mmx(int16_t * data,
;                         const int16_t * const coeff,
;                         const uint32_t quant);
;
; Arguments are read from the stack (32-bit, no registers are pushed):
;   [esp +  4]  data   -> edx   destination
;   [esp +  8]  coeff  -> ecx   source
;   [esp + 12]  quant  -> eax   (then reused as the loop index)
;
; After the loop, the XOR of all 64 output words is folded to one bit and
; used to toggle the least significant bit of data[63] (mismatch control).
;
; Clobbers: eax, ecx, edx, mm0-mm7, flags.  No emms is executed here.
;
;===========================================================================

; Note: We use (2*c + sgn(c) - sgn(-c)) as multiplier
; so we handle the 3 cases: c<0, c==0, and c>0 in one shot.
; sgn(x) is the result of 'pcmpgtw 0,x': 0 if x>=0, -1 if x<0.
; It's mixed with the extraction of the absolute value.

align 16
cglobal dequant4_inter_mmx
dequant4_inter_mmx:

    mov     edx, [esp+ 4]               ; data
    mov     ecx, [esp+ 8]               ; coeff
    mov     eax, [esp+12]               ; quant
    movq    mm7, [mmx_mul_quant + eax*8 - 8]
    mov     eax, -16                    ; index runs -16..0 in steps of 2 quadwords
    paddw   mm7, mm7                    ; << 1
    pxor    mm6, mm6                    ; mismatch sum

align 16
.loop:
    movq    mm0, [ecx+8*eax + 8*16 ]    ; mm0 = coeff[i]
    movq    mm2, [ecx+8*eax + 8*16 +8]  ; mm2 = coeff[i+1]
    add     eax, 2                      ; z-flag is tested by the jnz at the bottom;
                                        ; the MMX instructions in between leave EFLAGS alone

    pxor    mm1, mm1
    pxor    mm3, mm3
    pcmpgtw mm1, mm0                    ; mm1 = sgn(c)  (preserved)
    pcmpgtw mm3, mm2                    ; mm3 = sgn(c') (preserved)
    paddsw  mm0, mm1                    ; c  += sgn(c)
    paddsw  mm2, mm3                    ; c' += sgn(c')
    paddw   mm0, mm0                    ; c  *= 2
    paddw   mm2, mm2                    ; c' *= 2

    pxor    mm4, mm4
    pxor    mm5, mm5
    psubw   mm4, mm0                    ; -c
    psubw   mm5, mm2                    ; -c'
    psraw   mm4, 16                     ; mm4 = sgn(-c)
    psraw   mm5, 16                     ; mm5 = sgn(-c')
    psubsw  mm0, mm4                    ; c  -= sgn(-c)
    psubsw  mm2, mm5                    ; c' -= sgn(-c')
    pxor    mm0, mm1                    ; finish changing sign if needed
    pxor    mm2, mm3                    ; finish changing sign if needed

    ; we're short on register, here. Poor pairing...

    movq    mm4, mm7                    ; (matrix*quant)
    pmullw  mm4, [inter_matrix + 8*eax + 8*16 -2*8]
    movq    mm5, mm4
    pmulhw  mm5, mm0                    ; high of c*(matrix*quant)
    pmullw  mm0, mm4                    ; low of c*(matrix*quant)

    movq    mm4, mm7                    ; (matrix*quant)
    pmullw  mm4, [inter_matrix + 8*eax + 8*16 -2*8 + 8]

    pcmpgtw mm5, [zero]                 ; mm5 = 0xFFFF where the high word is > 0 (overflow)
    paddusw mm0, mm5                    ; overflowed lanes saturate to 0xFFFF
    psrlw   mm0, 5                      ; 0xFFFF >> 5 = 2047 in overflowed lanes
    pxor    mm0, mm1                    ; start restoring sign
    psubusw mm1, mm5                    ; drop the +1 correction in overflowed lanes

    movq    mm5, mm4
    pmulhw  mm5, mm2                    ; high of c*(matrix*quant)
    pmullw  mm2, mm4                    ; low of c*(matrix*quant)
    psubw   mm0, mm1                    ; finish restoring sign

    pcmpgtw mm5, [zero]
    paddusw mm2, mm5
    psrlw   mm2, 5
    pxor    mm2, mm3                    ; start restoring sign
    psubusw mm3, mm5
    psubw   mm2, mm3                    ; finish restoring sign

    pxor    mm6, mm0                    ; mismatch control
    movq    [edx + 8*eax + 8*16 -2*8 ], mm0   ; data[i]
    pxor    mm6, mm2                    ; mismatch control
    movq    [edx + 8*eax + 8*16 -2*8 +8], mm2 ; data[i+1]

    jnz     near .loop                  ; flags still come from `add eax, 2`

    ; mismatch control: fold the four word lanes of mm6 into the low word

    movq    mm0, mm6
    psrlq   mm0, 48
    movq    mm1, mm6
    movq    mm2, mm6
    psrlq   mm1, 32
    pxor    mm6, mm0
    psrlq   mm2, 16
    pxor    mm6, mm1
    pxor    mm6, mm2
    movd    eax, mm6
    and     eax, 1                      ; parity bit of the XOR of all outputs
    xor     eax, 1                      ; 1 when that bit is clear
    xor     word [edx + 2*63], ax       ; toggle LSB of data[63] in that case

    ret
902 :	Isibaar	3

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4