Annotation of /trunk/xvidcore/src/quant/x86_64_asm/quantize_h263_mmx.asm

Revision 1793 - (view) (download)

1 :	edgomez	1586	;/*****************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * - MPEG4 Quantization H263 implementation / MMX optimized -
5 :			; *
6 :			; * Copyright(C) 2001-2003 Peter Ross <pross@xvid.org>
7 :			; * 2002-2003 Pascal Massimino <skal@planet-d.net>
8 :			; * 2004 Andre Werthmann <wertmann@aei.mpg.de>
9 :			; *
10 :			; * This program is free software ; you can redistribute it and/or modify
11 :			; * it under the terms of the GNU General Public License as published by
12 :			; * the Free Software Foundation ; either version 2 of the License, or
13 :			; * (at your option) any later version.
14 :			; *
15 :			; * This program is distributed in the hope that it will be useful,
16 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
17 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 :			; * GNU General Public License for more details.
19 :			; *
20 :			; * You should have received a copy of the GNU General Public License
21 :			; * along with this program ; if not, write to the Free Software
22 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 :			; *
24 :	Isibaar	1793	; * $Id: quantize_h263_mmx.asm,v 1.3 2008-11-11 20:46:24 Isibaar Exp $
25 :	edgomez	1586	; *
26 :			; ****************************************************************************/
27 :
; SATURATE is described as enabling dequant saturation to [-2048,2047] for
; test purposes only.
; NOTE(review): no %ifdef in the visible source tests this symbol - confirm
; whether it is still needed.
%define SATURATE

; Everything below is assembled as 64-bit code.
BITS 64
32 :
;-----------------------------------------------------------------------------
; cglobal <name>
;
; Declares <name> as a global symbol and defines ENDFUNC, the token placed
; after the last instruction of every function.
;
;   PREFIX      the exported symbol is _<name>; <name> is redefined so that
;               the label written later in the file resolves to _<name>.
;   MARK_FUNCS  the symbol is declared as a function whose size is the
;               distance from its entry label to its local .endfunc label,
;               and ENDFUNC expands to that .endfunc label. Without
;               MARK_FUNCS, ENDFUNC expands to nothing.
;
; With both PREFIX and MARK_FUNCS set, <name> must map to the bare _<name>
; (not to the whole "global" operand) or every later "<name>:" label would
; expand to invalid syntax; the size expression must also use the prefixed
; label, because that is the only one that gets defined.
;-----------------------------------------------------------------------------
%macro cglobal 1
  %ifdef PREFIX
    %ifdef MARK_FUNCS
      global _%1:function _%1.endfunc-_%1
      %define %1 _%1
      %define ENDFUNC .endfunc
    %else
      global _%1
      %define %1 _%1
      %define ENDFUNC
    %endif
  %else
    %ifdef MARK_FUNCS
      global %1:function %1.endfunc-%1
      %define ENDFUNC .endfunc
    %else
      global %1
      %define ENDFUNC
    %endif
  %endif
%endmacro
54 :
;=============================================================================
; Read-only local data
;
; Every per-quantizer table holds 31 entries (quantizer 1..31). Each entry is
; one value replicated into four 16-bit words (8 bytes), so the code fetches
; the entry for quantizer Q from [table + Q*8 - 8].
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

; Eight words of 1: pmaddwd operand used to fold the inter-quant word sums.
ALIGN 16
plus_one:
        times 8 dw 1

;-----------------------------------------------------------------------------
; mmx_sub[Q] = Q / 2          (bias removed before the inter division)
;-----------------------------------------------------------------------------

ALIGN 16
mmx_sub:
%assign qstep 0
%rep 31
  %assign qstep qstep+1
        times 4 dw qstep / 2
%endrep

;-----------------------------------------------------------------------------
; mmx_div[Q] = (1 << 16) / (2 * Q) + 1
;
; The 16-bit shift lets pmulhw perform the division by 2Q (it keeps the high
; word of the product). For Q = 1 the entry is 32769, which does not fit a
; signed word, so pmulhw would overflow; the quant routines handle Q = 1
; separately with a shift. (Original note: an unsigned high multiply,
; pmulhuw, would not overflow.)
;-----------------------------------------------------------------------------

ALIGN 16
mmx_div:
%assign qstep 0
%rep 31
  %assign qstep qstep+1
        times 4 dw (1<<16) / (qstep*2) + 1
%endrep

;-----------------------------------------------------------------------------
; mmx_add[Q] = Q if Q is odd, Q - 1 if Q is even   (dequant offset)
;-----------------------------------------------------------------------------

ALIGN 16
mmx_add:
%assign qstep 0
%rep 31
  %assign qstep qstep+1
  %if qstep % 2 != 0
        times 4 dw qstep
  %else
        times 4 dw qstep - 1
  %endif
%endrep

;-----------------------------------------------------------------------------
; mmx_mul[Q] = 2 * Q          (dequant multiplier)
;-----------------------------------------------------------------------------

ALIGN 16
mmx_mul:
%assign qstep 0
%rep 31
  %assign qstep qstep+1
        times 4 dw qstep*2
%endrep

;-----------------------------------------------------------------------------
; Saturation limits
;-----------------------------------------------------------------------------

ALIGN 16
sse2_2047:                              ; eight words of 2047
        times 8 dw 2047

ALIGN 16
mmx_2047:                               ; four words of 2047 (pminsw upper bound)
        times 4 dw 2047

; Operands for the saturating add/subtract pairs that clamp the intra DC
; value to [-2048, 2047].
ALIGN 8
mmx_32768_minus_2048:
        times 4 dw (32768-2048)

mmx_32767_minus_2047:
        times 4 dw (32767-2047)
153 :
154 :
;=============================================================================
; Code
;=============================================================================

SECTION .text align=16

; Entry points exported from this file: two quantizers, two dequantizers.
cglobal quant_h263_intra_x86_64
cglobal quant_h263_inter_x86_64
cglobal dequant_h263_intra_x86_64
cglobal dequant_h263_inter_x86_64
165 :
;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_intra_x86_64(int16_t * coeff,
;                                  const int16_t const * data,
;                                  const uint32_t quant,
;                                  const uint32_t dcscalar,
;                                  const uint16_t *mpeg_matrices);
;
; Port of the 32bit mmx cousin.
;
; In:    rdi = coeff (output), rsi = data (input),
;        edx = quant, ecx = dcscalar (mpeg_matrices is not read)
; Out:   eax = 0
; Uses:  rax, rcx, rdx, r8, r9, mm0, mm1, mm3, mm4, mm7, flags
;
; Quantizes 64 coefficients as sign(d) * (|d| / (2*quant)), then overwrites
; coeff[0] with data[0] divided by dcscalar (biased by dcscalar/2).
; quant indexes a 31-entry table and is not range-checked. No emms is issued
; here, so the MMX state is left active on return.
;
; quant and dcscalar are 32-bit arguments; only their low 32 bits are used,
; because the upper halves of rdx/rcx are not guaranteed to be zero.
;-----------------------------------------------------------------------------

ALIGN 16
quant_h263_intra_x86_64:
        mov     eax, edx                ; eax = quant (32-bit move clears rax[63:32])
        mov     r8d, ecx                ; r8d = dcscalar; rcx becomes the loop index

        xor     ecx, ecx                ; rcx = offset in 8-byte units
        cmp     eax, 1
        jz      .q1loop                 ; quant == 1: mmx_div entry overflows pmulhw

        lea     r9, [mmx_div wrt rip]
        movq    mm7, [r9 + rax*8 - 8]   ; mm7 = 4 x mmx_div[quant]

ALIGN 16
.loop:
        movq    mm0, [rsi + 8*rcx]      ; mm0 = 4 input words
        movq    mm3, [rsi + 8*rcx + 8]  ; mm3 = next 4 input words
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; mm1 = sign mask (0 > mm0)
        pcmpgtw mm4, mm3
        pxor    mm0, mm1                ; (x ^ mask) - mask  ==  |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        pmulhw  mm0, mm7                ; high word of |x| * mmx_div  ==  |x| / 2Q
        pmulhw  mm3, mm7
        pxor    mm0, mm1                ; reapply the original sign
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        movq    [rdi + 8*rcx], mm0
        movq    [rdi + 8*rcx + 8], mm3

        add     rcx, 2                  ; 16 bytes per iteration
        cmp     rcx, 16                 ; 16 * 8 bytes = 64 coefficients
        jnz     .loop

.done:
        ; coeff[0] = (data[0] +/- dcscalar/2) / (int32_t)dcscalar
        mov     ecx, r8d                ; ecx = dcscalar
        mov     edx, ecx
        movsx   eax, word [rsi]         ; eax = data[0], sign-extended
        shr     edx, 1                  ; edx = dcscalar / 2
        test    eax, eax
        jg      .gtzero

        sub     eax, edx                ; data[0] <= 0: bias downwards
        jmp     short .mul

.gtzero:
        add     eax, edx                ; data[0] > 0: bias upwards
.mul:
        cdq                             ; sign-extend eax into edx:eax
        idiv    ecx                     ; eax = edx:eax / dcscalar
        mov     [rdi], ax               ; coeff[0] replaces the value stored by the loop

        xor     eax, eax                ; return 0
        ret

ALIGN 16
.q1loop:
        ; quant == 1: same as .loop, with the division by 2 done as a shift.
        movq    mm0, [rsi + 8*rcx]
        movq    mm3, [rsi + 8*rcx + 8]
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; sign masks
        pcmpgtw mm4, mm3
        pxor    mm0, mm1                ; |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        psrlw   mm0, 1                  ; |x| / 2
        psrlw   mm3, 1
        pxor    mm0, mm1                ; reapply the original sign
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        movq    [rdi + 8*rcx], mm0
        movq    [rdi + 8*rcx + 8], mm3

        add     rcx, 2
        cmp     rcx, 16
        jnz     .q1loop

        jmp     .done
ENDFUNC
266 :	edgomez	1586
267 :
;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_inter_x86_64(int16_t * coeff,
;                                  const int16_t const * data,
;                                  const uint32_t quant,
;                                  const uint16_t *mpeg_matrices);
;
; Port of the 32bit mmx cousin.
;
; In:    rdi = coeff (output), rsi = data (input), edx = quant
;        (mpeg_matrices is not read)
; Out:   eax = sum of the quantized magnitudes (upper half of rax cleared)
; Uses:  rax, rcx, r9, mm0, mm1, mm3-mm7, flags
;
; Quantizes 64 coefficients as sign(d) * ((|d| - quant/2) / (2*quant)), with
; the subtraction saturating at zero. The magnitudes are accumulated in four
; 16-bit lanes and folded into one 32-bit total at the end.
; quant indexes 31-entry tables and is not range-checked. No emms is issued
; here, so the MMX state is left active on return.
;
; quant is a 32-bit argument; only edx is used, because the upper half of rdx
; is not guaranteed to be zero.
;-----------------------------------------------------------------------------

ALIGN 16
quant_h263_inter_x86_64:
        mov     eax, edx                ; eax = quant (32-bit move clears rax[63:32])

        xor     ecx, ecx                ; rcx = offset in 8-byte units

        pxor    mm5, mm5                ; mm5 = running sum (4 word lanes)
        lea     r9, [mmx_sub wrt rip]
        movq    mm6, [r9 + rax*8 - 8]   ; mm6 = 4 x quant/2

        cmp     eax, 1
        jz      .q1loop                 ; quant == 1: mmx_div entry overflows pmulhw

        lea     r9, [mmx_div wrt rip]
        movq    mm7, [r9 + rax*8 - 8]   ; mm7 = 4 x mmx_div[quant]

ALIGN 8
.loop:
        movq    mm0, [rsi + 8*rcx]      ; mm0 = 4 input words
        movq    mm3, [rsi + 8*rcx + 8]  ; mm3 = next 4 input words
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; mm1 = sign mask (0 > mm0)
        pcmpgtw mm4, mm3
        pxor    mm0, mm1                ; (x ^ mask) - mask  ==  |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        psubusw mm0, mm6                ; |x| - quant/2, clamped at 0
        psubusw mm3, mm6
        pmulhw  mm0, mm7                ; high word of product  ==  value / 2Q
        pmulhw  mm3, mm7
        paddw   mm5, mm0                ; accumulate magnitudes before re-signing
        pxor    mm0, mm1                ; reapply the original sign
        paddw   mm5, mm3
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        movq    [rdi + 8*rcx], mm0
        movq    [rdi + 8*rcx + 8], mm3

        add     rcx, 2                  ; 16 bytes per iteration
        cmp     rcx, 16                 ; 16 * 8 bytes = 64 coefficients
        jnz     .loop

.done:
        pmaddwd mm5, [plus_one wrt rip] ; 4 word sums -> 2 dword sums
        movq    mm0, mm5
        psrlq   mm5, 32
        paddd   mm0, mm5                ; low dword = total

        movd    eax, mm0                ; return the 32-bit total only

        ret

ALIGN 8
.q1loop:
        ; quant == 1: same as .loop, with the division by 2 done as a shift.
        movq    mm0, [rsi + 8*rcx]
        movq    mm3, [rsi + 8*rcx + 8]
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; sign masks
        pcmpgtw mm4, mm3
        pxor    mm0, mm1                ; |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        psubusw mm0, mm6                ; |x| - quant/2, clamped at 0
        psubusw mm3, mm6
        psrlw   mm0, 1                  ; / 2
        psrlw   mm3, 1
        paddw   mm5, mm0                ; accumulate magnitudes before re-signing
        pxor    mm0, mm1                ; reapply the original sign
        paddw   mm5, mm3
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        movq    [rdi + 8*rcx], mm0
        movq    [rdi + 8*rcx + 8], mm3

        add     rcx, 2
        cmp     rcx, 16
        jnz     .q1loop

        jmp     .done
ENDFUNC
365 :	edgomez	1586
366 :
;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_intra_x86_64(int16_t *data,
;                                    const int16_t const *coeff,
;                                    const uint32_t quant,
;                                    const uint32_t dcscalar,
;                                    const uint16_t *mpeg_matrices);
;
; Port of the 32bit xmm cousin: same arithmetic as the mmx dequantizer, with
; the upper saturation done by pminsw.
;
; In:    rdi = data (output), rsi = coeff (input),
;        edx = quant, ecx = dcscalar (mpeg_matrices is not read)
; Out:   eax = 0
; Uses:  rax, rcx, rdx, r8, r9, mm0-mm7, flags
;
; For each of the 64 coefficients c:
;     c == 0 -> 0
;     else   -> sign(c) * (2*quant*|c| + (quant odd ? quant : quant-1)),
;               saturated to [-2048, 2047]
; data[0] is then replaced by coeff[0] * dcscalar, clamped to [-2048, 2047].
; quant indexes 31-entry tables and is not range-checked. No emms is issued
; here, so the MMX state is left active on return.
;
; quant and dcscalar are 32-bit arguments; only their low 32 bits are used,
; because the upper halves of rdx/rcx are not guaranteed to be zero.
;-----------------------------------------------------------------------------

ALIGN 16
dequant_h263_intra_x86_64:

        mov     eax, edx                ; eax = quant (32-bit move clears rax[63:32])
        mov     r8d, ecx                ; r8d = dcscalar, kept for the DC step
        mov     rcx, rsi                ; rcx = coeff
        mov     rdx, rdi                ; rdx = data

        lea     r9, [mmx_add wrt rip]
        movq    mm6, [r9 + rax*8 - 8]   ; mm6 = 4 x (quant or quant-1)
        lea     r9, [mmx_mul wrt rip]
        movq    mm7, [r9 + rax*8 - 8]   ; mm7 = 4 x 2*quant
        mov     rax, -16                ; index runs -16..0 in 8-byte units

ALIGN 16
.loop:
        movq    mm0, [rcx + 8*rax + 8*16]       ; c  = coeff[i]
        movq    mm3, [rcx + 8*rax + 8*16 + 8]   ; c' = coeff[i+1]
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; sign(c)
        pcmpgtw mm4, mm3                ; sign(c')
        pxor    mm2, mm2
        pxor    mm5, mm5
        pcmpeqw mm2, mm0                ; c is zero
        pcmpeqw mm5, mm3                ; c' is zero
        pandn   mm2, mm6                ; offset = isZero ? 0 : quant_add
        pandn   mm5, mm6
        pxor    mm0, mm1                ; |c| = (c ^ sign) - sign
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        pmullw  mm0, mm7                ; *= 2Q
        pmullw  mm3, mm7
        paddw   mm0, mm2                ; + offset
        paddw   mm3, mm5
        paddw   mm0, mm1                ; negatives: subtract 1 so the final xor negates
        paddw   mm3, mm4

        ; Clamp to +2047 before the sign is restored; for negative inputs the
        ; xor below turns that bound into -2048.
        movq    mm2, [mmx_2047 wrt rip]
        pminsw  mm0, mm2
        add     rax, 2                  ; sets ZF for the jnz below (MMX ops leave flags alone)
        pminsw  mm3, mm2

        pxor    mm0, mm1                ; finish restoring the sign
        pxor    mm3, mm4
        movq    [rdx + 8*rax + 8*16 - 2*8], mm0     ; -2*8: rax was already advanced
        movq    [rdx + 8*rax + 8*16 + 8 - 2*8], mm3
        jnz     near .loop

        ; DC coefficient: data[0] = clamp(coeff[0] * dcscalar, -2048, 2047)
        movd    mm0, [rcx]              ; low word = coeff[0]
        movd    mm1, r8d                ; low word = dcscalar
        pmullw  mm0, mm1
        movq    mm2, [mmx_32767_minus_2047 wrt rip]
        paddsw  mm0, mm2                ; saturating add/sub pair clamps the top at 2047
        psubsw  mm0, mm2
        movq    mm2, [mmx_32768_minus_2048 wrt rip]
        psubsw  mm0, mm2                ; saturating sub/add pair clamps the bottom at -2048
        paddsw  mm0, mm2
        movd    eax, mm0
        mov     [rdx], ax               ; only the low word is the DC result

        xor     eax, eax                ; return 0
        ret
ENDFUNC
446 :	edgomez	1586
447 :
;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_inter_x86_64(int16_t * data,
;                                    const int16_t * const coeff,
;                                    const uint32_t quant,
;                                    const uint16_t *mpeg_matrices);
;
; Port of the 32bit xmm cousin: same arithmetic as the mmx dequantizer, with
; the upper saturation done by pminsw.
;
; In:    rdi = data (output), rsi = coeff (input), edx = quant
;        (mpeg_matrices is not read)
; Out:   eax = 0
; Uses:  rax, rcx, rdx, r9, mm0-mm7, flags
;
; For each of the 64 coefficients c:
;     c == 0 -> 0
;     else   -> sign(c) * (2*quant*|c| + (quant odd ? quant : quant-1)),
;               saturated to [-2048, 2047]
; quant indexes 31-entry tables and is not range-checked. No emms is issued
; here, so the MMX state is left active on return.
;
; quant is a 32-bit argument; only edx is used, because the upper half of rdx
; is not guaranteed to be zero.
;-----------------------------------------------------------------------------

ALIGN 16
dequant_h263_inter_x86_64:

        mov     eax, edx                ; eax = quant (32-bit move clears rax[63:32])
        mov     rcx, rsi                ; rcx = coeff
        mov     rdx, rdi                ; rdx = data

        lea     r9, [mmx_add wrt rip]
        movq    mm6, [r9 + rax*8 - 8]   ; mm6 = 4 x (quant or quant-1)
        lea     r9, [mmx_mul wrt rip]
        movq    mm7, [r9 + rax*8 - 8]   ; mm7 = 4 x 2*quant
        mov     rax, -16                ; index runs -16..0 in 8-byte units

ALIGN 16
.loop:
        movq    mm0, [rcx + 8*rax + 8*16]       ; c  = coeff[i]
        movq    mm3, [rcx + 8*rax + 8*16 + 8]   ; c' = coeff[i+1]
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; sign(c)
        pcmpgtw mm4, mm3                ; sign(c')
        pxor    mm2, mm2
        pxor    mm5, mm5
        pcmpeqw mm2, mm0                ; c is zero
        pcmpeqw mm5, mm3                ; c' is zero
        pandn   mm2, mm6                ; offset = isZero ? 0 : quant_add
        pandn   mm5, mm6
        pxor    mm0, mm1                ; |c| = (c ^ sign) - sign
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        pmullw  mm0, mm7                ; *= 2Q
        pmullw  mm3, mm7
        paddw   mm0, mm2                ; + offset
        paddw   mm3, mm5
        paddw   mm0, mm1                ; start restoring sign (negatives: minus 1)
        paddw   mm3, mm4

        ; Clamp to +2047 before the sign is restored; for negative inputs the
        ; xor below turns that bound into -2048.
        movq    mm2, [mmx_2047 wrt rip]
        pminsw  mm0, mm2
        add     rax, 2                  ; sets ZF for the jnz below (MMX ops leave flags alone)
        pminsw  mm3, mm2

        pxor    mm0, mm1                ; finish restoring sign
        pxor    mm3, mm4
        movq    [rdx + 8*rax + 8*16 - 2*8], mm0     ; -2*8: rax was already advanced
        movq    [rdx + 8*rax + 8*16 + 8 - 2*8], mm3
        jnz     near .loop

        xor     eax, eax                ; return 0
        ret
ENDFUNC
512 :	Isibaar	1790
; Emit an empty .note.GNU-stack section for ELF output so the linker does not
; mark the stack executable. The output format name is "elf64" when the file
; is assembled as a 64-bit ELF object, so both spellings are checked.
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
516 :

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4