Annotation of /trunk/xvidcore/src/quant/x86_64_asm/quantize_h263_mmx.asm

Revision 1586 - (view) (download)

1 :	edgomez	1586	;/*****************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * - MPEG4 Quantization H263 implementation / MMX optimized -
5 :			; *
6 :			; * Copyright(C) 2001-2003 Peter Ross <pross@xvid.org>
7 :			; * 2002-2003 Pascal Massimino <skal@planet-d.net>
8 :			; * 2004 Andre Werthmann <wertmann@aei.mpg.de>
9 :			; *
10 :			; * This program is free software ; you can redistribute it and/or modify
11 :			; * it under the terms of the GNU General Public License as published by
12 :			; * the Free Software Foundation ; either version 2 of the License, or
13 :			; * (at your option) any later version.
14 :			; *
15 :			; * This program is distributed in the hope that it will be useful,
16 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
17 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 :			; * GNU General Public License for more details.
19 :			; *
20 :			; * You should have received a copy of the GNU General Public License
21 :			; * along with this program ; if not, write to the Free Software
22 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 :			; *
24 :			; * $Id: quantize_h263_mmx.asm,v 1.1 2005-01-05 23:02:15 edgomez Exp $
25 :			; *
26 :			; ****************************************************************************/
27 :
; Enable dequant saturation to [-2048,2047] (test purposes only).
; NOTE(review): no code visible in this file tests SATURATE - confirm it is
; still needed.
%define SATURATE

BITS 64

;-----------------------------------------------------------------------------
; cglobal <name>
;
; Exports <name> as a global symbol.
;   PREFIX     defined : the exported symbol is _<name>, and <name> is
;                        redefined so later uses expand to the prefixed form.
;   MARK_FUNCS defined : the symbol is additionally tagged ":function" with a
;                        size of <name>.endfunc - <name>, so every exported
;                        routine must end with a local ".endfunc" label.
;-----------------------------------------------------------------------------
%macro cglobal 1
  %ifdef MARK_FUNCS
    %ifdef PREFIX
      global _%1:function %1.endfunc-%1
      %define %1 _%1:function %1.endfunc-%1
    %else
      global %1:function %1.endfunc-%1
    %endif
  %else
    %ifdef PREFIX
      global _%1
      %define %1 _%1
    %else
      global %1
    %endif
  %endif
%endmacro
50 :
;=============================================================================
; Read only Local data
;=============================================================================

; COFF output gets a plain .rodata section; every other format requests
; 16-byte section alignment.
%ifndef FORMAT_COFF
SECTION .rodata align=16
%else
SECTION .rodata
%endif

; Eight words of 1. quant_h263_inter uses it as the pmaddwd multiplier that
; folds the per-lane word sums into dwords.
ALIGN 16
plus_one:
        dw      1, 1, 1, 1, 1, 1, 1, 1
64 :
;-----------------------------------------------------------------------------
;
; subtract by Q/2 table
;
; One 8-byte entry (4 identical words) per quantizer Q = 1..31; the entry for
; Q lives at mmx_sub + 8*(Q-1).
;
;-----------------------------------------------------------------------------

ALIGN 16
mmx_sub:
%assign q 1
%rep 31
        times 4 dw (q >> 1)             ; Q/2, rounded down
%assign q q+1
%endrep
78 :
;-----------------------------------------------------------------------------
;
; divide by 2Q table
;
; use a shift of 16 to take full advantage of _pmulhw_
; for q=1, _pmulhw_ will overflow so it is treated separately
; (3dnow2 provides _pmulhuw_ which wont cause overflow)
;
; One 8-byte entry (4 identical words) per quantizer Q = 1..31; the entry for
; Q lives at mmx_div + 8*(Q-1).
;
;-----------------------------------------------------------------------------

ALIGN 16
mmx_div:
%assign q 1
%rep 31
        times 4 dw (65536 / (2*q)) + 1  ; 2^16 / 2Q, plus one
%assign q q+1
%endrep
96 :
;-----------------------------------------------------------------------------
;
; add by (odd(Q) ? Q : Q - 1) table
;
; One 8-byte entry (4 identical words) per quantizer Q = 1..31; the entry for
; Q lives at mmx_add + 8*(Q-1).
;
;-----------------------------------------------------------------------------

ALIGN 16
mmx_add:
%assign q 1
%rep 31
        ; (Q-1)|1 yields Q when Q is odd and Q-1 when Q is even.
        times 4 dw ((q - 1) | 1)
%assign q q+1
%endrep
114 :
;-----------------------------------------------------------------------------
;
; multiply by 2Q table
;
; One 8-byte entry (4 identical words) per quantizer Q = 1..31; the entry for
; Q lives at mmx_mul + 8*(Q-1).
;
;-----------------------------------------------------------------------------

ALIGN 16
mmx_mul:
%assign q 1
%rep 31
        times 4 dw (q + q)              ; 2Q
%assign q q+1
%endrep
128 :
;-----------------------------------------------------------------------------
;
; saturation limits
;
; mmx_2047 is the upper clamp used with pminsw in the dequant loops.
; The two "minus" constants are the biases for the saturating add/sub pairs
; that clamp the intra DC value to [-2048,2047].
; NOTE(review): sse2_2047 is not referenced by any code visible in this file.
;
;-----------------------------------------------------------------------------

ALIGN 16
sse2_2047:
        dw      2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047

ALIGN 16
mmx_2047:
        dw      2047, 2047, 2047, 2047

ALIGN 8
mmx_32768_minus_2048:
        times 4 dw (0x8000 - 0x0800)

mmx_32767_minus_2047:
        times 4 dw (0x7FFF - 0x07FF)
149 :
150 :
;=============================================================================
; Code
;=============================================================================

SECTION .text align=16

; Exported entry points (see the cglobal macro for the naming rules).
; Quantizers:
cglobal quant_h263_inter_x86_64
cglobal quant_h263_intra_x86_64
; Dequantizers:
cglobal dequant_h263_inter_x86_64
cglobal dequant_h263_intra_x86_64
161 :
;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_intra_x86_64(int16_t * coeff,
;                                  const int16_t const * data,
;                                  const uint32_t quant,
;                                  const uint32_t dcscalar,
;                                  const uint16_t *mpeg_matrices);
; Port of the 32bit mmx cousin
;
; In:    rdi = coeff (output, 64 words)
;        rsi = data  (input, 64 words)
;        edx = quant; indexes the 31-entry mmx_div table, not range-checked
;        ecx = dcscalar; used as the divisor for the DC term
;        r8  = mpeg_matrices (never read)
; Out:   eax = 0
; Clobb: rax, rcx, rdx, r8, r9, mm0, mm1, mm3, mm4, mm7, flags
;
; Only the low 32 bits of rdx/rcx are consumed: quant and dcscalar are
; 32-bit arguments and the upper register halves are not guaranteed to be
; zero on entry.
; No emms is executed here; MMX state is left live for the caller.
;-----------------------------------------------------------------------------

ALIGN 16
quant_h263_intra_x86_64:
        mov     eax, edx                ; rax = quant, zero-extended
        mov     r8d, ecx                ; r8 = dcscalar, zero-extended (kept for the DC step)

        xor     ecx, ecx                ; rcx = qword index into the block
        cmp     eax, 1
        je      .q1loop                 ; mmx_div[1] overflows pmulhw: use a shift instead

        lea     r9, [mmx_div wrt rip]
        movq    mm7, [r9 + rax * 8 - 8] ; mm7 = 4 x (65536/(2*quant) + 1)

ALIGN 16
.loop:
        movq    mm0, [rsi + 8*rcx]      ; mm0 = 4 input words
        movq    mm3, [rsi + 8*rcx + 8]  ; mm3 = next 4 input words
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; mm1 = sign mask (0xFFFF where word < 0)
        pcmpgtw mm4, mm3
        pxor    mm0, mm1                ; (x ^ mask) - mask = |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        pmulhw  mm0, mm7                ; mm0 = (|x| * divider) >> 16
        pmulhw  mm3, mm7
        pxor    mm0, mm1                ; put the sign back the same way
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        movq    [rdi + 8*rcx], mm0
        movq    [rdi + 8*rcx + 8], mm3

        add     rcx, 2
        cmp     rcx, 16                 ; 16 qwords = 64 coefficients
        jnz     .loop

.done:
        ; coeff[0] = (data[0] +/- dcscalar/2) / (int32_t)dcscalar
        ; The loops above also wrote coeff[0]; it is overwritten here.
        mov     ecx, r8d                ; ecx = dcscalar
        mov     edx, ecx
        movsx   eax, word [rsi]         ; eax = data[0], sign-extended
        shr     edx, 1                  ; edx = dcscalar / 2
        test    eax, eax
        jg      .gtzero

        sub     eax, edx                ; data[0] <= 0: bias downwards
        jmp     short .mul

.gtzero:
        add     eax, edx                ; data[0] > 0: bias upwards
.mul:
        cdq                             ; sign-extend eax into edx:eax
        idiv    ecx                     ; eax = edx:eax / dcscalar (truncating)
        mov     [rdi], ax               ; coeff[0] = ax

        xor     eax, eax                ; return(0);

        ret

ALIGN 16
.q1loop:
        ; quant == 1: dividing by 2Q is a logical shift right by one.
        movq    mm0, [rsi + 8*rcx]      ; mm0 = 4 input words
        movq    mm3, [rsi + 8*rcx + 8]  ; mm3 = next 4 input words
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; mm1 = sign mask
        pcmpgtw mm4, mm3
        pxor    mm0, mm1                ; (x ^ mask) - mask = |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        psrlw   mm0, 1                  ; |x| >> 1
        psrlw   mm3, 1
        pxor    mm0, mm1                ; put the sign back
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        movq    [rdi + 8*rcx], mm0
        movq    [rdi + 8*rcx + 8], mm3

        add     rcx, 2
        cmp     rcx, 16
        jnz     .q1loop

        jmp     .done
.endfunc:
262 :
263 :
;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_inter_x86_64(int16_t * coeff,
;                                  const int16_t const * data,
;                                  const uint32_t quant,
;                                  const uint16_t *mpeg_matrices);
; Port of the 32bit mmx cousin
;
; In:    rdi = coeff (output, 64 words)
;        rsi = data  (input, 64 words)
;        edx = quant; indexes the 31-entry mmx_sub / mmx_div tables, not
;              range-checked
;        rcx = mpeg_matrices (never read)
; Out:   eax = sum of the quantized magnitudes written to coeff
; Clobb: rax, rcx, r9, mm0, mm1, mm3-mm7, flags
;
; Only the low 32 bits of rdx are consumed: quant is a 32-bit argument and
; the upper register half is not guaranteed to be zero on entry.
; No emms is executed here; MMX state is left live for the caller.
;-----------------------------------------------------------------------------

ALIGN 16
quant_h263_inter_x86_64:
        mov     eax, edx                ; rax = quant, zero-extended

        xor     ecx, ecx                ; rcx = qword index into the block

        pxor    mm5, mm5                ; mm5 = 4 running word sums
        lea     r9, [mmx_sub wrt rip]
        movq    mm6, [r9 + rax * 8 - 8] ; mm6 = 4 x (quant/2)

        cmp     eax, 1
        je      .q1loop                 ; mmx_div[1] overflows pmulhw: use a shift instead

        lea     r9, [mmx_div wrt rip]
        movq    mm7, [r9 + rax * 8 - 8] ; mm7 = 4 x (65536/(2*quant) + 1)

ALIGN 8
.loop:
        movq    mm0, [rsi + 8*rcx]      ; mm0 = 4 input words
        movq    mm3, [rsi + 8*rcx + 8]  ; mm3 = next 4 input words
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; mm1 = sign mask (0xFFFF where word < 0)
        pcmpgtw mm4, mm3
        pxor    mm0, mm1                ; (x ^ mask) - mask = |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        psubusw mm0, mm6                ; |x| - quant/2, clamped at 0
        psubusw mm3, mm6
        pmulhw  mm0, mm7                ; (value * divider) >> 16
        pmulhw  mm3, mm7
        paddw   mm5, mm0                ; sum += magnitude
        pxor    mm0, mm1                ; put the sign back
        paddw   mm5, mm3
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        movq    [rdi + 8*rcx], mm0
        movq    [rdi + 8*rcx + 8], mm3

        add     rcx, 2
        cmp     rcx, 16                 ; 16 qwords = 64 coefficients
        jnz     .loop

.done:
        ; Horizontal add of the four word sums in mm5.
        pmaddwd mm5, [plus_one wrt rip] ; word pairs -> 2 dword sums
        movq    mm0, mm5
        psrlq   mm5, 32
        paddd   mm0, mm5                ; low dword = total

        movd    eax, mm0                ; return sum (low dword only)

        ret

ALIGN 8
.q1loop:
        ; quant == 1: dividing by 2Q is a logical shift right by one.
        movq    mm0, [rsi + 8*rcx]      ; mm0 = 4 input words
        movq    mm3, [rsi + 8*rcx + 8]  ; mm3 = next 4 input words
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; mm1 = sign mask
        pcmpgtw mm4, mm3
        pxor    mm0, mm1                ; (x ^ mask) - mask = |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        psubusw mm0, mm6                ; |x| - quant/2, clamped at 0
        psubusw mm3, mm6
        psrlw   mm0, 1                  ; value >> 1
        psrlw   mm3, 1
        paddw   mm5, mm0                ; sum += magnitude
        pxor    mm0, mm1                ; put the sign back
        paddw   mm5, mm3
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        movq    [rdi + 8*rcx], mm0
        movq    [rdi + 8*rcx + 8], mm3

        add     rcx, 2
        cmp     rcx, 16
        jnz     .q1loop

        jmp     .done
.endfunc:
361 :
362 :
;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_intra_x86_64(int16_t *data,
;                                    const int16_t const *coeff,
;                                    const uint32_t quant,
;                                    const uint32_t dcscalar,
;                                    const uint16_t *mpeg_matrices);
; port of the 32bit xmm cousin
;
; In:    rdi = data  (output, 64 words)
;        rsi = coeff (input, 64 words)
;        edx = quant; indexes the 31-entry mmx_add / mmx_mul tables, not
;              range-checked
;        ecx = dcscalar (only its low 16 bits affect the stored DC word)
;        r8  = mpeg_matrices (never read)
; Out:   eax = 0
; Clobb: rax, r9, mm0-mm7, flags
;
; Only the low 32 bits of rdx/rcx are consumed: quant and dcscalar are
; 32-bit arguments and the upper register halves are not guaranteed to be
; zero on entry.
; No stack memory is touched, and no emms is executed here.
;-----------------------------------------------------------------------------

; this is the same as dequant_inter_mmx, except that we're
; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)

ALIGN 16
dequant_h263_intra_x86_64:

        mov     eax, edx                ; rax = quant, zero-extended

        lea     r9, [mmx_add wrt rip]
        movq    mm6, [r9 + rax*8 - 8]   ; mm6 = 4 x (quant or quant-1)
        lea     r9, [mmx_mul wrt rip]
        movq    mm7, [r9 + rax*8 - 8]   ; mm7 = 4 x 2*quant
        mov     rax, -16                ; qword index, counts -16 .. 0

ALIGN 16
.loop:
        movq    mm0, [rsi + 8*rax + 8*16]       ; c  = coeff[i]
        movq    mm3, [rsi + 8*rax + 8*16 + 8]   ; c' = coeff[i+1]
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; sign(c)
        pcmpgtw mm4, mm3                ; sign(c')
        pxor    mm2, mm2
        pxor    mm5, mm5
        pcmpeqw mm2, mm0                ; c is zero
        pcmpeqw mm5, mm3                ; c' is zero
        pandn   mm2, mm6                ; offset = isZero ? 0 : quant_add
        pandn   mm5, mm6
        pxor    mm0, mm1                ; negate if negative
        pxor    mm3, mm4                ; negate if negative
        psubw   mm0, mm1
        psubw   mm3, mm4
        pmullw  mm0, mm7                ; *= 2Q
        pmullw  mm3, mm7                ; *= 2Q
        paddw   mm0, mm2                ; + offset
        paddw   mm3, mm5                ; + offset
        paddw   mm0, mm1                ; start restoring sign (-1 for negatives)
        paddw   mm3, mm4

        ; saturates to +2047 (so negatives end up no lower than -2048)
        movq    mm2, [mmx_2047 wrt rip]
        pminsw  mm0, mm2
        add     rax, 2                  ; sets ZF for the jnz below; MMX ops leave flags alone
        pminsw  mm3, mm2

        pxor    mm0, mm1                ; finish restoring sign
        pxor    mm3, mm4
        ; rax was already advanced by 2, hence the "- 2*8"
        movq    [rdi + 8*rax + 8*16 - 2*8], mm0
        movq    [rdi + 8*rax + 8*16 + 8 - 2*8], mm3
        jnz     near .loop

        ; deal with DC: data[0] = clamp(coeff[0] * dcscalar, -2048, 2047)
        movd    mm0, [rsi]
        movd    mm2, ecx                ; dcscalar; only word 0 of the product is stored
        pmullw  mm0, mm2
        movq    mm2, [mmx_32767_minus_2047 wrt rip]
        paddsw  mm0, mm2                ; saturating add/sub pair clamps the top at +2047
        psubsw  mm0, mm2
        movq    mm2, [mmx_32768_minus_2048 wrt rip]
        psubsw  mm0, mm2                ; saturating sub/add pair clamps the bottom at -2048
        paddsw  mm0, mm2
        movd    eax, mm0
        mov     [rdi], ax

        xor     eax, eax                ; return(0);
        ret
.endfunc:
442 :
443 :
;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_inter_x86_64(int16_t * data,
;                                    const int16_t * const coeff,
;                                    const uint32_t quant,
;                                    const uint16_t *mpeg_matrices);
; Port of the 32bit xmm cousin
;
; In:    rdi = data  (output, 64 words)
;        rsi = coeff (input, 64 words)
;        edx = quant; indexes the 31-entry mmx_add / mmx_mul tables, not
;              range-checked
;        rcx = mpeg_matrices (never read)
; Out:   eax = 0
; Clobb: rax, r9, mm0-mm7, flags
;
; Only the low 32 bits of rdx are consumed: quant is a 32-bit argument and
; the upper register half is not guaranteed to be zero on entry.
; No emms is executed here; MMX state is left live for the caller.
;-----------------------------------------------------------------------------

; this is the same as dequant_inter_mmx,
; except that we're saturating using 'pminsw' (saves 2 cycles/loop)

ALIGN 16
dequant_h263_inter_x86_64:

        mov     eax, edx                ; rax = quant, zero-extended

        lea     r9, [mmx_add wrt rip]
        movq    mm6, [r9 + rax*8 - 8]   ; mm6 = 4 x (quant or quant-1)
        lea     r9, [mmx_mul wrt rip]
        movq    mm7, [r9 + rax*8 - 8]   ; mm7 = 4 x 2*quant
        mov     rax, -16                ; qword index, counts -16 .. 0

ALIGN 16
.loop:
        movq    mm0, [rsi + 8*rax + 8*16]       ; c  = coeff[i]
        movq    mm3, [rsi + 8*rax + 8*16 + 8]   ; c' = coeff[i+1]
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; sign(c)
        pcmpgtw mm4, mm3                ; sign(c')
        pxor    mm2, mm2
        pxor    mm5, mm5
        pcmpeqw mm2, mm0                ; c is zero
        pcmpeqw mm5, mm3                ; c' is zero
        pandn   mm2, mm6                ; offset = isZero ? 0 : quant_add
        pandn   mm5, mm6
        pxor    mm0, mm1                ; negate if negative
        pxor    mm3, mm4                ; negate if negative
        psubw   mm0, mm1
        psubw   mm3, mm4
        pmullw  mm0, mm7                ; *= 2Q
        pmullw  mm3, mm7                ; *= 2Q
        paddw   mm0, mm2                ; + offset
        paddw   mm3, mm5                ; + offset
        paddw   mm0, mm1                ; start restoring sign (-1 for negatives)
        paddw   mm3, mm4                ; start restoring sign

        ; saturates to +2047 (so negatives end up no lower than -2048)
        movq    mm2, [mmx_2047 wrt rip]
        pminsw  mm0, mm2
        add     rax, 2                  ; sets ZF for the jnz below; MMX ops leave flags alone
        pminsw  mm3, mm2

        pxor    mm0, mm1                ; finish restoring sign
        pxor    mm3, mm4                ; finish restoring sign
        ; rax was already advanced by 2, hence the "- 2*8"
        movq    [rdi + 8*rax + 8*16 - 2*8], mm0
        movq    [rdi + 8*rax + 8*16 + 8 - 2*8], mm3
        jnz     near .loop

        xor     eax, eax                ; return(0);
        ret
.endfunc:

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4