; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: reduced_mmx.asm,v 1.9 2008-11-26 01:04:34 Isibaar Exp $
; *
; *************************************************************************/
|
|
%include "nasm.inc"
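
; nasm.inc supplies the portability macros used throughout this file. As far
; as their use here shows (a summary inferred from usage, not quoted from
; nasm.inc): cglobal exports a symbol under the platform's naming convention,
; prmN names the N-th function argument of the current calling convention,
; TMP0/TMP1 are caller-saved scratch pointer registers, _EAX/_EBX/_ESI/_EDI/
; _EBP/_ESP are the natural-width (32- or 64-bit) forms of the legacy
; registers, and ENDFUNC marks the end of a function.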
|
;===========================================================================

DATA

align SECTION_ALIGN
Up31 dw 3, 1, 3, 1
Up13 dw 1, 3, 1, 3
Up93 dw 9, 3, 9, 3
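
; These read as the tap pairs of the 1-3-3-1 reduced-resolution upsampling
; filter: a pair of outputs is blended from neighbours a and b as
;   (3*a + 1*b + 2) >> 2   and   (1*a + 3*b + 2) >> 2
; and Up93's 9 = 3*3, 3 = 3*1 would be the combined horizontal*vertical
; corner weights (an inference from the values, not a statement in this file).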

;===========================================================================

SECTION .rotext align=SECTION_ALIGN

cglobal xvid_Copy_Upsampled_8x8_16To8_mmx
cglobal xvid_Add_Upsampled_8x8_16To8_mmx

; MMX-way of reordering columns...

%macro COL03 3   ;%1/%2: regs, %3: row -output: mm4/mm5
  movq %1, [TMP1+%3*16+0*2]   ; %1 = 0|1|2|3
  movq %2, [TMP1+%3*16+1*2]   ; %2 = 1|2|3|4
  movq mm5, %1                ; mm5 = 0|1|2|3
  movq mm4, %1                ; mm4 = 0|1|2|3
  punpckhwd mm5, %2           ; mm5 = 2|3|3|4
  ; [...]
%endmacro

%macro COL47 3   ;%1-%2: regs, %3: row -output: mm4/mm5
  movq mm5, [TMP1+%3*16+4*2]  ; mm5 = 4|5|6|7
  movq %1, [TMP1+%3*16+3*2]   ; %1 = 3|4|5|6
  movq %2, mm5                ; %2 = 4|5|6|7
  movq mm4, mm5               ; mm4 = 4|5|6|7
  punpckhwd %2, %2            ; %2 = 6|6|7|7

; [... rest of COL47, MUL_PACK, and the head of STORE_1 ...]

  psraw %1, 2
  psraw %2, 2
  packuswb %1, %2
  movq [TMP0], %1
%endmacro
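
; For reference, a C model of the horizontal interpolation these macros
; appear to feed (a sketch assuming the standard 3:1/1:3 two-tap blend with
; edge replication; upsample_row and clamp255 are illustrative names, not
; part of xvidcore -- the vertical pass between rows uses the same weights):
;
;   #include <stdint.h>
;
;   static uint8_t clamp255(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
;
;   static void upsample_row(uint8_t *dst, const int16_t *src)  /* 8 -> 16 */
;   {
;       for (int i = 0; i < 8; i++) {
;           int l = src[i == 0 ? 0 : i - 1];   /* left neighbour, replicated  */
;           int r = src[i == 7 ? 7 : i + 1];   /* right neighbour, replicated */
;           dst[2*i]     = clamp255((3*src[i] + l + 2) >> 2);
;           dst[2*i + 1] = clamp255((3*src[i] + r + 2) >> 2);
;       }
;   }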

%macro STORE_2 2   ; pack and store (%1,%2) + (mm4,mm5)
  ; [...]
  psraw mm5, 4
  packuswb %1, %2
  packuswb mm4, mm5
  movq [TMP0], %1
  movq [TMP0+_EAX], mm4
  lea TMP0, [TMP0+2*_EAX]
%endmacro

;//////////////////////////////////////////////////////////////////////

align SECTION_ALIGN
xvid_Copy_Upsampled_8x8_16To8_mmx:   ; 344c

  mov TMP0, prm1   ; Dst
  mov TMP1, prm2   ; Src
  mov _EAX, prm3   ; BpS

  movq mm6, [Up13]
  movq mm7, [Up31]

  COL03 mm0, mm1, 0
  MUL_PACK mm0, mm1, mm6, mm7
  movq mm4, mm0
  movq mm5, mm1
  STORE_1 mm4, mm5
  add TMP0, _EAX

  COL03 mm2, mm3, 1
  MUL_PACK mm2, mm3, mm6, mm7
  ; [...]
  STORE_1 mm2, mm3

  mov TMP0, prm1
  add TMP0, 8

  COL47 mm0, mm1, 0
  MUL_PACK mm0, mm1, mm6, mm7
  movq mm4, mm0
  movq mm5, mm1
  STORE_1 mm4, mm5
  add TMP0, _EAX

  COL47 mm2, mm3, 1
  MUL_PACK mm2, mm3, mm6, mm7
  ; [...]
  STORE_1 mm2, mm3

  ret
ENDFUNC
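
; Apparent C-level signature, going by the prm comments above (the
; authoritative prototypes live in xvidcore's headers, not in this file):
;   void xvid_Copy_Upsampled_8x8_16To8_mmx(uint8_t *Dst,
;                                          const int16_t *Src, int BpS);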

;===========================================================================
;
; [... header comment and the head of STORE_ADD_1 ...]

  psubsw %1, mm6
  psubsw %2, mm7

  ; mix with destination [TMP0]
  movq mm6, [TMP0]
  movq mm7, [TMP0]
  punpcklbw mm6, [Cst0]
  punpckhbw mm7, [Cst0]
  paddsw %1, mm6
  paddsw %2, mm7
  packuswb %1, %2
  movq [TMP0], %1
%endmacro

%macro STORE_ADD_2 2
  ; [...]
  psubsw mm5, mm7

  ; mix with destination
  movq mm6, [TMP0]
  movq mm7, [TMP0]
  punpcklbw mm6, [Cst0]
  punpckhbw mm7, [Cst0]
  paddsw %1, mm6
  paddsw %2, mm7

  movq mm6, [TMP0+_EAX]
  movq mm7, [TMP0+_EAX]

  punpcklbw mm6, [Cst0]
  punpckhbw mm7, [Cst0]
  ; [...]
  packuswb %1, %2
  packuswb mm4, mm5

  movq [TMP0], %1
  movq [TMP0+_EAX], mm4

  lea TMP0, [TMP0+2*_EAX]
%endmacro
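
; Both STORE_ADD variants widen the current destination bytes to 16-bit by
; unpacking against [Cst0] (to all appearances a zero constant), add the
; upsampled values with signed saturation (paddsw), and repack with unsigned
; saturation (packuswb).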

;//////////////////////////////////////////////////////////////////////

align SECTION_ALIGN
xvid_Add_Upsampled_8x8_16To8_mmx:   ; 579c

  mov TMP0, prm1   ; Dst
  mov TMP1, prm2   ; Src
  mov _EAX, prm3   ; BpS

  COL03 mm0, mm1, 0
  MUL_PACK mm0, mm1, [Up13], [Up31]
  movq mm4, mm0
  movq mm5, mm1
  STORE_ADD_1 mm4, mm5
  add TMP0, _EAX

  COL03 mm2, mm3, 1
  MUL_PACK mm2, mm3, [Up13], [Up31]
  ; [...]
  STORE_ADD_1 mm2, mm3

  mov TMP0, prm1
  add TMP0, 8

  COL47 mm0, mm1, 0
  MUL_PACK mm0, mm1, [Up13], [Up31]
  movq mm4, mm0
  movq mm5, mm1
  STORE_ADD_1 mm4, mm5
  add TMP0, _EAX

  COL47 mm2, mm3, 1
  MUL_PACK mm2, mm3, [Up13], [Up31]
  ; [...]
  STORE_ADD_1 mm2, mm3

  ret
ENDFUNC

;===========================================================================
;
; [...]

; xmm version can take (little) advantage of 'pshufw'

%macro COL03_SSE 3   ;%1/%2: regs, %3: row -trashes mm4/mm5
  movq %2, [TMP1+%3*16+0*2]                        ; <- 0|1|2|3
  pshufw %1, %2, (0+0*4+0*16+1*64)                 ; %1 = 0|0|0|1
  pshufw mm4, %2, (0+1*4+1*16+2*64)                ; mm4= 0|1|1|2
  pshufw %2, %2, (1+2*4+2*16+3*64)                 ; %2 = 1|2|2|3
  pshufw mm5, [TMP1+%3*16+2*2], (0+1*4+1*16+2*64)  ; mm5 = 2|3|3|4
%endmacro

%macro COL47_SSE 3   ;%1-%2: regs, %3: row -trashes mm4/mm5
  pshufw %1, [TMP1+%3*16+2*2], (1+2*4+2*16+3*64)   ; 3|4|4|5
  movq mm5, [TMP1+%3*16+2*4]                       ; <- 4|5|6|7
  pshufw mm4, mm5, (0+1*4+1*16+2*64)               ; 4|5|5|6
  pshufw %2, mm5, (1+2*4+2*16+3*64)                ; 5|6|6|7
  pshufw mm5, mm5, (2+3*4+3*16+3*64)               ; 6|7|7|7
%endmacro
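
; pshufw note: the imm8 packs four 2-bit source-word selectors, one per
; destination word -- bits 1:0 pick dest word 0, bits 3:2 word 1, bits 5:4
; word 2, bits 7:6 word 3. The (a + b*4 + c*16 + d*64) spellings above make
; the four selectors explicit; e.g. (0+1*4+1*16+2*64) = 10010100b selects
; source words 0,1,1,2 -- the "0|1|1|2" layout noted in the comment.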

;//////////////////////////////////////////////////////////////////////

align SECTION_ALIGN
xvid_Copy_Upsampled_8x8_16To8_xmm:   ; 315c

  mov TMP0, prm1   ; Dst
  mov TMP1, prm2   ; Src
  mov _EAX, prm3   ; BpS

  movq mm6, [Up13]
  movq mm7, [Up31]

  COL03_SSE mm0, mm1, 0
  MUL_PACK mm0, mm1, mm6, mm7
  movq mm4, mm0
  movq mm5, mm1
  STORE_1 mm4, mm5
  add TMP0, _EAX

  COL03_SSE mm2, mm3, 1
  MUL_PACK mm2, mm3, mm6, mm7
  ; [...]
  STORE_1 mm2, mm3

  mov TMP0, prm1
  add TMP0, 8

  COL47_SSE mm0, mm1, 0
  MUL_PACK mm0, mm1, mm6, mm7
  movq mm4, mm0
  movq mm5, mm1
  STORE_1 mm4, mm5
  add TMP0, _EAX

  COL47_SSE mm2, mm3, 1
  MUL_PACK mm2, mm3, mm6, mm7
  ; [...]
  STORE_1 mm2, mm3

  ret
ENDFUNC

;===========================================================================
;
; [...]
;
;===========================================================================

align SECTION_ALIGN
xvid_Add_Upsampled_8x8_16To8_xmm:   ; 549c

  mov TMP0, prm1   ; Dst
  mov TMP1, prm2   ; Src
  mov _EAX, prm3   ; BpS

  COL03_SSE mm0, mm1, 0
  MUL_PACK mm0, mm1, [Up13], [Up31]
  movq mm4, mm0
  movq mm5, mm1
  STORE_ADD_1 mm4, mm5
  add TMP0, _EAX

  COL03_SSE mm2, mm3, 1
  MUL_PACK mm2, mm3, [Up13], [Up31]
  ; [...]
  STORE_ADD_1 mm2, mm3

  mov TMP0, prm1
  add TMP0, 8

  COL47_SSE mm0, mm1, 0
  MUL_PACK mm0, mm1, [Up13], [Up31]
  movq mm4, mm0
  movq mm5, mm1
  STORE_ADD_1 mm4, mm5
  add TMP0, _EAX

  COL47_SSE mm2, mm3, 1
  MUL_PACK mm2, mm3, [Up13], [Up31]
  ; [...]
  STORE_ADD_1 mm2, mm3

  ret
ENDFUNC


;===========================================================================
; [...]
;// We use the trick: tmp = (x+y+2) -> [x = (tmp+2x)>>2, y = (tmp+2y)>>2]
;//////////////////////////////////////////////////////////////////////
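
; In C, for two samples x and y blended 3:1 and 1:3 (this is just the trick
; above spelled out):
;
;   int tmp = x + y + 2;
;   x = (tmp + 2*x) >> 2;   /* == (3*x + y + 2) >> 2 */
;   y = (tmp + 2*y) >> 2;   /* == (x + 3*y + 2) >> 2 */
;
; one shared add supplies the rounding term for both outputs.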

align SECTION_ALIGN
xvid_HFilter_31_mmx:
  mov TMP0, prm1   ; Src1
  mov TMP1, prm2   ; Src2
  mov _EAX, prm3   ; Nb_Blks
  lea _EAX, [_EAX*2]

  movq mm5, [Cst2]
  pxor mm7, mm7

  lea TMP0, [TMP0+_EAX*4]
  lea TMP1, [TMP1+_EAX*4]

  neg _EAX

.Loop:   ;12c
  movd mm0, [TMP0+_EAX*4]
  movd mm1, [TMP1+_EAX*4]
  movq mm2, mm5
  punpcklbw mm0, mm7
  punpcklbw mm1, mm7
  ; [...]
  psraw mm1, 2
  packuswb mm0, mm7
  packuswb mm1, mm7
  movd [TMP0+_EAX*4], mm0
  movd [TMP1+_EAX*4], mm1
  add _EAX, 1
  jl .Loop

  ret
ENDFUNC

; mmx is of no use here. Better use plain ASM. Moreover,
; this is for the fun of ASM coding, coz' every modern compiler can
; end up with a code that looks very much like this one...

align SECTION_ALIGN
xvid_VFilter_31_x86:
  mov TMP0, prm1   ; Src1
  mov TMP1, prm2   ; Src2
  mov _EAX, prm4   ; Nb_Blks
  lea _EAX, [_EAX*8]

  push _ESI
  push _EDI
  push _EBX
  push _EBP

%ifdef ARCH_IS_X86_64
  mov _EBP, prm3            ; BpS (register argument on x86_64)
%else
  mov _EBP, [_ESP+12 +16]   ; BpS (stack argument, offset by the four pushes)
%endif

.Loop:   ;7c
  movzx _ESI, byte [TMP0]
  movzx _EDI, byte [TMP1]

  lea _EBX, [_ESI+_EDI+2]
  lea _ESI, [_EBX+2*_ESI]
  lea _EDI, [_EBX+2*_EDI]

  shr _ESI, 2
  shr _EDI, 2
  mov _EBX, _ESI        ; _ESI/_EDI have no 8-bit alias on ia32, so the
  mov [TMP0], bl        ; filtered bytes are bounced through BL
  mov _EBX, _EDI
  mov [TMP1], bl
  lea TMP0, [TMP0+_EBP]
  lea TMP1, [TMP1+_EBP]
  dec _EAX
  jg .Loop

  pop _EBP
  pop _EBX
  pop _EDI
  pop _ESI
  ret
ENDFUNC
|
|
790 |
; this one's just a little faster than gcc's code. Very little. |
; this one's just a little faster than gcc's code. Very little. |
791 |
|
|
792 |
align 16 |
align SECTION_ALIGN |
793 |
xvid_HFilter_31_x86: |
xvid_HFilter_31_x86: |
794 |
push esi |
|
795 |
push edi |
mov TMP0, prm1 ; Src1 |
796 |
push ebx |
mov TMP1, prm2 ; Src2 |
797 |
mov esi, [esp+4 +12] ; Src1 |
mov _EAX, prm3 ; Nb_Blks |
798 |
mov edi, [esp+8 +12] ; Src2 |
|
799 |
mov eax, [esp+12 +12] ; Nb_Blks |
lea _EAX,[_EAX*8] |
800 |
|
lea TMP0, [TMP0+_EAX] |
801 |
lea eax,[eax*8] |
lea TMP1, [TMP0+_EAX] |
802 |
lea esi, [esi+eax] |
neg _EAX |
803 |
lea edi, [esi+eax] |
|
804 |
neg eax |
push _ESI |
805 |
|
push _EDI |
806 |
|
push _EBX |
807 |
|
|
808 |
.Loop: ; 6c |
.Loop: ; 6c |
809 |
movzx ecx, byte [esi+eax] |
movzx _ESI, byte [TMP0+_EAX] |
810 |
movzx edx, byte [edi+eax] |
movzx _EDI, byte [TMP1+_EAX] |
811 |
|
|
812 |
lea ebx, [ecx+edx+2] |
lea _EBX, [_ESI+_EDI+2] |
813 |
lea ecx,[ebx+2*ecx] |
lea _ESI,[_EBX+2*_ESI] |
814 |
lea edx,[ebx+2*edx] |
lea _EDI,[_EBX+2*_EDI] |
815 |
shr ecx,2 |
shr _ESI,2 |
816 |
shr edx,2 |
shr _EDI,2 |
817 |
mov [esi+eax], cl |
mov [TMP0+_EAX], cl |
818 |
mov [edi+eax], dl |
mov [TMP1+_EAX], dl |
819 |
inc eax |
inc _EAX |
820 |
|
|
821 |
jl .Loop |
jl .Loop |
822 |
|
|
823 |
pop ebx |
pop _EBX |
824 |
pop edi |
pop _EDI |
825 |
pop esi |
pop _ESI |
826 |
ret |
ret |
827 |
|
ENDFUNC |

;//////////////////////////////////////////////////////////////////////
;// 16b downsampling 16x16 -> 8x8

; [... HFILTER_1331 / VFILTER_1331 definitions ...]

;===========================================================================

%macro COPY_TWO_LINES_1331 1   ; %1: dst
  HFILTER_1331 TMP1, mm5
  HFILTER_1331 TMP1+_EAX, mm6
  lea TMP1, [TMP1+2*_EAX]
  VFILTER_1331 mm3, mm4, mm5, mm6
  movq [%1], mm3

  HFILTER_1331 TMP1, mm3
  HFILTER_1331 TMP1+_EAX, mm4
  lea TMP1, [TMP1+2*_EAX]
  VFILTER_1331 mm5, mm6, mm3, mm4
  movq [%1+16], mm5
%endmacro
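
; Assuming the elided HFILTER_1331/VFILTER_1331 apply the separable 1-3-3-1
; kernel their names suggest, each 8x8 output sample is a weighted 4x4 window
; of the 16x16 (18x18 with borders) input:
;
;   1  3  3  1
;   3  9  9  3      weights = (1,3,3,1) x (1,3,3,1),
;   3  9  9  3      sum = 64
;   1  3  3  1
;
; kept as a 16-bit result (the Dst stride is 16 bytes = 8 words per row).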

align SECTION_ALIGN
xvid_Filter_18x18_To_8x8_mmx:   ; 283c (~4.4c per output pixel)

  mov TMP0, prm1   ; Dst
  mov TMP1, prm2   ; Src
  mov _EAX, prm3   ; BpS

  movq mm7, [Cst3]
  sub TMP1, _EAX

  ; mm3/mm4/mm5/mm6 is used as a 4-samples delay line.

  ; process columns 0-3
  HFILTER_1331 TMP1, mm3        ; pre-load mm3/mm4
  HFILTER_1331 TMP1+_EAX, mm4
  lea TMP1, [TMP1+2*_EAX]

  COPY_TWO_LINES_1331 TMP0 + 0*16
  COPY_TWO_LINES_1331 TMP0 + 2*16
  COPY_TWO_LINES_1331 TMP0 + 4*16
  COPY_TWO_LINES_1331 TMP0 + 6*16

  ; process columns 4-7
  mov TMP1, prm2
  sub TMP1, _EAX
  add TMP1, 8

  HFILTER_1331 TMP1, mm3        ; pre-load mm3/mm4
  HFILTER_1331 TMP1+_EAX, mm4
  lea TMP1, [TMP1+2*_EAX]

  COPY_TWO_LINES_1331 TMP0 + 0*16 +8
  COPY_TWO_LINES_1331 TMP0 + 2*16 +8
  COPY_TWO_LINES_1331 TMP0 + 4*16 +8
  COPY_TWO_LINES_1331 TMP0 + 6*16 +8

  ret
ENDFUNC

;===========================================================================
;
; [...]
;===========================================================================

%macro DIFF_TWO_LINES_1331 1   ; %1: dst
  HFILTER_1331 TMP1, mm5
  HFILTER_1331 TMP1+_EAX, mm6
  lea TMP1, [TMP1+2*_EAX]
  movq mm2, [%1]
  VFILTER_1331 mm3, mm4, mm5, mm6
  psubsw mm2, mm3
  movq [%1], mm2

  HFILTER_1331 TMP1, mm3
  HFILTER_1331 TMP1+_EAX, mm4
  lea TMP1, [TMP1+2*_EAX]
  movq mm2, [%1+16]
  VFILTER_1331 mm5, mm6, mm3, mm4
  psubsw mm2, mm5
  movq [%1+16], mm2
%endmacro
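
; Same delay-line structure as COPY_TWO_LINES_1331, but the filtered value is
; subtracted from the 16-bit destination (dst -= filter(src)) instead of
; overwriting it.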

align SECTION_ALIGN
xvid_Filter_Diff_18x18_To_8x8_mmx:   ; 302c

  mov TMP0, prm1   ; Dst
  mov TMP1, prm2   ; Src
  mov _EAX, prm3   ; BpS

  movq mm7, [Cst3]
  sub TMP1, _EAX

  ; mm3/mm4/mm5/mm6 is used as a 4-samples delay line.

  ; process columns 0-3
  HFILTER_1331 TMP1, mm3        ; pre-load mm3/mm4
  HFILTER_1331 TMP1+_EAX, mm4
  lea TMP1, [TMP1+2*_EAX]

  DIFF_TWO_LINES_1331 TMP0 + 0*16
  DIFF_TWO_LINES_1331 TMP0 + 2*16
  DIFF_TWO_LINES_1331 TMP0 + 4*16
  DIFF_TWO_LINES_1331 TMP0 + 6*16

  ; process columns 4-7
  mov TMP1, prm2
  sub TMP1, _EAX
  add TMP1, 8

  HFILTER_1331 TMP1, mm3        ; pre-load mm3/mm4
  HFILTER_1331 TMP1+_EAX, mm4
  lea TMP1, [TMP1+2*_EAX]

  DIFF_TWO_LINES_1331 TMP0 + 0*16 +8
  DIFF_TWO_LINES_1331 TMP0 + 2*16 +8
  DIFF_TWO_LINES_1331 TMP0 + 4*16 +8
  DIFF_TWO_LINES_1331 TMP0 + 6*16 +8

  ret
ENDFUNC
|
|
;//////////////////////////////////////////////////////////////////////

; pfeewwww... Never Do That On Stage Again. :)


%ifidn __OUTPUT_FORMAT__,elf
section ".note.GNU-stack" noalloc noexec nowrite progbits
%endif