1 |
;/************************************************************************** |
;/***************************************************************************** |
2 |
; * |
; * |
3 |
; * XVID MPEG-4 VIDEO CODEC |
; * XVID MPEG-4 VIDEO CODEC |
4 |
; * xmm 8x8 block-based halfpel interpolation |
; * - 3dne pipeline optimized 8x8 block-based halfpel interpolation - |
5 |
; * |
; * |
6 |
; * This program is an implementation of a part of one or more MPEG-4 |
; * Copyright(C) 2002 Jaan Kalda |
|
; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending |
|
|
; * to use this software module in hardware or software products are |
|
|
; * advised that its use may infringe existing patents or copyrights, and |
|
|
; * any such use would be at such party's own risk. The original |
|
|
; * developer of this software module and his/her company, and subsequent |
|
|
; * editors and their companies, will have no liability for use of this |
|
|
; * software or modifications or derivatives thereof. |
|
7 |
; * |
; * |
8 |
; * This program is free software; you can redistribute it and/or modify |
; * This program is free software; you can redistribute it and/or modify |
9 |
; * it under the terms of the GNU General Public License as published by |
; * it under the terms of the GNU General Public License as published by |
17 |
; * |
; * |
18 |
; * You should have received a copy of the GNU General Public License |
; * You should have received a copy of the GNU General Public License |
19 |
; * along with this program; if not, write to the Free Software |
; * along with this program; if not, write to the Free Software |
20 |
; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 |
; * |
; * |
22 |
; *************************************************************************/ |
; ****************************************************************************/ |
23 |
|
|
24 |
; these 3dne functions are compatible with iSSE, but are optimized specifically for |
; these 3dne functions are compatible with iSSE, but are optimized specifically |
25 |
; K7 pipelines |
; for K7 pipelines |
|
; |
|
|
;------------------------------------------------------------------------------ |
|
|
; 09.12.2002 Athlon optimizations contributed by Jaan Kalda |
|
|
;------------------------------------------------------------------------------ |
|
26 |
|
|
27 |
bits 32 |
BITS 32 |
28 |
|
|
29 |
;-----------------------------------------------------------------------------
; cglobal <name>
;
; Declares <name> as a global symbol.
;   PREFIX     - when defined, the symbol is exported with a leading
;                underscore and <name> is %define'd to the underscored
;                spelling, so later uses of the plain name expand to it.
;   MARK_FUNCS - when defined, the symbol is declared with the
;                ":function <size>" qualifier; the size expression is
;                <name>.endfunc - <name>, so every routine exported through
;                this macro must end with a local ".endfunc" label.
;-----------------------------------------------------------------------------
%macro cglobal 1
  %ifdef PREFIX
    %ifdef MARK_FUNCS
      global _%1:function %1.endfunc-%1
      %define %1 _%1:function %1.endfunc-%1
    %else
      global _%1
      %define %1 _%1
    %endif
  %else
    %ifdef MARK_FUNCS
      global %1:function %1.endfunc-%1
    %else
      global %1
    %endif
  %endif
%endmacro
46 |
%macro nop4 0 |
|
47 |
DB 08Dh,074h,026h,0 |
;============================================================================= |
48 |
%endmacro |
; Read only data |
49 |
|
;============================================================================= |
50 |
|
|
51 |
; Constants are only ever read, so they live in .rodata.  The COFF variant
; of the directive omits the explicit align= qualifier.
%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

; Eight bytes of 0x01: per-byte LSB mask loaded into mm7 by the rounding
; paths of the h/hv routines.
ALIGN 16
mmx_one:
  times 8 db 1

; Eight bytes of 0xFF (two dwords of -1): all-ones quadword used by the
; vertical rounding path to complement averaged values.
ALIGN 8
mm_minusone:
  dd -1,-1
64 |
|
|
65 |
section .text |
;============================================================================= |
66 |
|
; Macros |
67 |
|
;============================================================================= |
68 |
|
|
69 |
|
; nop4: emits the 4 raw bytes 8D 74 26 00, which decode as
; "lea esi, [esi+0]" (SIB form, 8-bit displacement) - a 4-byte filler that
; leaves esi and the flags unchanged.
%macro nop4 0
  DB 08Dh,074h,026h,0
%endmacro
72 |
|
|
73 |
|
;============================================================================= |
74 |
|
; Code
75 |
|
;============================================================================= |
76 |
|
|
77 |
|
SECTION .text |
78 |
|
|
79 |
; Exported entry points.  Each name is declared exactly once: cglobal may
; %define the name, so a repeated declaration would expand the already
; redefined spelling.
cglobal interpolate8x8_halfpel_h_3dne
cglobal interpolate8x8_halfpel_v_3dne
cglobal interpolate8x8_halfpel_hv_3dne

cglobal interpolate8x4_halfpel_h_3dne
cglobal interpolate8x4_halfpel_v_3dne
cglobal interpolate8x4_halfpel_hv_3dne
86 |
|
|
87 |
|
;----------------------------------------------------------------------------- |
88 |
; |
; |
89 |
; void interpolate8x8_halfpel_h_3dne(uint8_t * const dst, |
; void interpolate8x8_halfpel_h_3dne(uint8_t * const dst, |
90 |
; const uint8_t * const src, |
; const uint8_t * const src, |
91 |
; const uint32_t stride, |
; const uint32_t stride, |
92 |
; const uint32_t rounding); |
; const uint32_t rounding); |
93 |
; |
; |
94 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
95 |
|
|
96 |
%macro COPY_H_SSE_RND0 1 |
%macro COPY_H_SSE_RND0 1 |
97 |
%if (%1) |
%if (%1) |
98 |
movq mm0, [eax] |
movq mm0, [eax] |
99 |
%else |
%else |
100 |
movq mm0, [dword eax] |
movq mm0, [eax+0] |
101 |
|
; --- |
102 |
|
; nasm >0.99.x rejects the original statement: |
103 |
|
; movq mm0, [dword eax] |
104 |
|
; as it is ambiguous. for this statement nasm <0.99.x would |
105 |
|
; generate "movq mm0,[eax+0]" |
106 |
|
; --- |
107 |
%endif |
%endif |
108 |
pavgb mm0, [eax+1] |
pavgb mm0, [eax+1] |
109 |
movq mm1, [eax+edx] |
movq mm1, [eax+edx] |
133 |
movq [ecx+edx], mm1 |
movq [ecx+edx], mm1 |
134 |
%endmacro |
%endmacro |
135 |
|
|
136 |
align 16 |
ALIGN 16 |
137 |
interpolate8x8_halfpel_h_3dne: |
interpolate8x8_halfpel_h_3dne: |
138 |
|
|
139 |
mov eax, [esp+ 8] ; Src |
mov eax, [esp+ 8] ; Src |
164 |
lea ecx,[ecx+2*edx] |
lea ecx,[ecx+2*edx] |
165 |
COPY_H_SSE_RND1 |
COPY_H_SSE_RND1 |
166 |
ret |
ret |
167 |
|
.endfunc |
168 |
|
|
169 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
170 |
; |
; |
171 |
; void interpolate8x8_halfpel_v_3dne(uint8_t * const dst, |
; void interpolate8x8_halfpel_v_3dne(uint8_t * const dst, |
172 |
; const uint8_t * const src, |
; const uint8_t * const src, |
173 |
; const uint32_t stride, |
; const uint32_t stride, |
174 |
; const uint32_t rounding); |
; const uint32_t rounding); |
175 |
; |
; |
176 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
177 |
|
|
178 |
align 16 |
ALIGN 16 |
179 |
interpolate8x8_halfpel_v_3dne: |
interpolate8x8_halfpel_v_3dne: |
180 |
|
|
181 |
mov eax, [esp+ 8] ; Src |
mov eax, [esp+ 8] ; Src |
223 |
movq [ecx+edx],mm7 |
movq [ecx+edx],mm7 |
224 |
ret |
ret |
225 |
|
|
226 |
align 8 |
ALIGN 8 |
227 |
.rounding1 |
.rounding1 |
228 |
pcmpeqb mm0,mm0 |
pcmpeqb mm0,mm0 |
229 |
psubusb mm0,[eax] |
psubusb mm0,[eax] |
288 |
movq [ecx], mm4 |
movq [ecx], mm4 |
289 |
movq [ecx+edx], mm5 |
movq [ecx+edx], mm5 |
290 |
ret |
ret |
291 |
;=========================================================================== |
.endfunc |
292 |
|
|
293 |
|
;----------------------------------------------------------------------------- |
294 |
; |
; |
295 |
; void interpolate8x8_halfpel_hv_3dne(uint8_t * const dst, |
; void interpolate8x8_halfpel_hv_3dne(uint8_t * const dst, |
296 |
; const uint8_t * const src, |
; const uint8_t * const src, |
298 |
; const uint32_t rounding); |
; const uint32_t rounding); |
299 |
; |
; |
300 |
; |
; |
301 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
302 |
|
|
303 |
; The trick is to correct the result of 'pavgb' with some combination of the |
; The trick is to correct the result of 'pavgb' with some combination of the |
304 |
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t). |
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t). |
384 |
movq [ecx+edx], mm0 |
movq [ecx+edx], mm0 |
385 |
%endmacro |
%endmacro |
386 |
|
|
387 |
align 16 |
ALIGN 16 |
388 |
interpolate8x8_halfpel_hv_3dne: |
interpolate8x8_halfpel_hv_3dne: |
389 |
mov eax, [esp+ 8] ; Src |
mov eax, [esp+ 8] ; Src |
390 |
mov edx, [esp+12] ; stride |
mov edx, [esp+12] ; stride |
410 |
COPY_HV_SSE_RND0 |
COPY_HV_SSE_RND0 |
411 |
ret |
ret |
412 |
|
|
413 |
align 16 |
ALIGN 16 |
414 |
.rounding1 |
.rounding1 |
415 |
COPY_HV_SSE_RND1 |
COPY_HV_SSE_RND1 |
416 |
lea ecx,[ecx+2*edx] |
lea ecx,[ecx+2*edx] |
420 |
lea ecx,[ecx+2*edx] |
lea ecx,[ecx+2*edx] |
421 |
COPY_HV_SSE_RND1 |
COPY_HV_SSE_RND1 |
422 |
ret |
ret |
423 |
|
.endfunc |
424 |
|
|
425 |
|
;-----------------------------------------------------------------------------
;
; void interpolate8x4_halfpel_h_3dne(uint8_t * const dst,
;                                    const uint8_t * const src,
;                                    const uint32_t stride,
;                                    const uint32_t rounding);
;
; Horizontal half-pel interpolation of a block 8 bytes wide, 4 rows high.
;
; Arguments are read from the stack relative to esp on entry:
;   [esp+ 4] dst   [esp+ 8] src   [esp+12] stride   [esp+16] rounding
; Register roles: eax = src, ecx = dst, edx = stride,
;                 mm7 = mmx_one (rounding path only).
; The rounding argument slot is decremented in place; ZF (rounding == 1)
; selects the .rounding1 path.
; The COPY_H_SSE_* macros are defined earlier in this file.
; NOTE(review): each invocation appears to produce two rows (stores at
; [ecx] and [ecx+edx]) - confirm against the macro bodies.
;-----------------------------------------------------------------------------

ALIGN 16
interpolate8x4_halfpel_h_3dne:

  mov eax, [esp+ 8] ; Src
  mov edx, [esp+12] ; stride
  dec dword [esp+16]; rounding

  jz .rounding1
  mov ecx, [esp+ 4] ; Dst

  ; The macro argument only picks the encoding of the first load
  ; ([eax+0] vs [eax]); the computation is the same.
  COPY_H_SSE_RND0 0
  lea ecx,[ecx+2*edx]     ; dst += 2 rows
  COPY_H_SSE_RND0 1
  ret

.rounding1:
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
  mov ecx, [esp+ 4] ; Dst
  movq mm7, [mmx_one]
  COPY_H_SSE_RND1
  lea ecx, [ecx+2*edx]    ; dst += 2 rows
  COPY_H_SSE_RND1
  ret
.endfunc:
458 |
|
|
459 |
|
;-----------------------------------------------------------------------------
;
; void interpolate8x4_halfpel_v_3dne(uint8_t * const dst,
;                                    const uint8_t * const src,
;                                    const uint32_t stride,
;                                    const uint32_t rounding);
;
; Vertical half-pel interpolation of a block 8 bytes wide, 4 rows high.
; Output row n is the byte-wise average of source rows n and n+1, so five
; source rows (line0..line4) are read.
;
; Arguments are read from the stack relative to esp on entry:
;   [esp+ 4] dst   [esp+ 8] src   [esp+12] stride   [esp+16] rounding
; Register roles: eax = src cursor, ecx = dst cursor, edx = stride,
;                 esi = address of mm_minusone (rounding path; saved and
;                 restored with push/pop).
; The rounding argument slot is decremented in place; ZF (rounding == 1)
; selects the .rounding1 path.
;
; rounding == 0: pavgb gives (a+b+1)>>1 directly.
; rounding == 1: the inputs are complemented, averaged with pavgb and the
;                result complemented again, which yields (a+b)>>1.
;
; The instruction order is hand-scheduled; do not reorder without measuring.
;-----------------------------------------------------------------------------

ALIGN 16
interpolate8x4_halfpel_v_3dne:

  mov eax, [esp+ 8] ; Src
  mov edx, [esp+12] ; stride
  dec dword [esp+16]; rounding

  ; we process 2 line at a time

  jz .rounding1
  pxor mm2,mm2
  movq mm0, [eax]                 ; line0
  movq mm1, [eax+edx]             ; line1
  por mm2, [eax+2*edx]            ; Something like preload (pipelining)
  mov ecx, [esp+ 4]               ; Dst
  lea eax, [eax+2*edx]            ; eax==line2
  pxor mm4, mm4
  pavgb mm0, mm1                  ; row0 = avg(line0, line1)
  pavgb mm1, mm2                  ; row1 = avg(line1, line2)
  movq [byte ecx], mm0
  movq [ecx+edx], mm1

  pxor mm6, mm6                   ; NOTE(review): mm6 is not read again on this path
  add eax, edx                    ; eax==line3
  lea ecx, [ecx+2*edx]            ; dst += 2 rows
  movq mm3, [byte eax]            ; line3
  por mm4, [eax+edx]              ; line4 (mm4 was zeroed above)
  lea eax, [eax+2*edx]            ; NOTE(review): eax is not read again before ret
  pavgb mm2, mm3                  ; row2 = avg(line2, line3)
  pavgb mm3, mm4                  ; row3 = avg(line3, line4)
  movq [ecx], mm2
  movq [ecx+edx], mm3

  ret

ALIGN 8
.rounding1:
  pcmpeqb mm0, mm0                ; mm0 = all ones
  psubusb mm0, [eax]              ; eax==line0
  add eax, edx                    ; eax==line1
  mov ecx, [esp+ 4]               ; Dst

  push esi                        ; all [esp+N] argument reads are done above

  pcmpeqb mm1, mm1
  pcmpeqb mm2, mm2
  mov esi, mm_minusone
  psubusb mm1, [byte eax]         ; line1
  psubusb mm2, [eax+edx]          ; line2
  lea eax, [eax+2*edx]            ; eax==line3
  movq mm6, [esi]
  movq mm7, [esi]
  pavgb mm0, mm1
  pavgb mm1, mm2
  psubusb mm6, mm0                ; complement back
  psubusb mm7, mm1
  movq [ecx], mm6                 ; store line0
  movq [ecx+edx], mm7             ; store line1

  lea ecx, [ecx+2*edx]            ; dst += 2 rows
  pcmpeqb mm3, mm3
  pcmpeqb mm4, mm4
  psubusb mm3, [eax]              ; line3
  psubusb mm4, [eax+edx]          ; line4
  lea eax, [eax+2*edx]            ; eax==line 5
  pavgb mm2, mm3
  pavgb mm3, mm4
  movq mm0, [esi]
  movq mm1, [esi]
  psubusb mm0, mm2                ; complement back
  psubusb mm1, mm3
  movq [ecx], mm0                 ; store row2
  movq [ecx+edx], mm1             ; store row3

  pop esi

  ret

.endfunc:
547 |
|
|
548 |
|
;-----------------------------------------------------------------------------
;
; void interpolate8x4_halfpel_hv_3dne(uint8_t * const dst,
;                                     const uint8_t * const src,
;                                     const uint32_t stride,
;                                     const uint32_t rounding);
;
; Combined horizontal + vertical half-pel interpolation of a block 8 bytes
; wide, 4 rows high.
;
; Arguments are read from the stack relative to esp on entry:
;   [esp+ 4] dst   [esp+ 8] src   [esp+12] stride   [esp+16] rounding
; Register roles: eax = src, ecx = dst, edx = stride, mm7 = mmx_one,
;                 mm2/mm3 = state carried between COPY_HV_SSE_* invocations.
; The rounding argument slot is decremented in place.  The resulting ZF is
; consumed by the later "jz"; the MMX instructions and the mov in between
; leave the flags untouched.
; The COPY_HV_SSE_* macros are defined earlier in this file.
; NOTE(review): each invocation appears to produce two rows - confirm
; against the macro bodies.
;
;-----------------------------------------------------------------------------

ALIGN 16
interpolate8x4_halfpel_hv_3dne:
  mov eax, [esp+ 8]     ; Src
  mov edx, [esp+12]     ; stride
  dec dword [esp+16]    ; rounding

    ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
  movq mm2, [eax]
  movq mm3, [eax+1]
  movq mm6, mm2
  pavgb mm2, mm3
  pxor mm3, mm6         ; mm2/mm3 ready
  mov ecx, [esp+ 4]     ; Dst
  movq mm7, [mmx_one]

  jz near .rounding1
  lea ebp,[byte ebp]    ; ebp = ebp + 0: filler, changes neither ebp nor flags
  COPY_HV_SSE_RND0
  lea ecx,[ecx+2*edx]   ; dst += 2 rows
  COPY_HV_SSE_RND0
  ret

ALIGN 16
.rounding1:
  COPY_HV_SSE_RND1
  lea ecx,[ecx+2*edx]   ; dst += 2 rows
  COPY_HV_SSE_RND1
  ret
.endfunc:
587 |
|
|