1 |
;/************************************************************************** |
;/***************************************************************************** |
2 |
; * |
; * |
3 |
; * XVID MPEG-4 VIDEO CODEC |
; * XVID MPEG-4 VIDEO CODEC |
4 |
; * xmm 8x8 block-based halfpel interpolation |
; * - 3dne pipeline optimized 8x8 block-based halfpel interpolation - |
5 |
; * |
; * |
6 |
; * This program is an implementation of a part of one or more MPEG-4 |
; * Copyright(C) 2002 Jaan Kalda |
|
; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending |
|
|
; * to use this software module in hardware or software products are |
|
|
; * advised that its use may infringe existing patents or copyrights, and |
|
|
; * any such use would be at such party's own risk. The original |
|
|
; * developer of this software module and his/her company, and subsequent |
|
|
; * editors and their companies, will have no liability for use of this |
|
|
; * software or modifications or derivatives thereof. |
|
7 |
; * |
; * |
8 |
; * This program is free software; you can redistribute it and/or modify |
; * This program is free software; you can redistribute it and/or modify |
9 |
; * it under the terms of the GNU General Public License as published by |
; * it under the terms of the GNU General Public License as published by |
17 |
; * |
; * |
18 |
; * You should have received a copy of the GNU General Public License |
; * You should have received a copy of the GNU General Public License |
19 |
; * along with this program; if not, write to the Free Software |
; * along with this program; if not, write to the Free Software |
20 |
; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 |
; * |
; * |
22 |
; *************************************************************************/ |
; ****************************************************************************/ |
23 |
|
|
24 |
; these 3dne functions are compatible with iSSE, but are optimized specifically for |
; these 3dne functions are compatible with iSSE, but are optimized specifically |
25 |
; K7 pipelines |
; for K7 pipelines |
|
; |
|
|
;------------------------------------------------------------------------------ |
|
|
; 09.12.2002 Athlon optimizations contributed by Jaan Kalda |
|
|
;------------------------------------------------------------------------------ |
|
26 |
|
|
27 |
bits 32 |
BITS 32 |
28 |
|
|
29 |
%macro cglobal 1 |
%macro cglobal 1 |
30 |
%ifdef PREFIX |
%ifdef PREFIX |
31 |
|
%ifdef MARK_FUNCS |
32 |
|
global _%1:function %1.endfunc-%1 |
33 |
|
%define %1 _%1:function %1.endfunc-%1 |
34 |
|
%else |
35 |
global _%1 |
global _%1 |
36 |
%define %1 _%1 |
%define %1 _%1 |
37 |
|
%endif |
38 |
|
%else |
39 |
|
%ifdef MARK_FUNCS |
40 |
|
global %1:function %1.endfunc-%1 |
41 |
%else |
%else |
42 |
global %1 |
global %1 |
43 |
%endif |
%endif |
44 |
|
%endif |
45 |
%endmacro |
%endmacro |
46 |
%macro nop4 0 |
|
47 |
DB 08Dh,074h,026h,0 |
;============================================================================= |
48 |
%endmacro |
; Read only data |
49 |
|
;============================================================================= |
50 |
|
|
51 |
%ifdef FORMAT_COFF |
%ifdef FORMAT_COFF |
52 |
section .data data |
SECTION .rodata |
53 |
%else |
%else |
54 |
section .data data align=16 |
SECTION .rodata align=16 |
55 |
%endif |
%endif |
56 |
|
|
57 |
|
ALIGN 16 |
58 |
align 16 |
mmx_one: |
|
mmx_one |
|
59 |
times 8 db 1 |
times 8 db 1 |
60 |
|
|
61 |
align 8 |
ALIGN 8 |
62 |
mm_minusone: |
mm_minusone: |
63 |
dd -1,-1 |
dd -1,-1 |
64 |
|
|
65 |
section .text |
;============================================================================= |
66 |
|
; Macros |
67 |
|
;============================================================================= |
68 |
|
|
69 |
|
%macro nop4 0 |
70 |
|
DB 08Dh,074h,026h,0 |
71 |
|
%endmacro |
72 |
|
|
73 |
|
;============================================================================= |
74 |
|
; Macros |
75 |
|
;============================================================================= |
76 |
|
|
77 |
|
SECTION .text |
78 |
|
|
79 |
cglobal interpolate8x8_halfpel_h_3dne |
cglobal interpolate8x8_halfpel_h_3dne |
80 |
cglobal interpolate8x8_halfpel_v_3dne |
cglobal interpolate8x8_halfpel_v_3dne |
81 |
cglobal interpolate8x8_halfpel_hv_3dne |
cglobal interpolate8x8_halfpel_hv_3dne |
82 |
|
|
83 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
84 |
; |
; |
85 |
; void interpolate8x8_halfpel_h_3dne(uint8_t * const dst, |
; void interpolate8x8_halfpel_h_3dne(uint8_t * const dst, |
86 |
; const uint8_t * const src, |
; const uint8_t * const src, |
87 |
; const uint32_t stride, |
; const uint32_t stride, |
88 |
; const uint32_t rounding); |
; const uint32_t rounding); |
89 |
; |
; |
90 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
91 |
|
|
92 |
%macro COPY_H_SSE_RND0 1 |
%macro COPY_H_SSE_RND0 1 |
93 |
%if (%1) |
%if (%1) |
123 |
movq [ecx+edx], mm1 |
movq [ecx+edx], mm1 |
124 |
%endmacro |
%endmacro |
125 |
|
|
126 |
align 16 |
ALIGN 16 |
127 |
interpolate8x8_halfpel_h_3dne: |
interpolate8x8_halfpel_h_3dne: |
128 |
|
|
129 |
mov eax, [esp+ 8] ; Src |
mov eax, [esp+ 8] ; Src |
154 |
lea ecx,[ecx+2*edx] |
lea ecx,[ecx+2*edx] |
155 |
COPY_H_SSE_RND1 |
COPY_H_SSE_RND1 |
156 |
ret |
ret |
157 |
|
.endfunc |
158 |
|
|
159 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
160 |
; |
; |
161 |
; void interpolate8x8_halfpel_v_3dne(uint8_t * const dst, |
; void interpolate8x8_halfpel_v_3dne(uint8_t * const dst, |
162 |
; const uint8_t * const src, |
; const uint8_t * const src, |
163 |
; const uint32_t stride, |
; const uint32_t stride, |
164 |
; const uint32_t rounding); |
; const uint32_t rounding); |
165 |
; |
; |
166 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
167 |
|
|
168 |
align 16 |
ALIGN 16 |
169 |
interpolate8x8_halfpel_v_3dne: |
interpolate8x8_halfpel_v_3dne: |
170 |
|
|
171 |
mov eax, [esp+ 8] ; Src |
mov eax, [esp+ 8] ; Src |
213 |
movq [ecx+edx],mm7 |
movq [ecx+edx],mm7 |
214 |
ret |
ret |
215 |
|
|
216 |
align 8 |
ALIGN 8 |
217 |
.rounding1 |
.rounding1 |
218 |
pcmpeqb mm0,mm0 |
pcmpeqb mm0,mm0 |
219 |
psubusb mm0,[eax] |
psubusb mm0,[eax] |
278 |
movq [ecx], mm4 |
movq [ecx], mm4 |
279 |
movq [ecx+edx], mm5 |
movq [ecx+edx], mm5 |
280 |
ret |
ret |
281 |
;=========================================================================== |
.endfunc |
282 |
|
|
283 |
|
;----------------------------------------------------------------------------- |
284 |
; |
; |
285 |
; void interpolate8x8_halfpel_hv_3dne(uint8_t * const dst, |
; void interpolate8x8_halfpel_hv_3dne(uint8_t * const dst, |
286 |
; const uint8_t * const src, |
; const uint8_t * const src, |
288 |
; const uint32_t rounding); |
; const uint32_t rounding); |
289 |
; |
; |
290 |
; |
; |
291 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
292 |
|
|
293 |
; The trick is to correct the result of 'pavgb' with some combination of the |
; The trick is to correct the result of 'pavgb' with some combination of the |
294 |
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t). |
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t). |
374 |
movq [ecx+edx], mm0 |
movq [ecx+edx], mm0 |
375 |
%endmacro |
%endmacro |
376 |
|
|
377 |
align 16 |
ALIGN 16 |
378 |
interpolate8x8_halfpel_hv_3dne: |
interpolate8x8_halfpel_hv_3dne: |
379 |
mov eax, [esp+ 8] ; Src |
mov eax, [esp+ 8] ; Src |
380 |
mov edx, [esp+12] ; stride |
mov edx, [esp+12] ; stride |
400 |
COPY_HV_SSE_RND0 |
COPY_HV_SSE_RND0 |
401 |
ret |
ret |
402 |
|
|
403 |
align 16 |
ALIGN 16 |
404 |
.rounding1 |
.rounding1 |
405 |
COPY_HV_SSE_RND1 |
COPY_HV_SSE_RND1 |
406 |
lea ecx,[ecx+2*edx] |
lea ecx,[ecx+2*edx] |
410 |
lea ecx,[ecx+2*edx] |
lea ecx,[ecx+2*edx] |
411 |
COPY_HV_SSE_RND1 |
COPY_HV_SSE_RND1 |
412 |
ret |
ret |
413 |
|
.endfunc |
414 |
|
|