; *
; * This file is part of XviD, a free MPEG-4 video encoder/decoder
; *
; * XviD is free software; you can redistribute it and/or modify it
; * under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: qpel_mmx.asm,v 1.8 2008-11-26 01:04:34 Isibaar Exp $
; *
; *************************************************************************/

; instead of xvid_Expand_mmx...

%include "nasm.inc"
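
; nasm.inc is assumed to provide the portability glue used below:
; the cglobal/cextern symbol macros, the _EAX/_EBX/_EBP/_ESP register
; aliases, the TMP0/TMP1 scratch registers, the prm1..prm5 argument
; accessors, and SECTION_ALIGN/DATA/ENDFUNC. This reading is inferred
; from how those names are used in this file, not from nasm.inc itself.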

;//////////////////////////////////////////////////////////////////////
;// Declarations
cglobal xvid_V_Pass_Avrg_8_Add_mmx
cglobal xvid_V_Pass_Avrg_Up_8_Add_mmx

cglobal xvid_Expand_mmx

cglobal xvid_FIR_1_0_0_0
cglobal xvid_FIR_3_1_0_0
cglobal xvid_FIR_6_3_1_0
cglobal xvid_FIR_14_3_2_1
cglobal xvid_FIR_20_6_3_1
cglobal xvid_FIR_20_20_6_3
cglobal xvid_FIR_23_19_6_3
cglobal xvid_FIR_7_20_20_6
cglobal xvid_FIR_6_20_20_6
cglobal xvid_FIR_6_20_20_7
cglobal xvid_FIR_3_6_20_20
cglobal xvid_FIR_3_6_19_23
cglobal xvid_FIR_1_3_6_20
cglobal xvid_FIR_1_2_3_14
cglobal xvid_FIR_0_1_3_6
cglobal xvid_FIR_0_0_1_3
cglobal xvid_FIR_0_0_0_1

SECTION .data align=SECTION_ALIGN

align SECTION_ALIGN
xvid_Expand_mmx:
times 256*4 dw 0 ; uint16_t xvid_Expand_mmx[256][4]
ENDFUNC
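
; Assumption: this table is zeroed here and filled at init time so that
; entry v holds four copies of v (one mmx word each); LOAD below fetches
; xvid_Expand_mmx[v] and multiplies it by a row of FIR taps with pmullw.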

xvid_FIR_1_0_0_0:
times 256*4 dw 0
ENDFUNC

xvid_FIR_3_1_0_0:
times 256*4 dw 0
ENDFUNC

xvid_FIR_6_3_1_0:
times 256*4 dw 0
ENDFUNC

xvid_FIR_14_3_2_1:
times 256*4 dw 0
ENDFUNC

xvid_FIR_20_6_3_1:
times 256*4 dw 0
ENDFUNC

xvid_FIR_20_20_6_3:
times 256*4 dw 0
ENDFUNC

xvid_FIR_23_19_6_3:
times 256*4 dw 0
ENDFUNC

xvid_FIR_7_20_20_6:
times 256*4 dw 0
ENDFUNC

xvid_FIR_6_20_20_6:
times 256*4 dw 0
ENDFUNC

xvid_FIR_6_20_20_7:
times 256*4 dw 0
ENDFUNC

xvid_FIR_3_6_20_20:
times 256*4 dw 0
ENDFUNC

xvid_FIR_3_6_19_23:
times 256*4 dw 0
ENDFUNC

xvid_FIR_1_3_6_20:
times 256*4 dw 0
ENDFUNC

xvid_FIR_1_2_3_14:
times 256*4 dw 0
ENDFUNC

xvid_FIR_0_1_3_6:
times 256*4 dw 0
ENDFUNC

xvid_FIR_0_0_1_3:
times 256*4 dw 0
ENDFUNC

xvid_FIR_0_0_0_1:
times 256*4 dw 0
ENDFUNC
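
; Assumption: each xvid_FIR_a_b_c_d table is filled at init time with the
; products {a*v, b*v, c*v, d*v} for every byte value v (signs follow the
; filter rows below), so the TLOAD/TACCUM macros can replace pmullw with
; plain table lookups.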

;//////////////////////////////////////////////////////////////////////

DATA

align SECTION_ALIGN
Rounder1_MMX:
times 4 dw 1
Rounder0_MMX:
times 4 dw 0

align SECTION_ALIGN
Rounder_QP_MMX:
times 4 dw 16
times 4 dw 15
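
; The H/V passes divide the FIR sum by 32; mm7 is loaded with 16 (Rnd=0)
; or 15 (Rnd=1), matching the MPEG-4 quarter-pel rounding
; (sum + 16 - Rnd) >> 5.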

%ifndef USE_TABLES

align SECTION_ALIGN

; H-Pass table shared by 16x? and 8x? filters

FIR_R0: dw 14, -3, 2, -1
align SECTION_ALIGN
FIR_R1: dw 23, 19, -6, 3, -1, 0, 0, 0

FIR_R2: dw -7, 20, 20, -6, 3, -1, 0, 0
FIR_R4: dw -1, 3, -6, 20, 20, -6, 3, -1

FIR_R5: dw 0, -1, 3, -6, 20, 20, -6, 3, -1, 0, 0, 0
align SECTION_ALIGN
FIR_R6: dw 0, 0, -1, 3, -6, 20, 20, -6, 3, -1, 0, 0
align SECTION_ALIGN
FIR_R7: dw 0, 0, 0, -1, 3, -6, 20, 20, -6, 3, -1, 0
align SECTION_ALIGN
FIR_R8: dw -1, 3, -6, 20, 20, -6, 3, -1

FIR_R9: dw 0, -1, 3, -6, 20, 20, -6, 3, -1, 0, 0, 0
align SECTION_ALIGN
FIR_R10: dw 0, 0, -1, 3, -6, 20, 20, -6, 3, -1, 0, 0
align SECTION_ALIGN
FIR_R11: dw 0, 0, 0, -1, 3, -6, 20, 20, -6, 3, -1, 0
align SECTION_ALIGN
FIR_R12: dw -1, 3, -6, 20, 20, -6, 3, -1

FIR_R13: dw 0, -1, 3, -6, 20, 20, -6, 3
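
; These are rows of the MPEG-4 quarter-pel FIR (-1, 3, -6, 20, 20, -6, 3, -1)/32,
; apparently stored transposed: FIR_Rn holds the contribution of input
; pixel n to the neighbouring output pixels, with the block-edge mirroring
; folded into the irregular first and last rows (14, 23, 19, -7, ...).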

; V-Pass taps

align SECTION_ALIGN
FIR_Cm7: times 4 dw -7
FIR_Cm6: times 4 dw -6
FIR_Cm3: times 4 dw -3
FIR_C20: times 4 dw 20
FIR_C23: times 4 dw 23

SECTION .rotext align=SECTION_ALIGN

;//////////////////////////////////////////////////////////////////////
;// Here we go with the Q-Pel mess.
;// For vertical ones, we process 4 *input* pixels in parallel.
;//////////////////////////////////////////////////////////////////////

%ifdef ARCH_IS_X86_64
%macro XVID_MOVQ 3
  lea r9, [%2]
  movq %1, [r9 + %3]
%endmacro
%macro XVID_PADDW 3
  lea r9, [%2]
  paddw %1, [r9 + %3]
%endmacro
%ifdef WINDOWS
%define SRC_PTR _EDX
%define DST_PTR _ECX
%else
%define SRC_PTR _ESI
%define DST_PTR _EDI
%endif
%else
%macro XVID_MOVQ 3
  movq %1, [%2 + %3]
%endmacro
%macro XVID_PADDW 3
  paddw %1, [%2 + %3]
%endmacro
%define SRC_PTR _ESI
%define DST_PTR _EDI
%endif
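
; On 64-bit targets a lookup like [table + reg*8] cannot use the table's
; absolute address as a displacement, so XVID_MOVQ/XVID_PADDW first
; materialize the address in r9 with lea; r9 is free as a scratch register
; here once the prologs below have copied the incoming arguments.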

%macro PROLOG_NO_AVRG 0
  mov TMP0, prm3 ; Size
  mov TMP1, prm4 ; BpS
  mov eax, prm5d ; Rnd

%ifndef ARCH_IS_X86_64
  push SRC_PTR
  push DST_PTR
%endif
  push _EBP
  mov _EBP, TMP1

%ifndef ARCH_IS_X86_64
  mov DST_PTR, [_ESP+16 + 0*4] ; Dst
  mov SRC_PTR, [_ESP+16 + 1*4] ; Src
%endif

  and _EAX, 1
  lea TMP1, [Rounder_QP_MMX]
  movq mm7, [TMP1+_EAX*8] ; rounder
%endmacro
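
; Presumed argument order, inferred from the prolog comments:
; (Dst, Src, Size, BpS, Rnd). On x86_32 the three pushes plus the return
; address put Dst/Src at [_ESP+16]; on x86_64 SRC_PTR/DST_PTR already
; alias the argument registers, so nothing needs to be reloaded.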

%macro EPILOG_NO_AVRG 0
  pop _EBP
%ifndef ARCH_IS_X86_64
  pop DST_PTR
  pop SRC_PTR
%endif
  ret
%endmacro

%macro PROLOG_AVRG 0
  mov TMP0, prm3 ; Size
  mov TMP1, prm4 ; BpS
  mov eax, prm5d ; Rnd

  push _EBX
  push _EBP
%ifndef ARCH_IS_X86_64
  push SRC_PTR
  push DST_PTR
%endif
  mov _EBP, TMP1

%ifndef ARCH_IS_X86_64
  mov DST_PTR, [_ESP+20 + 0*4] ; Dst
  mov SRC_PTR, [_ESP+20 + 1*4] ; Src
%endif

  and _EAX, 1
  lea TMP1, [Rounder_QP_MMX]
  movq mm7, [TMP1+_EAX*8] ; rounder
  lea TMP1, [Rounder1_MMX]
  lea _EBX, [TMP1+_EAX*8] ; *Rounder2
%endmacro
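
; After this, _EBX points at Rounder1_MMX for Rnd=0 or, since Rounder0_MMX
; immediately follows it in .data, at Rounder0_MMX for Rnd=1; the MIX/V_MIX
; macros presumably use it as the rounding constant for averaging.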

%macro EPILOG_AVRG 0
%ifndef ARCH_IS_X86_64
  pop DST_PTR
  pop SRC_PTR
%endif
  pop _EBP
  pop _EBX
  ret
%endmacro

; macros for USE_TABLES

%macro TLOAD 2 ; %1,%2: src pixels
  movzx _EAX, byte [SRC_PTR+%1]
  movzx TMP1, byte [SRC_PTR+%2]
  XVID_MOVQ mm0, xvid_FIR_14_3_2_1, _EAX*8
  XVID_MOVQ mm3, xvid_FIR_1_2_3_14, TMP1*8
  paddw mm0, mm7
  paddw mm3, mm7
%endmacro

%macro TACCUM2 5 ;%1:src pixel/%2-%3:Taps tables/ %4-%5:dst regs
  movzx _EAX, byte [SRC_PTR+%1]
  XVID_PADDW %4, %2, _EAX*8
  XVID_PADDW %5, %3, _EAX*8
%endmacro

%macro TACCUM3 7 ;%1:src pixel/%2-%4:Taps tables/%5-%7:dst regs
  movzx _EAX, byte [SRC_PTR+%1]
  XVID_PADDW %5, %2, _EAX*8
  XVID_PADDW %6, %3, _EAX*8
  XVID_PADDW %7, %4, _EAX*8
%endmacro
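
; Table-driven accumulation: each source pixel adds its precomputed
; pixel*taps row (see the xvid_FIR_* tables) into the registers that
; accumulate the neighbouring output pixels, avoiding pmullw entirely.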

;//////////////////////////////////////////////////////////////////////
; macros without USE_TABLES

%macro LOAD 2 ; %1,%2: src pixels
  movzx _EAX, byte [SRC_PTR+%1]
  movzx TMP1, byte [SRC_PTR+%2]
  XVID_MOVQ mm0, xvid_Expand_mmx, _EAX*8
  XVID_MOVQ mm3, xvid_Expand_mmx, TMP1*8
  pmullw mm0, [FIR_R0 ]
  pmullw mm3, [FIR_R16]
  paddw mm0, mm7
  paddw mm3, mm7 ; assumed, mirrors TLOAD above
%endmacro
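
; Non-table variant of TLOAD: xvid_Expand_mmx broadcasts the pixel to
; four words, which pmullw then multiplies by a FIR_R* tap row.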

%macro ACCUM2 4 ;src pixel/Taps/dst regs #1-#2
  movzx _EAX, byte [SRC_PTR+%1]
  XVID_MOVQ mm4, xvid_Expand_mmx, _EAX*8
  movq mm5, mm4
  pmullw mm4, [%2]
  pmullw mm5, [%2+8]
  paddw %3, mm4 ; assumed accumulation into dst reg #1
  paddw %4, mm5 ; assumed accumulation into dst reg #2
%endmacro

%macro ACCUM3 5 ;src pixel/Taps/dst regs #1-#2-#3
  movzx _EAX, byte [SRC_PTR+%1]
  XVID_MOVQ mm4, xvid_Expand_mmx, _EAX*8
  movq mm5, mm4
  movq mm6, mm5
  pmullw mm4, [%2 ]

  PROLOG_AVRG
%endif

.Loop:

  ; mm0..mm3 serve as a 4x4 delay line

  packuswb mm2, mm3

%if (%1==1)
  MIX mm0, SRC_PTR, _EBX
%elif (%1==2)
  MIX mm0, SRC_PTR+1, _EBX
%endif
%if (%2==1)
  MIX mm0, DST_PTR, Rounder1_MMX
%endif

%if (%1==1)
  MIX mm2, SRC_PTR+8, _EBX
%elif (%1==2)
  MIX mm2, SRC_PTR+9, _EBX
%endif
%if (%2==1)
  MIX mm2, DST_PTR+8, Rounder1_MMX
%endif

  lea SRC_PTR, [SRC_PTR+_EBP]

  movq [DST_PTR+0], mm0
  movq [DST_PTR+8], mm2

  add DST_PTR, _EBP
  dec TMP0
  jg .Loop

%if (%2==0) && (%1==0)
  PROLOG_AVRG
%endif

.Loop:
  ; mm0..mm3 serve as a 4x4 delay line

%ifndef USE_TABLES

%else ; test with unrolling (a little faster, but not much)

  movzx _EAX, byte [SRC_PTR]
  movzx TMP1, byte [SRC_PTR+8]
  XVID_MOVQ mm0, xvid_FIR_14_3_2_1, _EAX*8
  movzx _EAX, byte [SRC_PTR+1]
  XVID_MOVQ mm3, xvid_FIR_1_2_3_14, TMP1*8
  paddw mm0, mm7
  paddw mm3, mm7

  movzx TMP1, byte [SRC_PTR+2]
  XVID_PADDW mm0, xvid_FIR_23_19_6_3, _EAX*8
  XVID_PADDW mm3, xvid_FIR_1_0_0_0, _EAX*8

  movzx _EAX, byte [SRC_PTR+3]
  XVID_PADDW mm0, xvid_FIR_7_20_20_6, TMP1*8
  XVID_PADDW mm3, xvid_FIR_3_1_0_0, TMP1*8

  movzx TMP1, byte [SRC_PTR+4]
  XVID_PADDW mm0, xvid_FIR_3_6_20_20, _EAX*8
  XVID_PADDW mm3, xvid_FIR_6_3_1_0, _EAX*8

  movzx _EAX, byte [SRC_PTR+5]
  XVID_PADDW mm0, xvid_FIR_1_3_6_20, TMP1*8
  XVID_PADDW mm3, xvid_FIR_20_6_3_1, TMP1*8

  movzx TMP1, byte [SRC_PTR+6]
  XVID_PADDW mm0, xvid_FIR_0_1_3_6, _EAX*8
  XVID_PADDW mm3, xvid_FIR_20_20_6_3, _EAX*8

  movzx _EAX, byte [SRC_PTR+7]
  XVID_PADDW mm0, xvid_FIR_0_0_1_3, TMP1*8
  XVID_PADDW mm3, xvid_FIR_6_20_20_7, TMP1*8

  XVID_PADDW mm0, xvid_FIR_0_0_0_1, _EAX*8
  XVID_PADDW mm3, xvid_FIR_3_6_19_23, _EAX*8

%endif
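
; Unrolled table version: mm0 accumulates output pixels 0..3 and mm3
; pixels 4..7; each of the nine source bytes (SRC_PTR+0..+8) contributes
; one table row to one or both accumulators, with the movzx loads
; interleaved between the adds.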

  packuswb mm0, mm3

%if (%1==1)
  MIX mm0, SRC_PTR, _EBX
%elif (%1==2)
  MIX mm0, SRC_PTR+1, _EBX
%endif
%if (%2==1)
  MIX mm0, DST_PTR, Rounder1_MMX
%endif

  movq [DST_PTR], mm0

  add DST_PTR, _EBP
  add SRC_PTR, _EBP
  dec TMP0
  jg .Loop

%if (%2==0) && (%1==0)

xvid_H_Pass_16_mmx: |
xvid_H_Pass_16_mmx: |
652 |
H_PASS_16 0, 0 |
H_PASS_16 0, 0 |
653 |
.endfunc |
ENDFUNC |
654 |
xvid_H_Pass_Avrg_16_mmx: |
xvid_H_Pass_Avrg_16_mmx: |
655 |
H_PASS_16 1, 0 |
H_PASS_16 1, 0 |
656 |
.endfunc |
ENDFUNC |
657 |
xvid_H_Pass_Avrg_Up_16_mmx: |
xvid_H_Pass_Avrg_Up_16_mmx: |
658 |
H_PASS_16 2, 0 |
H_PASS_16 2, 0 |
659 |
.endfunc |
ENDFUNC |
660 |
|
|
661 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
662 |
;// 8x? copy Functions |
;// 8x? copy Functions |
663 |
|
|
664 |
xvid_H_Pass_8_mmx: |
xvid_H_Pass_8_mmx: |
665 |
H_PASS_8 0, 0 |
H_PASS_8 0, 0 |
666 |
.endfunc |
ENDFUNC |
667 |
xvid_H_Pass_Avrg_8_mmx: |
xvid_H_Pass_Avrg_8_mmx: |
668 |
H_PASS_8 1, 0 |
H_PASS_8 1, 0 |
669 |
.endfunc |
ENDFUNC |
670 |
xvid_H_Pass_Avrg_Up_8_mmx: |
xvid_H_Pass_Avrg_Up_8_mmx: |
671 |
H_PASS_8 2, 0 |
H_PASS_8 2, 0 |
672 |
.endfunc |
ENDFUNC |
673 |
|
|
674 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
675 |
;// 16x? avrg Functions |
;// 16x? avrg Functions |
676 |
|
|
677 |
xvid_H_Pass_Add_16_mmx: |
xvid_H_Pass_Add_16_mmx: |
678 |
H_PASS_16 0, 1 |
H_PASS_16 0, 1 |
679 |
.endfunc |
ENDFUNC |
680 |
xvid_H_Pass_Avrg_Add_16_mmx: |
xvid_H_Pass_Avrg_Add_16_mmx: |
681 |
H_PASS_16 1, 1 |
H_PASS_16 1, 1 |
682 |
.endfunc |
ENDFUNC |
683 |
xvid_H_Pass_Avrg_Up_Add_16_mmx: |
xvid_H_Pass_Avrg_Up_Add_16_mmx: |
684 |
H_PASS_16 2, 1 |
H_PASS_16 2, 1 |
685 |
.endfunc |
ENDFUNC |
686 |
|
|
687 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
688 |
;// 8x? avrg Functions |
;// 8x? avrg Functions |
689 |
|
|
690 |
xvid_H_Pass_8_Add_mmx: |
xvid_H_Pass_8_Add_mmx: |
691 |
H_PASS_8 0, 1 |
H_PASS_8 0, 1 |
692 |
.endfunc |
ENDFUNC |
693 |
xvid_H_Pass_Avrg_8_Add_mmx: |
xvid_H_Pass_Avrg_8_Add_mmx: |
694 |
H_PASS_8 1, 1 |
H_PASS_8 1, 1 |
695 |
.endfunc |
ENDFUNC |
696 |
xvid_H_Pass_Avrg_Up_8_Add_mmx: |
xvid_H_Pass_Avrg_Up_8_Add_mmx: |
697 |
H_PASS_8 2, 1 |
H_PASS_8 2, 1 |
698 |
.endfunc |
ENDFUNC |

%macro V_LOAD 1 ; %1=Last?

  movd mm4, dword [TMP1]
  pxor mm6, mm6
%if (%1==0)
  add TMP1, _EBP
%endif
  punpcklbw mm4, mm6

packuswb %3, %3 |
packuswb %3, %3 |
762 |
|
|
763 |
%if (%1==1) |
%if (%1==1) |
764 |
V_MIX %3, esi, ebx |
V_MIX %3, SRC_PTR, _EBX |
765 |
add esi, ebp |
add SRC_PTR, _EBP |
766 |
%elif (%1==2) |
%elif (%1==2) |
767 |
add esi, ebp |
add SRC_PTR, _EBP |
768 |
V_MIX %3, esi, ebx |
V_MIX %3, SRC_PTR, _EBX |
769 |
%endif |
%endif |
770 |
%if (%2==1) |
%if (%2==1) |
771 |
V_MIX %3, edi, Rounder1_MMX |
V_MIX %3, DST_PTR, Rounder1_MMX |
772 |
%endif |
%endif |
773 |
|
|
774 |
movd eax, %3 |
movd eax, %3 |
775 |
mov [edi], eax |
mov [DST_PTR], _EAX |
776 |
|
|
777 |
%if (%4==0) |
%if (%4==0) |
778 |
add edi, ebp |
add DST_PTR, _EBP |
779 |
%endif |
%endif |
780 |
|
|
781 |
%endmacro |
%endmacro |
; the size (3rd argument) is meant to be a multiple of 4
; mm0..mm3 serve as a 4x4 delay line

.Loop:
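  ; Each iteration filters one 4-pixel-wide column stripe top to bottom
  ; in 4-row batches; DST_PTR/SRC_PTR are pushed here and popped, then
  ; advanced by 4, at the bottom of the loop.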

  push DST_PTR
  push SRC_PTR ; SRC_PTR is preserved for src-mixing
  mov TMP1, SRC_PTR

  ; output rows [0..3], from input rows [0..8]


  ; output rows [4..7], from input rows [1..11] (!!)

  mov SRC_PTR, [_ESP]
  lea TMP1, [SRC_PTR+_EBP]

  lea SRC_PTR, [SRC_PTR+4*_EBP] ; for src-mixing
  push SRC_PTR ; this will be the new value for next round

  movq mm0, mm7
  movq mm1, mm7

  ; output rows [8..11], from input rows [5..15]

  pop SRC_PTR
  lea TMP1, [SRC_PTR+_EBP]

  lea SRC_PTR, [SRC_PTR+4*_EBP] ; for src-mixing
  push SRC_PTR ; this will be the new value for next round

  movq mm0, mm7
  movq mm1, mm7

  ; output rows [12..15], from input rows [9..16]

  pop SRC_PTR
  lea TMP1, [SRC_PTR+_EBP]

%if (%1!=0)
  lea SRC_PTR, [SRC_PTR+4*_EBP] ; for src-mixing
%endif

  movq mm0, mm7

  ; ... next 4 columns

  pop SRC_PTR
  pop DST_PTR
  add SRC_PTR, 4
  add DST_PTR, 4
  sub TMP0, 4
  jg .Loop

%if (%2==0) && (%1==0)
; we process one stripe of 4x8 pixels each time
; the size (3rd argument) is meant to be a multiple of 4
; mm0..mm3 serve as a 4x4 delay line
.Loop:

  push DST_PTR
  push SRC_PTR ; SRC_PTR is preserved for src-mixing
  mov TMP1, SRC_PTR

  ; output rows [0..3], from input rows [0..8]


  ; output rows [4..7], from input rows [1..9]

  mov SRC_PTR, [_ESP]
  lea TMP1, [SRC_PTR+_EBP]

%if (%1!=0)
  lea SRC_PTR, [SRC_PTR+4*_EBP] ; for src-mixing
%endif

  movq mm0, mm7

  ; ... next 4 columns

  pop SRC_PTR
  pop DST_PTR
  add SRC_PTR, 4
  add DST_PTR, 4
  sub TMP0, 4
  jg .Loop

%if (%2==0) && (%1==0)

xvid_V_Pass_16_mmx:
  V_PASS_16 0, 0
ENDFUNC
xvid_V_Pass_Avrg_16_mmx:
  V_PASS_16 1, 0
ENDFUNC
xvid_V_Pass_Avrg_Up_16_mmx:
  V_PASS_16 2, 0
ENDFUNC

;//////////////////////////////////////////////////////////////////////
;// 8x? copy Functions

xvid_V_Pass_8_mmx:
  V_PASS_8 0, 0
ENDFUNC
xvid_V_Pass_Avrg_8_mmx:
  V_PASS_8 1, 0
ENDFUNC
xvid_V_Pass_Avrg_Up_8_mmx:
  V_PASS_8 2, 0
ENDFUNC

;//////////////////////////////////////////////////////////////////////
;// 16x? avrg Functions

xvid_V_Pass_Add_16_mmx:
  V_PASS_16 0, 1
ENDFUNC
xvid_V_Pass_Avrg_Add_16_mmx:
  V_PASS_16 1, 1
ENDFUNC
xvid_V_Pass_Avrg_Up_Add_16_mmx:
  V_PASS_16 2, 1
ENDFUNC

;//////////////////////////////////////////////////////////////////////
;// 8x? avrg Functions

xvid_V_Pass_8_Add_mmx:
  V_PASS_8 0, 1
ENDFUNC
xvid_V_Pass_Avrg_8_Add_mmx:
  V_PASS_8 1, 1
ENDFUNC
xvid_V_Pass_Avrg_Up_8_Add_mmx:
  V_PASS_8 2, 1
ENDFUNC

;//////////////////////////////////////////////////////////////////////

%undef SRC_PTR
%undef DST_PTR

%ifidn __OUTPUT_FORMAT__,elf
section ".note.GNU-stack" noalloc noexec nowrite progbits
%endif