Annotation of /trunk/xvidcore/src/dct/x86_asm/fdct_mmx_ffmpeg.asm

Revision 1793 - (view) (download)

1 :	edgomez	1382	;/****************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * - MMX and XMM forward discrete cosine transform -
5 :			; *
6 :			; * Copyright(C) 2003 Edouard Gomez <ed.gomez@free.fr>
7 :			; *
8 :			; * This program is free software; you can redistribute it and/or modify it
9 :			; * under the terms of the GNU General Public License as published by
10 :			; * the Free Software Foundation; either version 2 of the License, or
11 :			; * (at your option) any later version.
12 :			; *
13 :			; * This program is distributed in the hope that it will be useful,
14 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 :			; * GNU General Public License for more details.
17 :			; *
18 :			; * You should have received a copy of the GNU General Public License
19 :			; * along with this program; if not, write to the Free Software
20 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 :			; *
22 :	Isibaar	1793	; * $Id: fdct_mmx_ffmpeg.asm,v 1.7 2008-11-11 20:46:24 Isibaar Exp $
23 :	edgomez	1382	; *
24 :			; ***************************************************************************/
25 :
26 :			;/****************************************************************************
27 :			; *
28 :			; * Initial, but incomplete version provided by Intel at AppNote AP-922
29 :			; * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
30 :			; * Copyright (C) 1999 Intel Corporation
31 :			; *
32 :			; * Completed and corrected in fdctmm32.c/fdctmm32.doc
33 :			; * http://members.tripod.com/~liaor/
34 :			; * Copyright (C) 2000 - Royce Shih-Wea Liao <liaor@iname.com>
35 :			; *
36 :			; * Minimizing coefficients reordering changing the tables constants order
37 :			; * http://ffmpeg.sourceforge.net/
38 :			; * Copyright (C) 2001 Fabrice Bellard.
39 :			; *
40 :			; * The version coded here is just a port to NASM syntax from the FFMPEG's
41 :			; * version. So all credits go to the previous authors for all their
42 :			; * respective work in order to have a nice/fast mmx fDCT.
43 :			; ***************************************************************************/
44 :
45 :			BITS 32 ; assemble as 32-bit code; the generated entry points read their argument from [esp + 4]
46 :
47 :			;=============================================================================
48 :			; Macros and other preprocessor constants
49 :			;=============================================================================
50 :
51 :			%macro cglobal 1 ; cglobal <name>: export <name> as a global symbol and set ENDFUNC accordingly
52 :			%ifdef PREFIX ; PREFIX defined: the exported symbol gets a leading underscore
53 :	edgomez	1535	%ifdef MARK_FUNCS ; MARK_FUNCS defined: export with a ":function <size>" qualifier, size = <name>.endfunc - <name>
54 :	edgomez	1540	global _%1:function %1.endfunc-%1 ; NOTE(review): the %define on the next line also carries the ":function ..." text, yet <name> is later used as a plain label - confirm this is intended
55 :			%define %1 _%1:function %1.endfunc-%1
56 :	Isibaar	1793	%define ENDFUNC .endfunc
57 :	edgomez	1535	%else ; PREFIX without MARK_FUNCS: plain underscore-prefixed export, <name> aliased to _<name>
58 :			global _%1
59 :			%define %1 _%1
60 :	Isibaar	1793	%define ENDFUNC
61 :	edgomez	1535	%endif
62 :	edgomez	1382	%else ; no PREFIX: the symbol is exported under its own name
63 :	edgomez	1535	%ifdef MARK_FUNCS ; same size marking as above, without the underscore
64 :	edgomez	1540	global %1:function %1.endfunc-%1
65 :	Isibaar	1793	%define ENDFUNC .endfunc
66 :	edgomez	1535	%else ; neither PREFIX nor MARK_FUNCS: plain export, ENDFUNC expands to nothing
67 :			global %1
68 :	Isibaar	1793	%define ENDFUNC
69 :	edgomez	1535	%endif
70 :	edgomez	1382	%endif
71 :			%endmacro
72 :
73 :			;;; Define this to have MAKE_FDCT_FUNC emit the eight row passes unrolled (%rep) instead of as a counted loop
74 :			%define UNROLLED_LOOP ; tested with %ifdef inside MAKE_FDCT_FUNC
75 :
76 :			%define BITS_FRW_ACC 3 ; base precision constant from which the shifts below are derived
77 :			%define SHIFT_FRW_COL BITS_FRW_ACC ; left shift (psllw) applied to the column-pass inputs
78 :			%define SHIFT_FRW_ROW (BITS_FRW_ACC + 17) ; right shift (psrad) applied to the row-pass 32-bit sums
79 :			%define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) ; half of 1 << SHIFT_FRW_ROW; stored in fdct_r_row and added before the row shift
80 :			%define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) ; not referenced by any code visible in this file
81 :
82 :			;=============================================================================
83 :			; Local Data (Read Only)
84 :			;=============================================================================
85 :
86 :			%ifdef FORMAT_COFF ; COFF builds: open .rodata without an explicit section alignment attribute
87 :	edgomez	1519	SECTION .rodata
88 :	edgomez	1382	%else ; every other output format: request 16-byte section alignment
89 :	edgomez	1519	SECTION .rodata align=16
90 :	edgomez	1382	%endif
91 :
92 :			ALIGN 8
93 :			tab_frw_01234567: ; row-pass coefficients: 8 groups of 32 words (64 bytes); the row code steps through them 2*32 bytes at a time
94 :			dw 16384, 16384, -8867, -21407 ; group 0
95 :			dw 16384, 16384, 21407, 8867
96 :			dw 16384, -16384, 21407, -8867
97 :			dw -16384, 16384, 8867, -21407
98 :			dw 22725, 19266, -22725, -12873
99 :			dw 12873, 4520, 19266, -4520
100 :			dw 12873, -22725, 19266, -22725
101 :			dw 4520, 19266, 4520, -12873
102 :
103 :			dw 22725, 22725, -12299, -29692 ; group 1
104 :			dw 22725, 22725, 29692, 12299
105 :			dw 22725, -22725, 29692, -12299
106 :			dw -22725, 22725, 12299, -29692
107 :			dw 31521, 26722, -31521, -17855
108 :			dw 17855, 6270, 26722, -6270
109 :			dw 17855, -31521, 26722, -31521
110 :			dw 6270, 26722, 6270, -17855
111 :
112 :			dw 21407, 21407, -11585, -27969 ; group 2
113 :			dw 21407, 21407, 27969, 11585
114 :			dw 21407, -21407, 27969, -11585
115 :			dw -21407, 21407, 11585, -27969
116 :			dw 29692, 25172, -29692, -16819
117 :			dw 16819, 5906, 25172, -5906
118 :			dw 16819, -29692, 25172, -29692
119 :			dw 5906, 25172, 5906, -16819
120 :
121 :			dw 19266, 19266, -10426, -25172 ; group 3
122 :			dw 19266, 19266, 25172, 10426
123 :			dw 19266, -19266, 25172, -10426
124 :			dw -19266, 19266, 10426, -25172
125 :			dw 26722, 22654, -26722, -15137
126 :			dw 15137, 5315, 22654, -5315
127 :			dw 15137, -26722, 22654, -26722
128 :			dw 5315, 22654, 5315, -15137
129 :
130 :			dw 16384, 16384, -8867, -21407 ; group 4 (same values as group 0)
131 :			dw 16384, 16384, 21407, 8867
132 :			dw 16384, -16384, 21407, -8867
133 :			dw -16384, 16384, 8867, -21407
134 :			dw 22725, 19266, -22725, -12873
135 :			dw 12873, 4520, 19266, -4520
136 :			dw 12873, -22725, 19266, -22725
137 :			dw 4520, 19266, 4520, -12873
138 :
139 :			dw 19266, 19266, -10426, -25172 ; group 5 (same values as group 3)
140 :			dw 19266, 19266, 25172, 10426
141 :			dw 19266, -19266, 25172, -10426
142 :			dw -19266, 19266, 10426, -25172
143 :			dw 26722, 22654, -26722, -15137
144 :			dw 15137, 5315, 22654, -5315
145 :			dw 15137, -26722, 22654, -26722
146 :			dw 5315, 22654, 5315, -15137
147 :
148 :			dw 21407, 21407, -11585, -27969 ; group 6 (same values as group 2)
149 :			dw 21407, 21407, 27969, 11585
150 :			dw 21407, -21407, 27969, -11585
151 :			dw -21407, 21407, 11585, -27969
152 :			dw 29692, 25172, -29692, -16819
153 :			dw 16819, 5906, 25172, -5906
154 :			dw 16819, -29692, 25172, -29692
155 :			dw 5906, 25172, 5906, -16819 ; a stray trailing comma was removed here so the group stays exactly 32 words under any assembler version
156 :
157 :			dw 22725, 22725, -12299, -29692 ; group 7 (same values as group 1)
158 :			dw 22725, 22725, 29692, 12299
159 :			dw 22725, -22725, 29692, -12299
160 :			dw -22725, 22725, 12299, -29692
161 :			dw 31521, 26722, -31521, -17855
162 :			dw 17855, 6270, 26722, -6270
163 :			dw 17855, -31521, 26722, -31521
164 :			dw 6270, 26722, 6270, -17855
165 :
166 :			ALIGN 8
167 :			fdct_one_corr: ; four words of 1; the column pass ORs this (por) into some outputs, forcing their lowest bit to 1
168 :			dw 1, 1, 1, 1
169 :
170 :			ALIGN 8
171 :			fdct_tg_all_16: ; three 4-word pmulhw multipliers, read by the column pass at byte offsets 0*2, 4*2 and 8*2
172 :			dw 13036, 13036, 13036, 13036 ; offset 0*2; 13036/65536 ~ 0.199, presumably tan(pi/16) - confirm against AP-922
173 :			dw 27146, 27146, 27146, 27146 ; offset 4*2; 27146/65536 ~ 0.414, presumably tan(2*pi/16) - confirm
174 :			dw -21746, -21746, -21746, -21746 ; offset 8*2; -21746 = 43790 - 65536 (43790/65536 ~ 0.668, presumably tan(3*pi/16)); the column code adds the multiplicand back after pmulhw
175 :
176 :			ALIGN 8
177 :			cos_4_16: ; four identical words; not referenced by any code visible in this file
178 :			dw -19195, -19195, -19195, -19195 ; -19195 = 46341 - 65536 (46341/65536 ~ 0.7071, presumably cos(pi/4) - confirm)
179 :
180 :			ALIGN 8
181 :			ocos_4_16: ; pmulhw multiplier used by the column pass on operands that were shifted left by SHIFT_FRW_COL + 1
182 :			dw 23170, 23170, 23170, 23170 ; 23170/32768 ~ 0.7071, presumably cos(pi/4) at half the scale of the other multipliers - confirm
183 :
184 :			ALIGN 8
185 :			fdct_r_row: ; two dwords of RND_FRW_ROW; the row macros add this (paddd) to each 32-bit sum before the psrad by SHIFT_FRW_ROW
186 :			dd RND_FRW_ROW, RND_FRW_ROW
187 :
188 :			;=============================================================================
189 :			; Factorized parts of the code turned into macros for better understanding
190 :			;=============================================================================
191 :
192 :			;; Macro for column DCT (processes four adjacent columns at once)
193 :			;; FDCT_COLUMN_COMMON(int16_t *out, const int16_t *in, int offset);
194 :			;; - out, register name holding the out address
195 :			;; - in, register name holding the in address
196 :			;; - column number to process (index of the first of the four columns; scaled by 2 bytes per word)
197 :			%macro FDCT_COLUMN_COMMON 3 ; rows are 16 bytes apart: row r of the selected columns is at base + %3*2 + r*16; clobbers mm0-mm7
198 :			movq mm0, [%2 + %3*2 + 1*16] ; mm0 = row 1
199 :			movq mm1, [%2 + %3*2 + 6*16] ; mm1 = row 6
200 :			movq mm2, mm0
201 :			movq mm3, [%2 + %3*2 + 2*16] ; mm3 = row 2
202 :			paddsw mm0, mm1 ; mm0 = row1 + row6
203 :			movq mm4, [%2 + %3*2 + 5*16] ; mm4 = row 5
204 :			psllw mm0, SHIFT_FRW_COL
205 :			movq mm5, [%2 + %3*2 + 0*16] ; mm5 = row 0
206 :			paddsw mm4, mm3 ; mm4 = row5 + row2
207 :			paddsw mm5, [%2 + %3*2 + 7*16] ; mm5 = row0 + row7
208 :			psllw mm4, SHIFT_FRW_COL
209 :			movq mm6, mm0
210 :			psubsw mm2, mm1 ; mm2 = row1 - row6
211 :			movq mm1, [fdct_tg_all_16 + 4*2] ; second multiplier row of fdct_tg_all_16
212 :			psubsw mm0, mm4
213 :			movq mm7, [%2 + %3*2 + 3*16] ; mm7 = row 3
214 :			pmulhw mm1, mm0
215 :			paddsw mm7, [%2 + %3*2 + 4*16] ; mm7 = row3 + row4
216 :			psllw mm5, SHIFT_FRW_COL
217 :			paddsw mm6, mm4
218 :			psllw mm7, SHIFT_FRW_COL
219 :			movq mm4, mm5
220 :			psubsw mm5, mm7
221 :			paddsw mm1, mm5
222 :			paddsw mm4, mm7
223 :			por mm1, [fdct_one_corr] ; force the lowest bit of each result word to 1
224 :			psllw mm2, SHIFT_FRW_COL + 1
225 :			pmulhw mm5, [fdct_tg_all_16 + 4*2]
226 :			movq mm7, mm4
227 :			psubsw mm3, [%2 + %3*2 + 5*16] ; mm3 = row2 - row5
228 :			psubsw mm4, mm6
229 :			movq [%1 + %3*2 + 2*16], mm1 ; store output row 2 (input row 2 is not read again after this point)
230 :			paddsw mm7, mm6
231 :			movq mm1, [%2 + %3*2 + 3*16] ; reload row 3
232 :			psllw mm3, SHIFT_FRW_COL + 1
233 :			psubsw mm1, [%2 + %3*2 + 4*16] ; mm1 = row3 - row4
234 :			movq mm6, mm2
235 :			movq [%1 + %3*2 + 4*16], mm4 ; store output row 4 (input row 4 is not read again after this point)
236 :			paddsw mm2, mm3
237 :			pmulhw mm2, [ocos_4_16]
238 :			psubsw mm6, mm3
239 :			pmulhw mm6, [ocos_4_16]
240 :			psubsw mm5, mm0
241 :			por mm5, [fdct_one_corr]
242 :			psllw mm1, SHIFT_FRW_COL
243 :			por mm2, [fdct_one_corr]
244 :			movq mm4, mm1
245 :			movq mm3, [%2 + %3*2 + 0*16] ; reload row 0
246 :			paddsw mm1, mm6
247 :			psubsw mm3, [%2 + %3*2 + 7*16] ; mm3 = row0 - row7
248 :			psubsw mm4, mm6
249 :			movq mm0, [fdct_tg_all_16 + 0*2] ; first multiplier row of fdct_tg_all_16
250 :			psllw mm3, SHIFT_FRW_COL
251 :			movq mm6, [fdct_tg_all_16 + 8*2] ; third multiplier row of fdct_tg_all_16
252 :			pmulhw mm0, mm1
253 :			movq [%1 + %3*2 + 0*16], mm7 ; store output row 0 (all reads of input rows 0 and 7 are done)
254 :			pmulhw mm6, mm4
255 :			movq [%1 + %3*2 + 6*16], mm5 ; store output row 6
256 :			movq mm7, mm3
257 :			movq mm5, [fdct_tg_all_16 + 8*2]
258 :			psubsw mm7, mm2
259 :			paddsw mm3, mm2
260 :			pmulhw mm5, mm7
261 :			paddsw mm0, mm3
262 :			paddsw mm6, mm4 ; add the multiplicand back: the third multiplier is stored minus 65536
263 :			pmulhw mm3, [fdct_tg_all_16 + 0*2]
264 :			por mm0, [fdct_one_corr]
265 :			paddsw mm5, mm7 ; add the multiplicand back (same reason as above)
266 :			psubsw mm7, mm6
267 :			movq [%1 + %3*2 + 1*16], mm0 ; store output row 1
268 :			paddsw mm5, mm4
269 :			movq [%1 + %3*2 + 3*16], mm7 ; store output row 3
270 :			psubsw mm3, mm1
271 :			movq [%1 + %3*2 + 5*16], mm5 ; store output row 5
272 :			movq [%1 + %3*2 + 7*16], mm3 ; store output row 7
273 :			%endmacro
274 :
275 :			;; Macro for row DCT using MMX punpcklwd/punpckldq instructions (one 8-word row per invocation)
276 :			;; FDCT_ROW_MMX(int16_t *out, const int16_t *in, const int16_t *table);
277 :			;; - out, register name holding the out address
278 :			;; - in, register name holding the in address
279 :			;; - table coefficients address (register or absolute)
280 :			%macro FDCT_ROW_MMX 3 ; clobbers mm0-mm7; word lists below run from lowest to highest word
281 :			movd mm1, [%2 + 6*2] ; low dword of mm1 = input words 6 and 7
282 :			punpcklwd mm1, [%2 + 4*2] ; interleave with input words 4 and 5: mm1 = w6 w4 w7 w5
283 :			movq mm2, mm1
284 :			psrlq mm1, 0x20 ; mm1 = w7 w5 0 0
285 :			movq mm0, [%2 + 0*2] ; mm0 = w0 w1 w2 w3
286 :			punpcklwd mm1, mm2 ; mm1 = w7 w6 w5 w4 (second half of the row, reversed)
287 :			movq mm5, mm0
288 :			paddsw mm0, mm1 ; mm0 = w[k] + w[7-k] for k = 0..3 (saturating)
289 :			psubsw mm5, mm1 ; mm5 = w[k] - w[7-k] for k = 0..3 (saturating)
290 :			movq mm1, mm0
291 :			movq mm6, mm5
292 :			punpckldq mm3, mm5 ; high dword of mm3 = low dword of mm5 (the low dword of mm3 is not used)
293 :			punpckhdq mm6, mm3 ; mm6 = mm5 with its two dwords swapped
294 :			movq mm3, [%3 + 0*2]
295 :			movq mm4, [%3 + 4*2]
296 :			punpckldq mm2, mm0 ; high dword of mm2 = low dword of mm0
297 :			pmaddwd mm3, mm0
298 :			punpckhdq mm1, mm2 ; mm1 = mm0 with its two dwords swapped
299 :			movq mm2, [%3 + 16*2]
300 :			pmaddwd mm4, mm1
301 :			pmaddwd mm0, [%3 + 8*2]
302 :			movq mm7, [%3 + 20*2]
303 :			pmaddwd mm2, mm5
304 :			paddd mm3, [fdct_r_row] ; add the rounding constant before the final shift
305 :			pmaddwd mm7, mm6
306 :			pmaddwd mm1, [%3 + 12*2]
307 :			paddd mm3, mm4
308 :			pmaddwd mm5, [%3 + 24*2]
309 :			pmaddwd mm6, [%3 + 28*2]
310 :			paddd mm2, mm7
311 :			paddd mm0, [fdct_r_row]
312 :			psrad mm3, SHIFT_FRW_ROW ; scale the 32-bit sums back down
313 :			paddd mm2, [fdct_r_row]
314 :			paddd mm0, mm1
315 :			paddd mm5, [fdct_r_row]
316 :			psrad mm2, SHIFT_FRW_ROW
317 :			paddd mm5, mm6
318 :			psrad mm0, SHIFT_FRW_ROW
319 :			psrad mm5, SHIFT_FRW_ROW
320 :			packssdw mm3, mm0 ; pack the four sums built from the additions into signed words
321 :			packssdw mm2, mm5 ; pack the four sums built from the differences into signed words
322 :			movq mm6, mm3
323 :			punpcklwd mm3, mm2 ; interleave the two packed sets word by word ...
324 :			punpckhwd mm6, mm2 ; ... to form the eight output words in order
325 :			movq [%1 + 0*2], mm3 ; output words 0..3
326 :			movq [%1 + 4*2], mm6 ; output words 4..7
327 :			%endmacro
328 :
329 :			;; Macro for row DCT using the pshufw instruction (the "XMM" variant; one 8-word row per invocation)
330 :			;; FDCT_ROW_XMM(int16_t *out, const int16_t *in, const int16_t *table);
331 :			;; - out, register name holding the out address
332 :			;; - in, register name holding the in address
333 :			;; - table coefficient address
334 :			%macro FDCT_ROW_XMM 3 ; clobbers mm0-mm7
335 :			;; Presumably ported from a routine named fdct_row_mmx2 (TODO confirm); in this macro %1 = out, %2 = in, %3 = table
336 :			pshufw mm5, [%2 + 4*2], 0x1B ; mm5 = input words 4..7 in reversed order (w7 w6 w5 w4, lowest word first)
337 :			movq mm0, [%2 + 0*2] ; mm0 = input words 0..3
338 :			movq mm1, mm0
339 :			paddsw mm0, mm5 ; mm0 = w[k] + w[7-k] for k = 0..3 (saturating)
340 :			psubsw mm1, mm5 ; mm1 = w[k] - w[7-k] for k = 0..3 (saturating)
341 :			pshufw mm2, mm0, 0x4E ; mm2 = mm0 with its two dwords swapped
342 :			pshufw mm3, mm1, 0x4E ; mm3 = mm1 with its two dwords swapped
343 :			movq mm4, [%3 + 0*2]
344 :			movq mm6, [%3 + 4*2]
345 :			movq mm5, [%3 + 16*2]
346 :			movq mm7, [%3 + 20*2]
347 :			pmaddwd mm4, mm0
348 :			pmaddwd mm5, mm1
349 :			pmaddwd mm6, mm2
350 :			pmaddwd mm7, mm3
351 :			pmaddwd mm0, [%3 + 8*2]
352 :			pmaddwd mm2, [%3 + 12*2]
353 :			pmaddwd mm1, [%3 + 24*2]
354 :			pmaddwd mm3, [%3 + 28*2]
355 :			paddd mm4, mm6
356 :			paddd mm5, mm7
357 :			paddd mm0, mm2
358 :			paddd mm1, mm3
359 :			movq mm7, [fdct_r_row] ; rounding constant, added to all four accumulators below
360 :			paddd mm4, mm7
361 :			paddd mm5, mm7
362 :			paddd mm0, mm7
363 :			paddd mm1, mm7
364 :			psrad mm4, SHIFT_FRW_ROW ; scale the 32-bit sums back down
365 :			psrad mm5, SHIFT_FRW_ROW
366 :			psrad mm0, SHIFT_FRW_ROW
367 :			psrad mm1, SHIFT_FRW_ROW
368 :			packssdw mm4, mm0 ; pack the four sums built from the additions into signed words
369 :			packssdw mm5, mm1 ; pack the four sums built from the differences into signed words
370 :			movq mm2, mm4
371 :			punpcklwd mm4, mm5 ; interleave the two packed sets word by word ...
372 :			punpckhwd mm2, mm5 ; ... to form the eight output words in order
373 :			movq [%1 + 0*2], mm4 ; output words 0..3
374 :			movq [%1 + 4*2], mm2 ; output words 4..7
375 :			%endmacro
376 :
377 :			%macro MAKE_FDCT_FUNC 2 ; %1 = name of the function to emit, %2 = row macro to use (FDCT_ROW_MMX or FDCT_ROW_XMM)
378 :			ALIGN 16
379 :			cglobal %1
380 :			%1:
381 :			;; Move the destination/source address to the eax register (the transform is done in place)
382 :			mov eax, [esp + 4]
383 :
384 :			;; Process the columns (4 at a time)
385 :			FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3
386 :			FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7
387 :
388 :			%ifdef UNROLLED_LOOP
389 :			; Unrolled loop version: row i is at eax + 2*i*8 bytes and uses the table group at 2*32*i bytes
390 :			%assign i 0
391 :			%rep 8
392 :			;; Process the 'i'th row
393 :			%2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i
394 :			%assign i i+1
395 :			%endrep
396 :			%else
397 :			mov ecx, 8 ; ecx = rows left to process
398 :			mov edx, tab_frw_01234567 ; edx = coefficient group for the current row
399 :			ALIGN 8
400 :			.loop:
401 :			%2 eax, eax, edx
402 :			add eax, 2*8 ; next row: 8 words
403 :			add edx, 2*32 ; next coefficient group: 32 words
404 :			dec ecx
405 :			jne .loop
406 :			%endif
407 :
408 :			ret
409 :	Isibaar	1793	ENDFUNC
410 :	edgomez	1382	%endmacro
411 :
412 :			;=============================================================================
413 :			; Code
414 :			;=============================================================================
415 :
416 :			SECTION .text
417 :
418 :			;-----------------------------------------------------------------------------
419 :			; void fdct_mmx_ffmpeg(int16_t block[64]);
420 :			;-----------------------------------------------------------------------------
421 :
422 :			MAKE_FDCT_FUNC fdct_mmx_ffmpeg, FDCT_ROW_MMX ; in-place transform of block; row pass built from FDCT_ROW_MMX
423 :
424 :			;-----------------------------------------------------------------------------
425 :			; void fdct_xmm_ffmpeg(int16_t block[64]);
426 :			;-----------------------------------------------------------------------------
427 :
428 :	edgomez	1535	MAKE_FDCT_FUNC fdct_xmm_ffmpeg, FDCT_ROW_XMM ; same column pass; row pass built from FDCT_ROW_XMM (uses pshufw)
429 :	Isibaar	1790
430 :			%ifidn __OUTPUT_FORMAT__,elf ; taken only when the output format name is exactly "elf"
431 :			section ".note.GNU-stack" noalloc noexec nowrite progbits ; empty note section; presumably tells the linker this object needs no executable stack - confirm
432 :			%endif
433 :

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4