Annotation of /branches/dev-api-4/xvidcore/src/utils/x86_asm/mem_transfer_mmx.asm

Revision 499 - (view) (download)
Original Path: trunk/xvidcore/src/utils/x86_asm/mem_transfer_mmx.asm

1 :	Isibaar	3	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * mmx 8bit<->16bit transfers
5 :			; *
6 :			; * This program is an implementation of a part of one or more MPEG-4
7 :			; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
8 :			; * to use this software module in hardware or software products are
9 :			; * advised that its use may infringe existing patents or copyrights, and
10 :			; * any such use would be at such party's own risk. The original
11 :			; * developer of this software module and his/her company, and subsequent
12 :			; * editors and their companies, will have no liability for use of this
13 :			; * software or modifications or derivatives thereof.
14 :			; *
15 :			; * This program is free software; you can redistribute it and/or modify
16 :			; * it under the terms of the GNU General Public License as published by
17 :			; * the Free Software Foundation; either version 2 of the License, or
18 :			; * (at your option) any later version.
19 :			; *
20 :			; * This program is distributed in the hope that it will be useful,
21 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 :			; * GNU General Public License for more details.
24 :			; *
25 :			; * You should have received a copy of the GNU General Public License
26 :			; * along with this program; if not, write to the Free Software
27 :			; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28 :			; *
29 :			; *************************************************************************/
30 :
31 :			;/**************************************************************************
32 :			; *
33 :			; * History:
34 :			; *
35 :	Isibaar	226	; * 04.06.2002 speed enhancement (unroll+overlap). -Skal-
36 :			; * + added transfer_8to16sub2_mmx/xmm
37 :	Isibaar	3	; * 07.01.2002 merge functions from compensate_mmx; rename functions
38 :	suxen_drol	499	; * 07.11.2001 initial version; (c)2001 peter ross <pross@xvid.org>
39 :	Isibaar	3	; *
40 :			; *************************************************************************/
41 :
42 :
43 :			bits 32
44 :
45 :			%macro cglobal 1 ; declare %1 global; when PREFIX is defined, export it as _%1 and alias the plain name to it
46 :			%ifndef PREFIX
47 :			global %1 ; exported under its plain name
48 :			%else
49 :			global _%1 ; exported with a leading underscore
50 :			%define %1 _%1
51 :			%endif
52 :			%endmacro
53 :
54 :
55 :			section .text
56 :
57 :	Isibaar	226	cglobal transfer_8to16copy_mmx ; 8x8 bytes -> 8x8 words
58 :			cglobal transfer_16to8copy_mmx ; 8x8 words -> 8x8 bytes, clamped by packuswb
59 :			cglobal transfer_8to16sub_mmx ; dct = cur - ref, ref is also copied into cur
60 :			cglobal transfer_8to16sub2_mmx ; dct = cur - average(ref1, ref2), base MMX only
61 :			cglobal transfer_8to16sub2_xmm ; dct = cur - average(ref1, ref2), using pavgb
62 :			cglobal transfer_16to8add_mmx ; dst = clamp(dst + src)
63 :			cglobal transfer8x8_copy_mmx ; 8x8 byte block copy
64 :	Isibaar	3
65 :			;===========================================================================
66 :			;
67 :			; void transfer_8to16copy_mmx(int16_t * const dst,
68 :			; const uint8_t * const src,
69 :			; uint32_t stride);
70 :			;
71 :			;===========================================================================
72 :
73 :	Isibaar	226	%macro COPY_8_TO_16 1 ; widen two 8-byte rows at eax into the %1-th 32-byte slot at ecx; expects mm7 = 0, advances eax
74 :			movq mm0, [eax] ; first row: 8 source bytes
75 :			movq mm1, [eax+edx] ; second row, edx bytes further on
76 :			movq mm2, mm0 ; copies for the high-half unpacks
77 :			movq mm3, mm1
78 :			punpcklbw mm0, mm7 ; first row, bytes 0-3 interleaved with mm7 -> 4 words
79 :			movq [ecx+%1*32], mm0
80 :			punpcklbw mm1, mm7 ; second row, bytes 0-3 -> 4 words
81 :			movq [ecx+%1*32+16], mm1
82 :			punpckhbw mm2, mm7 ; first row, bytes 4-7 -> 4 words
83 :			punpckhbw mm3, mm7 ; second row, bytes 4-7 -> 4 words
84 :			lea eax,[eax+2*edx] ; source pointer += 2 rows
85 :			movq [ecx+%1*32+8], mm2
86 :			movq [ecx+%1*32+24], mm3
87 :			%endmacro
88 :
89 :	Isibaar	3	align 16
90 :	Isibaar	226	transfer_8to16copy_mmx:
91 :	Isibaar	3
92 :	Isibaar	226	mov ecx, [esp+ 4] ; Dst: 64 words are written contiguously (16 bytes per row)
93 :			mov eax, [esp+ 8] ; Src: 8 rows of 8 bytes are read
94 :			mov edx, [esp+12] ; Stride: byte distance between Src rows
95 :			pxor mm7,mm7 ; mm7 = 0 so the unpacks in COPY_8_TO_16 zero-extend
96 :	Isibaar	3
97 :	Isibaar	226	COPY_8_TO_16 0 ; rows 0-1
98 :			COPY_8_TO_16 1 ; rows 2-3
99 :			COPY_8_TO_16 2 ; rows 4-5
100 :			COPY_8_TO_16 3 ; rows 6-7
101 :			ret ; NOTE(review): no emms is issued here - presumably the caller resets MMX state; confirm
102 :	Isibaar	3
103 :			;===========================================================================
104 :			;
105 :			; void transfer_16to8copy_mmx(uint8_t * const dst,
106 :			; const int16_t * const src,
107 :			; uint32_t stride);
108 :			;
109 :			;===========================================================================
110 :
111 :	Isibaar	226	%macro COPY_16_TO_8 1 ; pack the 16 words of the %1-th 32-byte slot at eax into two 8-byte rows at ecx; ecx is not advanced here
112 :			movq mm0, [eax+%1*32] ; first row, words 0-3
113 :			movq mm1, [eax+%1*32+8] ; first row, words 4-7
114 :			packuswb mm0, mm1 ; signed words -> bytes with unsigned saturation (0..255)
115 :			movq [ecx], mm0
116 :			movq mm2, [eax+%1*32+16] ; second row, words 0-3
117 :			movq mm3, [eax+%1*32+24] ; second row, words 4-7
118 :			packuswb mm2, mm3
119 :			movq [ecx+edx], mm2 ; second row lands edx bytes after the first
120 :			%endmacro
121 :
122 :	Isibaar	3	align 16
123 :	Isibaar	226	transfer_16to8copy_mmx:
124 :	Isibaar	3
125 :	Isibaar	226	mov ecx, [esp+ 4] ; Dst: 8 rows of 8 bytes are written
126 :			mov eax, [esp+ 8] ; Src: 64 words, read contiguously
127 :			mov edx, [esp+12] ; Stride: byte distance between Dst rows
128 :	Isibaar	3
129 :	Isibaar	226	COPY_16_TO_8 0 ; rows 0-1
130 :			lea ecx,[ecx+2*edx] ; Dst += 2 rows (the macro leaves ecx untouched)
131 :			COPY_16_TO_8 1 ; rows 2-3
132 :			lea ecx,[ecx+2*edx]
133 :			COPY_16_TO_8 2 ; rows 4-5
134 :			lea ecx,[ecx+2*edx]
135 :			COPY_16_TO_8 3 ; rows 6-7
136 :			ret ; NOTE(review): no emms is issued here - presumably the caller resets MMX state; confirm
137 :	Isibaar	3
138 :			;===========================================================================
139 :			;
140 :			; void transfer_8to16sub_mmx(int16_t * const dct,
141 :			; uint8_t * const cur,
142 :			; const uint8_t * const ref,
143 :			; const uint32_t stride);
144 :			;
145 :			;===========================================================================
146 :			;/**************************************************************************
147 :			; *
148 :			; * History:
149 :			; *
150 :			; * 27.12.2001 renamed from 'compensate' to 'transfer_8to16sub'
151 :			; * 02.12.2001 loop unrolled, code runs 10% faster now (Isibaar)
152 :			; * 30.11.2001 16 pixels are processed per iteration (Isibaar)
153 :			; * 30.11.2001 .text missing
154 :	suxen_drol	499	; * 06.11.2001 initial version; (c)2001 peter ross <pross@xvid.org>
155 :	Isibaar	3	; *
156 :			; *************************************************************************/
157 :
158 :	Isibaar	226	%macro COPY_8_TO_16_SUB 1 ; two rows: slot %1 at ecx <- cur - ref as words, and ref is copied over cur; expects mm7 = 0, advances eax and ebx
159 :			movq mm0, [eax] ; cur, first row
160 :			movq mm2, [eax+edx] ; cur, second row
161 :			movq mm1, mm0 ; copies for the high-half unpacks
162 :			movq mm3, mm2
163 :	Isibaar	3
164 :	Isibaar	226	punpcklbw mm0, mm7 ; cur bytes 0-3 -> words (both rows)
165 :			punpcklbw mm2, mm7
166 :			movq mm4, [ebx] ; ref, first row
167 :			punpckhbw mm1, mm7 ; cur bytes 4-7 -> words (both rows)
168 :			punpckhbw mm3, mm7
169 :			movq mm5, [ebx+edx] ; ref, second row
170 :	Isibaar	3
171 :	Isibaar	226	movq mm6, mm4 ; copy of ref for its high half
172 :			movq [eax], mm4 ; overwrite cur with ref (cur was already loaded above)
173 :			movq [eax+edx], mm5
174 :			punpcklbw mm4, mm7 ; ref bytes -> words, first row
175 :			punpckhbw mm6, mm7
176 :			psubsw mm0, mm4 ; cur - ref, first row, words 0-3
177 :			psubsw mm1, mm6 ; cur - ref, first row, words 4-7
178 :			movq mm6, mm5
179 :			punpcklbw mm5, mm7 ; ref bytes -> words, second row
180 :			punpckhbw mm6, mm7
181 :			psubsw mm2, mm5 ; cur - ref, second row, words 0-3
182 :			lea eax,[eax+2*edx] ; cur += 2 rows
183 :			psubsw mm3, mm6 ; cur - ref, second row, words 4-7
184 :			lea ebx,[ebx+2*edx] ; ref += 2 rows
185 :	Isibaar	3
186 :	Isibaar	226	movq [ecx+%1*32+ 0], mm0 ; dst, first row
187 :			movq [ecx+%1*32+ 8], mm1
188 :			movq [ecx+%1*32+16], mm2 ; dst, second row
189 :			movq [ecx+%1*32+24], mm3
190 :			%endmacro
191 :	Isibaar	3
192 :	Isibaar	226	align 16
193 :			transfer_8to16sub_mmx:
194 :			mov ecx, [esp + 4] ; Dst: 64 words are written contiguously
195 :			mov eax, [esp + 8] ; Cur: read, then overwritten with Ref by the macro
196 :			push ebx ; ebx is used as the Ref pointer and restored before ret
197 :			mov ebx, [esp+4+12] ; Ref (the extra +4 skips the saved ebx)
198 :			mov edx, [esp+4+16] ; Stride: byte distance between rows of Cur and Ref
199 :			pxor mm7, mm7 ; mm7 = 0 for the byte->word unpacks
200 :	Isibaar	3
201 :	Isibaar	226	COPY_8_TO_16_SUB 0 ; rows 0-1
202 :			COPY_8_TO_16_SUB 1 ; rows 2-3
203 :			COPY_8_TO_16_SUB 2 ; rows 4-5
204 :			COPY_8_TO_16_SUB 3 ; rows 6-7
205 :	Isibaar	3
206 :	Isibaar	226	pop ebx
207 :			ret ; NOTE(review): no emms is issued here - presumably the caller resets MMX state; confirm
208 :	Isibaar	3
209 :	Isibaar	226	;===========================================================================
210 :			;
211 :			; void transfer_8to16sub2_mmx(int16_t * const dct,
212 :			; uint8_t * const cur,
213 :			; const uint8_t * ref1,
214 :			; const uint8_t * ref2,
215 :			; const uint32_t stride)
216 :			;
217 :			;===========================================================================
218 :	Isibaar	3
219 :	Isibaar	226	%macro COPY_8_TO_16_SUB2_MMX 1 ; two rows: slot %1 at ecx <- cur - (ref1+ref2+1)/2 as words; expects mm7 = 0, advances eax, ebx, esi
220 :			movq mm0, [eax] ; cur, first row
221 :			movq mm2, [eax+edx] ; cur, second row
222 :	Isibaar	3
223 :	Isibaar	226	; Rounded byte average without widening: (a+b+1)/2 == (a|b) - ((a^b)>>1).
224 :			; The earlier word-domain code shifted (a+b) right without adding the 1,
225 :			; so odd sums rounded down and disagreed with the pavgb-based xmm variant.
226 :			pcmpeqb mm3, mm3 ; every bit set
227 :			paddb mm3, mm3 ; 0xFF+0xFF wraps to 0xFE in each byte: mask for the per-byte shift
228 :			; mm4 <- (ref1+ref2+1) / 2
229 :			movq mm4, [ebx] ; ref1
230 :			movq mm1, [esi] ; ref2
231 :			movq mm6, mm4
232 :			por mm4, mm1 ; a|b
233 :			pxor mm6, mm1 ; a^b
234 :			pand mm6, mm3 ; clear bit 0 of each byte so the 64-bit shift cannot leak into the byte below
235 :			psrlq mm6, 1 ; (a^b)>>1 per byte
236 :			psubb mm4, mm6 ; packed rounded averages, first row
237 :	Isibaar	3
238 :	Isibaar	226	; mm5 <- (ref1+ref2+1) / 2
239 :			movq mm5, [ebx+edx] ; ref1
240 :			movq mm1, [esi+edx] ; ref2
241 :			movq mm6, mm5
242 :			por mm5, mm1 ; a|b
243 :			pxor mm6, mm1 ; a^b
244 :			pand mm6, mm3
245 :			lea esi,[esi+2*edx] ; ref2 += 2 rows
246 :			psrlq mm6, 1
247 :			psubb mm5, mm6 ; packed rounded averages, second row
248 :
249 :			; mm4/mm5 now hold the averaged reference rows as packed bytes, the same
250 :			; layout the old packuswb results had, so the subtraction below is as before.
251 :			; The mask in mm3 is no longer needed; mm1 and mm3 are reused for the
252 :			; high-half copies of cur.
253 :	Isibaar	3
254 :
255 :	Isibaar	226	movq mm1, mm0
256 :			movq mm3, mm2
257 :			punpcklbw mm0, mm7 ; cur bytes 0-3 -> words (both rows)
258 :			punpcklbw mm2, mm7
259 :			punpckhbw mm1, mm7 ; cur bytes 4-7 -> words (both rows)
260 :			punpckhbw mm3, mm7
261 :	Isibaar	3
262 :	Isibaar	226	movq mm6, mm4
263 :			punpcklbw mm4, mm7 ; averaged ref -> words, first row
264 :			punpckhbw mm6, mm7
265 :			psubsw mm0, mm4 ; cur - avg, first row, words 0-3
266 :			psubsw mm1, mm6 ; cur - avg, first row, words 4-7
267 :			movq mm6, mm5
268 :			punpcklbw mm5, mm7 ; averaged ref -> words, second row
269 :			punpckhbw mm6, mm7
270 :			psubsw mm2, mm5 ; cur - avg, second row, words 0-3
271 :			lea eax,[eax+2*edx] ; cur += 2 rows
272 :			psubsw mm3, mm6 ; cur - avg, second row, words 4-7
273 :			lea ebx,[ebx+2*edx] ; ref1 += 2 rows
274 :	Isibaar	3
275 :	Isibaar	226	movq [ecx+%1*32+ 0], mm0 ; dst, first row
276 :			movq [ecx+%1*32+ 8], mm1
277 :			movq [ecx+%1*32+16], mm2 ; dst, second row
278 :			movq [ecx+%1*32+24], mm3
279 :			%endmacro
280 :	Isibaar	3
281 :	Isibaar	226	align 16
282 :			transfer_8to16sub2_mmx:
283 :			mov ecx, [esp + 4] ; Dst: 64 words are written contiguously
284 :			mov eax, [esp + 8] ; Cur: only read by this routine
285 :			push ebx ; ebx and esi hold the two reference pointers; both are restored before ret
286 :			mov ebx, [esp+4+12] ; Ref1 (+4 skips the saved ebx)
287 :			push esi
288 :			mov esi, [esp+8+16] ; Ref2 (+8 skips the saved ebx and esi)
289 :			mov edx, [esp+8+20] ; Stride: byte distance between rows of Cur, Ref1 and Ref2
290 :			pxor mm7, mm7 ; mm7 = 0 for the byte->word unpacks
291 :	Isibaar	3
292 :	Isibaar	226	COPY_8_TO_16_SUB2_MMX 0 ; rows 0-1
293 :			COPY_8_TO_16_SUB2_MMX 1 ; rows 2-3
294 :			COPY_8_TO_16_SUB2_MMX 2 ; rows 4-5
295 :			COPY_8_TO_16_SUB2_MMX 3 ; rows 6-7
296 :	Isibaar	3
297 :	Isibaar	226	pop esi
298 :			pop ebx
299 :			ret ; NOTE(review): no emms is issued here - presumably the caller resets MMX state; confirm
300 :	Isibaar	3
301 :	edgomez	215	;===========================================================================
302 :			;
303 :			; void transfer_8to16sub2_xmm(int16_t * const dct,
304 :	Isibaar	226	; uint8_t * const cur,
305 :			; const uint8_t * ref1,
306 :			; const uint8_t * ref2,
307 :			; const uint32_t stride)
308 :	edgomez	215	;
309 :			;===========================================================================
310 :	Isibaar	3
311 :	Isibaar	226	%macro COPY_8_TO_16_SUB2_SSE 1 ; two rows: slot %1 at ecx <- cur - pavgb(ref1, ref2) as words; expects mm7 = 0, advances eax, ebx, esi
312 :			movq mm0, [eax] ; cur, first row
313 :			movq mm2, [eax+edx] ; cur, second row
314 :			movq mm1, mm0 ; copies for the high-half unpacks
315 :			movq mm3, mm2
316 :	edgomez	215
317 :	Isibaar	226	punpcklbw mm0, mm7 ; cur bytes 0-3 -> words (both rows)
318 :			punpcklbw mm2, mm7
319 :			movq mm4, [ebx] ; ref1, first row
320 :			pavgb mm4, [esi] ; ref2: mm4 = rounded-up byte average (a+b+1)>>1; pavgb is not part of base MMX
321 :			punpckhbw mm1, mm7 ; cur bytes 4-7 -> words (both rows)
322 :			punpckhbw mm3, mm7
323 :			movq mm5, [ebx+edx] ; ref1, second row
324 :			pavgb mm5, [esi+edx] ; ref2, second row
325 :	edgomez	215
326 :	Isibaar	226	movq mm6, mm4
327 :			punpcklbw mm4, mm7 ; averaged ref -> words, first row
328 :			punpckhbw mm6, mm7
329 :			psubsw mm0, mm4 ; cur - avg, first row, words 0-3
330 :			psubsw mm1, mm6 ; cur - avg, first row, words 4-7
331 :			lea esi,[esi+2*edx] ; ref2 += 2 rows
332 :			movq mm6, mm5
333 :			punpcklbw mm5, mm7 ; averaged ref -> words, second row
334 :			punpckhbw mm6, mm7
335 :			psubsw mm2, mm5 ; cur - avg, second row, words 0-3
336 :			lea eax,[eax+2*edx] ; cur += 2 rows
337 :			psubsw mm3, mm6 ; cur - avg, second row, words 4-7
338 :			lea ebx,[ebx+2*edx] ; ref1 += 2 rows
339 :	edgomez	215
340 :	Isibaar	226	movq [ecx+%1*32+ 0], mm0 ; dst, first row
341 :			movq [ecx+%1*32+ 8], mm1
342 :			movq [ecx+%1*32+16], mm2 ; dst, second row
343 :			movq [ecx+%1*32+24], mm3
344 :			%endmacro
345 :	edgomez	215
346 :	Isibaar	226	align 16
347 :			transfer_8to16sub2_xmm:
348 :			mov ecx, [esp + 4] ; Dst: 64 words are written contiguously
349 :			mov eax, [esp + 8] ; Cur: only read by this routine
350 :			push ebx ; ebx and esi hold the two reference pointers; both are restored before ret
351 :			mov ebx, [esp+4+12] ; Ref1 (+4 skips the saved ebx)
352 :			push esi
353 :			mov esi, [esp+8+16] ; Ref2 (+8 skips the saved ebx and esi)
354 :			mov edx, [esp+8+20] ; Stride: byte distance between rows of Cur, Ref1 and Ref2
355 :			pxor mm7, mm7 ; mm7 = 0 for the byte->word unpacks
356 :	edgomez	215
357 :	Isibaar	226	COPY_8_TO_16_SUB2_SSE 0 ; rows 0-1
358 :			COPY_8_TO_16_SUB2_SSE 1 ; rows 2-3
359 :			COPY_8_TO_16_SUB2_SSE 2 ; rows 4-5
360 :			COPY_8_TO_16_SUB2_SSE 3 ; rows 6-7
361 :	edgomez	215
362 :	Isibaar	226	pop esi
363 :			pop ebx
364 :			ret ; NOTE(review): no emms is issued here - presumably the caller resets MMX state; confirm
365 :	edgomez	215
366 :	Isibaar	3	;===========================================================================
367 :			;
368 :			; void transfer_16to8add_mmx(uint8_t * const dst,
369 :			; const int16_t * const src,
370 :			; uint32_t stride);
371 :			;
372 :			;===========================================================================
373 :
374 :	Isibaar	226	%macro COPY_16_TO_8_ADD 1 ; two rows at ecx += the 16 words of the %1-th 32-byte slot at eax, clamped to bytes; expects mm7 = 0, ecx is not advanced here
375 :			movq mm0, [ecx] ; dst, first row
376 :			movq mm2, [ecx+edx] ; dst, second row
377 :			movq mm1, mm0 ; copies for the high-half unpacks
378 :			movq mm3, mm2
379 :			punpcklbw mm0, mm7 ; dst bytes 0-3 -> words (both rows)
380 :			punpcklbw mm2, mm7
381 :			punpckhbw mm1, mm7 ; dst bytes 4-7 -> words (both rows)
382 :			punpckhbw mm3, mm7
383 :			paddsw mm0, [eax+%1*32+ 0] ; signed-saturating add of the src words
384 :			paddsw mm1, [eax+%1*32+ 8]
385 :			paddsw mm2, [eax+%1*32+16]
386 :			paddsw mm3, [eax+%1*32+24]
387 :			packuswb mm0, mm1 ; back to bytes with unsigned saturation (0..255)
388 :			movq [ecx], mm0
389 :			packuswb mm2, mm3
390 :			movq [ecx+edx], mm2
391 :			%endmacro
392 :	Isibaar	3
393 :
394 :	Isibaar	226	align 16
395 :			transfer_16to8add_mmx:
396 :			mov ecx, [esp+ 4] ; Dst: 8 rows of 8 bytes, read and rewritten in place
397 :			mov eax, [esp+ 8] ; Src: 64 words, read contiguously
398 :			mov edx, [esp+12] ; Stride: byte distance between Dst rows
399 :			pxor mm7, mm7 ; mm7 = 0 for the byte->word unpacks
400 :	Isibaar	3
401 :	Isibaar	226	COPY_16_TO_8_ADD 0 ; rows 0-1
402 :			lea ecx,[ecx+2*edx] ; Dst += 2 rows (the macro leaves ecx untouched)
403 :			COPY_16_TO_8_ADD 1 ; rows 2-3
404 :			lea ecx,[ecx+2*edx]
405 :			COPY_16_TO_8_ADD 2 ; rows 4-5
406 :			lea ecx,[ecx+2*edx]
407 :			COPY_16_TO_8_ADD 3 ; rows 6-7
408 :			ret ; NOTE(review): no emms is issued here - presumably the caller resets MMX state; confirm
409 :	Isibaar	3
410 :			;===========================================================================
411 :			;
412 :			; void transfer8x8_copy_mmx(uint8_t * const dst,
413 :			; const uint8_t * const src,
414 :			; const uint32_t stride);
415 :			;
416 :			;
417 :			;===========================================================================
418 :
419 :	Isibaar	226	%macro COPY_8_TO_8 0 ; copy two 8-byte rows from eax to ecx (row pitch edx); advances eax only
420 :			movq mm0, [eax] ; first source row
421 :			movq mm1, [eax+edx] ; second source row
422 :			movq [ecx], mm0
423 :			lea eax,[eax+2*edx] ; source += 2 rows; the caller advances ecx
424 :			movq [ecx+edx], mm1
425 :			%endmacro
426 :
427 :	Isibaar	3	align 16
428 :	Isibaar	226	transfer8x8_copy_mmx:
429 :			mov ecx, [esp+ 4] ; Dst: 8 rows of 8 bytes are written
430 :			mov eax, [esp+ 8] ; Src: 8 rows of 8 bytes are read
431 :			mov edx, [esp+12] ; Stride: the same byte pitch is used for Dst and Src rows
432 :	Isibaar	3
433 :	Isibaar	226	COPY_8_TO_8
434 :			lea ecx,[ecx+2*edx] ; rows 0-1 done; Dst += 2 rows (the macro advances only Src)
435 :			COPY_8_TO_8
436 :			lea ecx,[ecx+2*edx] ; rows 2-3 done
437 :			COPY_8_TO_8
438 :			lea ecx,[ecx+2*edx] ; rows 4-5 done
439 :			COPY_8_TO_8
440 :			ret ; NOTE(review): no emms is issued here - presumably the caller resets MMX state; confirm

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4