Annotation of /branches/dev-api-3/xvidcore/src/utils/x86_asm/mem_transfer_3dne.asm

Revision 730 - (view) (download)

1 :	Isibaar	730	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * mmx 8bit<->16bit transfers
5 :			; *
6 :			; * This program is an implementation of a part of one or more MPEG-4
7 :			; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
8 :			; * to use this software module in hardware or software products are
9 :			; * advised that its use may infringe existing patents or copyrights, and
10 :			; * any such use would be at such party's own risk. The original
11 :			; * developer of this software module and his/her company, and subsequent
12 :			; * editors and their companies, will have no liability for use of this
13 :			; * software or modifications or derivatives thereof.
14 :			; *
15 :			; * This program is free software; you can redistribute it and/or modify
16 :			; * it under the terms of the GNU General Public License as published by
17 :			; * the Free Software Foundation; either version 2 of the License, or
18 :			; * (at your option) any later version.
19 :			; *
20 :			; * This program is distributed in the hope that it will be useful,
21 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 :			; * GNU General Public License for more details.
24 :			; *
25 :			; * You should have received a copy of the GNU General Public License
26 :			; * along with this program; if not, write to the Free Software
27 :			; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28 :			; *
29 :			; *************************************************************************/
30 :
31 :			; these 3dne functions are compatible with iSSE, but are optimized specifically for
32 :			; K7 pipelines
33 :			;
34 :			;------------------------------------------------------------------------------
35 :			; 09.12.2002 Athlon optimizations contributed by Jaan Kalda
36 :			;------------------------------------------------------------------------------
37 :
38 :
39 :			bits 32 ; everything below is assembled as 32-bit code
40 :			%ifdef FORMAT_COFF
41 :			section .data data ; COFF build: same section, declared without the align=16 attribute
42 :			%else
43 :			section .data data align=16
44 :			%endif
45 :
46 :
47 :			align 8
48 :			mm_zero: ; 8 zero bytes, 8-byte aligned; NOTE(review): not referenced by any routine visible in this file
49 :			dd 0,0
50 :
51 :
52 :			%macro cglobal 1
53 :			%ifdef PREFIX
54 :			global _%1 ; PREFIX builds: export the symbol with a leading underscore ...
55 :			%define %1 _%1
56 :			%else
57 :			global %1 ; otherwise export the name exactly as written
58 :			%endif
59 :			%endmacro
60 :			%macro nop4 0
61 :			DB 08Dh,074h,026h,0 ; bytes encode lea esi,[esi+0] (SIB + disp8 form): one 4-byte instruction that changes nothing
62 :			%endmacro
63 :
64 :			section .text
65 :			; export every routine defined below; cglobal adds the '_' prefix when PREFIX is defined
66 :			cglobal transfer_8to16copy_3dne
67 :			cglobal transfer_16to8copy_3dne
68 :			cglobal transfer_8to16sub_3dne
69 :			cglobal transfer_8to16sub2_3dne
70 :			cglobal transfer_16to8add_3dne
71 :			cglobal transfer8x8_copy_3dne
72 :
73 :			;===========================================================================
74 :			; Widen an 8x8 block of unsigned bytes into 64 zero-extended 16-bit words.
75 :			; void transfer_8to16copy_3dne(int16_t * const dst,
76 :			; const uint8_t * const src,
77 :			; uint32_t stride);
78 :			; Reads 8 rows of 8 bytes at src + row*stride and writes 128 contiguous bytes at dst. Arguments are read from the stack; uses eax, ecx, edx, mm0-mm7; no emms is issued here.
79 :			;===========================================================================
80 :			; Row 1 is widened with a zero register; every other row uses punpcklbw from memory followed by psrlw 8.
81 :			align 16
82 :			transfer_8to16copy_3dne:
83 :
84 :			mov eax, [esp+ 8] ; Src
85 :			mov edx, [esp+12] ; Stride
86 :			mov ecx, [esp+ 4] ; Dst
87 :			punpcklbw mm0, [byte eax] ; row 0 px 0-3 land in the high byte of each word (low bytes hold stale mm0 data); 'byte' forces a zero disp8 encoding
88 :			punpcklbw mm1, [eax+4] ; row 0 px 4-7, same layout
89 :			movq mm2,[eax+edx] ; row 1, all 8 pixels
90 :			movq mm3,[eax+edx] ; row 1 again (for the high half)
91 :			pxor mm7,mm7 ; mm7 = 0, needed only for the row 1 unpack
92 :			lea eax,[eax+2*edx] ; eax -> row 2
93 :			punpcklbw mm2,mm7 ; row 1 px 0-3 -> zero-extended words
94 :			punpckhbw mm3,mm7 ; row 1 px 4-7 -> zero-extended words
95 :			psrlw mm0,8 ; move each pixel to the low byte; the stale byte is shifted out and the high byte becomes 0
96 :			psrlw mm1,8
97 :			punpcklbw mm4, [eax] ; row 2 px 0-3
98 :			punpcklbw mm5, [eax+edx+4] ; row 3 px 4-7
99 :			movq [byte ecx+0*64], mm0 ; dst row 0, words 0-3
100 :			movq [ecx+0*64+8], mm1 ; dst row 0, words 4-7
101 :			punpcklbw mm6, [eax+edx] ; row 3 px 0-3
102 :			punpcklbw mm7, [eax+4] ; row 2 px 4-7 (mm7 is no longer zero from here on)
103 :			lea eax,[byte eax+2*edx] ; eax -> row 4
104 :			psrlw mm4,8
105 :			psrlw mm5,8
106 :			punpcklbw mm0, [eax] ; row 4 px 0-3
107 :			punpcklbw mm1, [eax+edx+4] ; row 5 px 4-7
108 :			movq [ecx+0*64+16], mm2 ; dst row 1, words 0-3
109 :			movq [ecx+0*64+24], mm3 ; dst row 1, words 4-7
110 :			psrlw mm6,8
111 :			psrlw mm7,8
112 :			punpcklbw mm2, [eax+edx] ; row 5 px 0-3
113 :			punpcklbw mm3, [eax+4] ; row 4 px 4-7
114 :			lea eax,[byte eax+2*edx] ; eax -> row 6
115 :			movq [byte ecx+0*64+32], mm4 ; dst row 2, words 0-3
116 :			movq [ecx+0*64+56], mm5 ; dst row 3, words 4-7
117 :			psrlw mm0,8
118 :			psrlw mm1,8
119 :			punpcklbw mm4, [eax] ; row 6 px 0-3
120 :			punpcklbw mm5, [eax+edx+4] ; row 7 px 4-7
121 :			movq [byte ecx+0*64+48], mm6 ; dst row 3, words 0-3
122 :			movq [ecx+0*64+40], mm7 ; dst row 2, words 4-7
123 :			psrlw mm2,8
124 :			psrlw mm3,8
125 :			punpcklbw mm6, [eax+edx] ; row 7 px 0-3
126 :			punpcklbw mm7, [eax+4] ; row 6 px 4-7
127 :			movq [byte ecx+1*64], mm0 ; dst row 4, words 0-3
128 :			movq [ecx+1*64+24], mm1 ; dst row 5, words 4-7
129 :			psrlw mm4,8
130 :			psrlw mm5,8
131 :			movq [ecx+1*64+16], mm2 ; dst row 5, words 0-3
132 :			movq [ecx+1*64+8], mm3 ; dst row 4, words 4-7
133 :			psrlw mm6,8
134 :			psrlw mm7,8
135 :			movq [byte ecx+1*64+32], mm4 ; dst row 6, words 0-3
136 :			movq [ecx+1*64+56], mm5 ; dst row 7, words 4-7
137 :			movq [byte ecx+1*64+48], mm6 ; dst row 7, words 0-3
138 :			movq [ecx+1*64+40], mm7 ; dst row 6, words 4-7
139 :			ret
140 :
141 :
142 :
143 :			;===========================================================================
144 :			; Narrow 64 contiguous signed 16-bit words into an 8x8 block of bytes, saturating each value to 0..255 (packuswb).
145 :			; void transfer_16to8copy_3dne(uint8_t * const dst,
146 :			; const int16_t * const src,
147 :			; uint32_t stride);
148 :			; Reads 128 contiguous bytes at src; writes 8 rows of 8 bytes at dst + row*stride. Uses eax, ecx, edx, mm0-mm7; no emms is issued here.
149 :			;===========================================================================
150 :			; All eight rows are packed into mm0-mm7 first, then stored.
151 :			align 16
152 :			transfer_16to8copy_3dne:
153 :
154 :			mov eax, [esp+ 8] ; Src
155 :			mov ecx, [esp+ 4] ; Dst
156 :			mov edx, [esp+12] ; Stride
157 :
158 :			movq mm0, [byte eax+0*32] ; src row 0, words 0-3
159 :			packuswb mm0,[eax+0*32+8] ; mm0 = row 0 as 8 saturated bytes
160 :			movq mm1, [eax+0*32+16] ; src row 1, words 0-3
161 :			packuswb mm1,[eax+0*32+24] ; mm1 = row 1
162 :			movq mm5, [eax+2*32+16] ; src row 5, words 0-3 (packed further down)
163 :			movq mm2, [eax+1*32] ; src row 2, words 0-3
164 :			packuswb mm2, [eax+1*32+8] ; mm2 = row 2
165 :			movq mm3, [eax+1*32+16] ; src row 3, words 0-3
166 :			packuswb mm3, [eax+1*32+24] ; mm3 = row 3
167 :			movq mm6, [eax+3*32] ; src row 6, words 0-3 (packed further down)
168 :			movq mm4, [eax+2*32] ; src row 4, words 0-3
169 :			packuswb mm4, [eax+2*32+8] ; mm4 = row 4
170 :			packuswb mm5, [eax+2*32+24] ; mm5 = row 5
171 :			movq mm7, [eax+3*32+16] ; src row 7, words 0-3
172 :			packuswb mm7, [eax+3*32+24] ; mm7 = row 7
173 :			packuswb mm6, [eax+3*32+8] ; mm6 = row 6
174 :			movq [ecx], mm0 ; dst row 0
175 :			lea eax,[3*edx] ; src pointer is no longer needed: eax = 3*stride
176 :			add eax,ecx ; eax = dst + 3*stride
177 :			movq [ecx+edx], mm1 ; dst row 1
178 :			movq [ecx+2*edx], mm2 ; dst row 2
179 :			movq [byte eax], mm3 ; dst row 3
180 :			movq [ecx+4*edx], mm4 ; dst row 4
181 :			lea ecx,[byte ecx+4*edx] ; ecx = dst + 4*stride
182 :			movq [eax+2*edx], mm5 ; dst row 5 (3+2)
183 :			movq [eax+4*edx], mm7 ; dst row 7 (3+4)
184 :			movq [ecx+2*edx], mm6 ; dst row 6 (4+2)
185 :			ret
186 :
187 :			;===========================================================================
188 :			; For an 8x8 block: dct = cur - ref as 16-bit words, and cur is overwritten with ref.
189 :			; void transfer_8to16sub_3dne(int16_t * const dct,
190 :			; uint8_t * const cur,
191 :			; const uint8_t * const ref,
192 :			; const uint32_t stride);
193 :			; cur and ref are walked with the same stride; dct receives 128 contiguous bytes. Uses eax, ecx, edx, mm0-mm7; edi is saved and restored; no emms is issued here.
194 :			;===========================================================================
195 :			;/**************************************************************************
196 :			; *
197 :			; * History:
198 :			; *
199 :			; * 27.12.2001 renamed from 'compensate' to 'transfer_8to16sub'
200 :			; * 02.12.2001 loop unrolled, code runs 10% faster now (Isibaar)
201 :			; * 30.11.2001 16 pixels are processed per iteration (Isibaar)
202 :			; * 30.11.2001 .text missing
203 :			; * 06.11.2001 initial version; (c)2001 peter ross <pross@cs.rmit.edu.au>
204 :			; *
205 :			; *************************************************************************/
206 :			; COPY_8_TO_16_SUB n: handles two rows (A, B). Expects eax=cur, ecx=ref, edx=stride, edi=dct, mm7=0, and the caller's edi on top of the stack. n<3 advances eax/ecx by two rows; n==3 pops the saved edi value into ecx instead.
207 :			%macro COPY_8_TO_16_SUB 1
208 :			movq mm1, [eax] ; cur row A
209 :			movq mm0, mm1 ; second copy of cur row A (for the low half)
210 :			movq mm4, [ecx] ; ref row A
211 :			movq mm6, mm4 ; second copy of ref row A (for the high half)
212 :			movq [eax], mm4 ; overwrite cur row A with ref row A
213 :			punpckhbw mm1, mm7 ; cur A px 4-7 -> words
214 :			punpckhbw mm6, mm7 ; ref A px 4-7 -> words
215 :			punpcklbw mm4, mm7 ; ref A px 0-3 -> words
216 :			align 8
217 :			movq mm2, [byte eax+edx] ; cur row B (the align 8 above pads the code so this instruction starts on an 8-byte boundary)
218 :			punpcklbw mm0, mm7 ; cur A px 0-3 -> words
219 :			movq mm3, [byte eax+edx] ; cur row B again (for the high half)
220 :			punpcklbw mm2, mm7 ; cur B px 0-3 -> words
221 :			movq mm5, [byte ecx+edx] ; ref row B
222 :			punpckhbw mm3, mm7 ; cur B px 4-7 -> words
223 :			movq [byte eax+edx], mm5 ; overwrite cur row B with ref row B
224 :			psubsw mm1, mm6 ; row A px 4-7: cur - ref
225 :
226 :			movq mm6, mm5 ; second copy of ref row B (mm6 is free again)
227 :			psubsw mm0, mm4 ; row A px 0-3: cur - ref
228 :			%if (%1 < 3)
229 :			lea eax,[eax+2*edx] ; cur -> next row pair
230 :			lea ecx,[ecx+2*edx] ; ref -> next row pair
231 :			%else
232 :			mov ecx,[esp] ; last pair: ref pointer is dead, so fetch the edi value the caller pushed
233 :			add esp,byte 4 ; release that stack slot (esp is back at the return address)
234 :			%endif
235 :			movq [edi+%1*32+ 8], mm1 ; dct row A, words 4-7
236 :			movq [byte edi+%1*32+ 0], mm0 ; dct row A, words 0-3
237 :			punpcklbw mm5, mm7 ; ref B px 0-3 -> words
238 :			punpckhbw mm6, mm7 ; ref B px 4-7 -> words
239 :			psubsw mm2, mm5 ; row B px 0-3: cur - ref
240 :			psubsw mm3, mm6 ; row B px 4-7: cur - ref
241 :			movq [edi+%1*32+16], mm2 ; dct row B, words 0-3
242 :			movq [edi+%1*32+24], mm3 ; dct row B, words 4-7
243 :			%endmacro
244 :
245 :			align 16
246 :			transfer_8to16sub_3dne:
247 :			mov eax, [esp + 8] ; Cur
248 :			mov ecx, [esp +12] ; Ref
249 :			push edi ; save edi; it is restored through ecx at the end
250 :			mov edx, [dword esp+4+16] ; Stride (+4 skips the pushed edi; 'dword' forces a 32-bit displacement, i.e. a longer encoding)
251 :			mov edi, [esp+4+ 4] ; Dst
252 :			pxor mm7, mm7 ; mm7 = 0 for all byte->word unpacks
253 :			nop ; one-byte filler
254 :			align 4
255 :			COPY_8_TO_16_SUB 0 ; rows 0-1
256 :			COPY_8_TO_16_SUB 1 ; rows 2-3
257 :			COPY_8_TO_16_SUB 2 ; rows 4-5
258 :			COPY_8_TO_16_SUB 3 ; rows 6-7; also pops the saved edi value into ecx
259 :			mov edi,ecx ; restore the caller's edi
260 :			ret
261 :
262 :
263 :			;===========================================================================
264 :			; For an 8x8 block: dct = cur - pavgb(ref1, ref2) as 16-bit words. cur is only read here.
265 :			; void transfer_8to16sub2_3dne(int16_t * const dct,
266 :			; uint8_t * const cur,
267 :			; const uint8_t * ref1,
268 :			; const uint8_t * ref2,
269 :			; const uint32_t stride)
270 :			; cur, ref1 and ref2 share one stride; dct receives 128 contiguous bytes. Uses eax, ecx, edx, mm0-mm7; ebx and esi are saved and restored; no emms is issued here.
271 :			;===========================================================================
272 :			; COPY_8_TO_16_SUB2_SSE n: handles two rows (A, B). Expects eax=cur, ebx=ref1, esi=ref2, ecx=dct, edx=stride, mm7=0. n<3 advances the three source pointers by two rows; n==3 reloads esi/ebx from the stack instead.
273 :			%macro COPY_8_TO_16_SUB2_SSE 1
274 :			db 0Fh, 6Fh, 44h, 20h, 00 ;movq mm0, [byte eax] ; cur row A (hand-encoded 5-byte form using a SIB byte)
275 :			punpcklbw mm0, mm7 ; cur A px 0-3 -> words
276 :			movq mm2, [byte eax+edx] ; cur row B
277 :			punpcklbw mm2, mm7 ; cur B px 0-3 -> words
278 :			db 0Fh, 6Fh, 4ch, 20h, 00 ;movq mm1, [byte eax] ; cur row A again (same 5-byte encoding)
279 :			punpckhbw mm1, mm7 ; cur A px 4-7 -> words
280 :			movq mm3, [byte eax+edx] ; cur row B again
281 :			punpckhbw mm3, mm7 ; cur B px 4-7 -> words
282 :
283 :			movq mm4, [byte ebx] ; ref1 row A
284 :			pavgb mm4, [byte esi] ; mm4 = per-byte rounded average of ref1 and ref2, row A
285 :			movq mm5, [ebx+edx] ; ref1 row B
286 :			pavgb mm5, [esi+edx] ; mm5 = per-byte rounded average of ref1 and ref2, row B
287 :			movq mm6, mm4 ; second copy of averaged row A (for the high half)
288 :			punpcklbw mm4, mm7 ; avg A px 0-3 -> words
289 :			punpckhbw mm6, mm7 ; avg A px 4-7 -> words
290 :			%if (%1 < 3)
291 :			lea esi,[esi+2*edx] ; ref2 -> next row pair
292 :			lea ebx,[byte ebx+2*edx] ; ref1 -> next row pair
293 :			lea eax,[eax+2*edx] ; cur -> next row pair
294 :			%else
295 :			mov esi,[esp] ; last pair: restore esi (pushed last by the caller routine)
296 :			mov ebx,[esp+4] ; restore ebx (pushed first)
297 :			add esp,byte 8 ; release both slots (esp is back at the return address)
298 :			%endif
299 :			psubsw mm0, mm4 ; row A px 0-3: cur - avg
300 :			psubsw mm1, mm6 ; row A px 4-7: cur - avg
301 :			movq mm6, mm5 ; second copy of averaged row B (mm6 is free again)
302 :			punpcklbw mm5, mm7 ; avg B px 0-3 -> words
303 :			punpckhbw mm6, mm7 ; avg B px 4-7 -> words
304 :			psubsw mm2, mm5 ; row B px 0-3: cur - avg
305 :			psubsw mm3, mm6 ; row B px 4-7: cur - avg
306 :			movq [byte ecx+%1*32+ 0], mm0 ; dct row A, words 0-3
307 :			movq [ecx+%1*32+ 8], mm1 ; dct row A, words 4-7
308 :			movq [ecx+%1*32+16], mm2 ; dct row B, words 0-3
309 :			movq [ecx+%1*32+24], mm3 ; dct row B, words 4-7
310 :			%endmacro
311 :
312 :			align 16
313 :			transfer_8to16sub2_3dne:
314 :			mov edx, [esp +20] ; Stride
315 :			mov ecx, [esp + 4] ; Dst
316 :			mov eax, [esp + 8] ; Cur
317 :			push ebx ; save ebx; restored by the final macro expansion
318 :			lea ebp,[byte ebp] ; ebp = ebp + 0: changes nothing (filler; presumably code-alignment padding for the K7)
319 :			mov ebx, [esp+4+12] ; Ref1 (+4 skips the pushed ebx)
320 :			push esi ; save esi; restored by the final macro expansion
321 :			pxor mm7, mm7 ; mm7 = 0 for all byte->word unpacks
322 :			mov esi, [esp+8+16] ; Ref2 (+8 skips the pushed ebx and esi)
323 :			nop4 ; 4-byte filler instruction
324 :			COPY_8_TO_16_SUB2_SSE 0 ; rows 0-1
325 :			COPY_8_TO_16_SUB2_SSE 1 ; rows 2-3
326 :			COPY_8_TO_16_SUB2_SSE 2 ; rows 4-5
327 :			COPY_8_TO_16_SUB2_SSE 3 ; rows 6-7; also restores esi/ebx and releases their stack slots
328 :
329 :			ret
330 :
331 :
332 :			;===========================================================================
333 :			; For an 8x8 block: dst = saturate_to_byte(dst + src), where src holds 64 contiguous signed 16-bit words.
334 :			; void transfer_16to8add_3dne(uint8_t * const dst,
335 :			; const int16_t * const src,
336 :			; uint32_t stride);
337 :			; dst rows are stride bytes apart and are updated in place. Uses eax, ecx, edx, mm0-mm3, mm7; no emms is issued here.
338 :			;===========================================================================
339 :			; COPY_16_TO_8_ADD n: handles two rows (A, B). Expects ecx=dst row A, eax=src, edx=stride, mm7=0. It does not advance ecx; the routine below does that between expansions.
340 :			%macro COPY_16_TO_8_ADD 1
341 :			db 0Fh, 6Fh, 44h, 21h, 00 ;movq mm0, [byte ecx] ; dst row A (hand-encoded 5-byte form using a SIB byte)
342 :			punpcklbw mm0, mm7 ; dst A px 0-3 -> words
343 :			movq mm2, [byte ecx+edx] ; dst row B
344 :			punpcklbw mm2, mm7 ; dst B px 0-3 -> words
345 :			db 0Fh, 6Fh, 4ch, 21h, 00 ;movq mm1, [byte ecx] ; dst row A again (same 5-byte encoding)
346 :			punpckhbw mm1, mm7 ; dst A px 4-7 -> words
347 :			movq mm3, [byte ecx+edx] ; dst row B again
348 :			punpckhbw mm3, mm7 ; dst B px 4-7 -> words
349 :			paddsw mm0, [byte eax+%1*32+ 0] ; row A px 0-3 += src (signed saturating word add)
350 :			paddsw mm1, [eax+%1*32+ 8] ; row A px 4-7 += src
351 :			paddsw mm2, [eax+%1*32+16] ; row B px 0-3 += src
352 :			paddsw mm3, [eax+%1*32+24] ; row B px 4-7 += src
353 :			packuswb mm0, mm1 ; row A back to 8 bytes, saturated to 0..255
354 :			packuswb mm2, mm3 ; row B back to 8 bytes, saturated to 0..255
355 :			mov esp,esp ; register moved onto itself: changes nothing (filler; presumably scheduling/alignment padding for the K7)
356 :			movq [byte ecx], mm0 ; store dst row A
357 :			movq [ecx+edx], mm2 ; store dst row B
358 :			%endmacro
359 :
360 :
361 :			align 16
362 :			transfer_16to8add_3dne:
363 :			mov ecx, [esp+ 4] ; Dst
364 :			mov edx, [esp+12] ; Stride
365 :			mov eax, [esp+ 8] ; Src
366 :			pxor mm7, mm7 ; mm7 = 0 for all byte->word unpacks
367 :			nop ; one-byte filler
368 :
369 :			COPY_16_TO_8_ADD 0 ; rows 0-1
370 :			lea ecx,[byte ecx+2*edx] ; dst -> rows 2-3
371 :			COPY_16_TO_8_ADD 1
372 :			lea ecx,[byte ecx+2*edx] ; dst -> rows 4-5
373 :			COPY_16_TO_8_ADD 2
374 :			lea ecx,[byte ecx+2*edx] ; dst -> rows 6-7
375 :			COPY_16_TO_8_ADD 3
376 :			ret
377 :
378 :			;===========================================================================
379 :			; Copy an 8x8 block of bytes from src to dst; both are walked with the same stride.
380 :			; void transfer8x8_copy_3dne(uint8_t * const dst,
381 :			; const uint8_t * const src,
382 :			; const uint32_t stride);
383 :			; Uses eax, ecx, edx, mm0, mm1; no emms is issued here.
384 :			;
385 :			;===========================================================================
386 :			; COPY_8_TO_8: copies two rows. Expects eax=src, ecx=dst, edx=stride. It advances eax by two rows; ecx is advanced by the routine below.
387 :			%macro COPY_8_TO_8 0
388 :			movq mm0, [byte eax] ; src row A
389 :			movq mm1, [eax+edx] ; src row B
390 :			movq [byte ecx], mm0 ; dst row A
391 :			lea eax,[byte eax+2*edx] ; src -> next row pair
392 :			movq [ecx+edx], mm1 ; dst row B
393 :			%endmacro
394 :
395 :			align 16
396 :			transfer8x8_copy_3dne:
397 :			mov eax, [esp+ 8] ; Src
398 :			mov edx, [esp+12] ; Stride
399 :			mov ecx, [esp+ 4] ; Dst
400 :
401 :			COPY_8_TO_8 ; rows 0-1
402 :			lea ecx,[byte ecx+2*edx] ; dst -> rows 2-3
403 :			COPY_8_TO_8
404 :			lea ecx,[byte ecx+2*edx] ; dst -> rows 4-5
405 :			COPY_8_TO_8
406 :			lea ecx,[byte ecx+2*edx] ; dst -> rows 6-7
407 :			COPY_8_TO_8
408 :			ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4