;/*****************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * xmm 8x8 block-based halfpel interpolation
; *
; * Copyright(C) 2002 Michael Militzer <michael@xvid.org>
; * Copyright(C) 2002 -Skal-
; *
; * This file is part of XviD, a free MPEG-4 video encoder/decoder
; *
; * XviD is free software; you can redistribute it and/or modify it
; * under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * Under section 8 of the GNU General Public License, the copyright
; * holders of XVID explicitly forbid distribution in the following
; * countries:
; *
; * - Japan
; * - United States of America
; *
; * Linking XviD statically or dynamically with other modules is making a
; * combined work based on XviD. Thus, the terms and conditions of the
; * GNU General Public License cover the whole combination.
; *
; * As a special exception, the copyright holders of XviD give you
; * permission to link XviD with independent modules that communicate with
; * XviD solely through the VFW1.1 and DShow interfaces, regardless of the
; * license terms of these independent modules, and to copy and distribute
; * the resulting combined work under terms of your choice, provided that
; * every copy of the combined work is accompanied by a complete copy of
; * the source code of XviD (the version of XviD used to produce the
; * combined work), being distributed under the terms of the GNU General
; * Public License plus this exception. An independent module is a module
; * which is not derived from or based on XviD.
; *
; * Note that people who make modified versions of XviD are not obligated
; * to grant this special exception for their modified versions; it is
; * their choice whether to do so. The GNU General Public License gives
; * permission to release a modified version without this exception; this
; * exception also makes it possible to release a modified version which
; * carries forward this exception.
; *
; * $Id: interpolate8x8_xmm.asm,v 1.3 2002-11-17 00:20:30 edgomez Exp $
; *
; ****************************************************************************/

bits 32

%macro cglobal 1
  %ifdef PREFIX
    global _%1
    %define %1 _%1
  %else
    global %1
  %endif
%endmacro
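; cglobal exports a symbol and, when PREFIX is defined, prepends an
; underscore (for object formats whose C symbols are underscore-prefixed),
; so the same source links against either naming convention.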

section .data


align 16

mmx_one
times 8 db 1

section .text

cglobal interpolate8x8_halfpel_h_xmm
cglobal interpolate8x8_halfpel_v_xmm
cglobal interpolate8x8_halfpel_hv_xmm

;===========================================================================
;
; void interpolate8x8_halfpel_h_xmm(uint8_t * const dst,
;                                   const uint8_t * const src,
;                                   const uint32_t stride,
;                                   const uint32_t rounding);
;
;===========================================================================
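; In C terms this computes, for every pixel of the 8x8 block,
;   dst[y][x] = (src[y][x] + src[y][x+1] + 1 - rounding) >> 1
; with rounding being either 0 or 1.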

%macro COPY_H_SSE_RND0 0
  movq mm0, [eax]
  pavgb mm0, [eax+1]
  movq mm1, [eax+edx]
  pavgb mm1, [eax+edx+1]
  lea eax, [eax+2*edx]
  movq [ecx], mm0
  movq [ecx+edx], mm1
%endmacro

%macro COPY_H_SSE_RND1 0
  movq mm0, [eax]
  movq mm1, [eax+edx]
  movq mm4, mm0
  movq mm5, mm1
  movq mm2, [eax+1]
  movq mm3, [eax+edx+1]
  pavgb mm0, mm2
  pxor mm2, mm4
  pavgb mm1, mm3
  lea eax, [eax+2*edx]
  pxor mm3, mm5
  pand mm2, mm7
  pand mm3, mm7
  psubb mm0, mm2
  movq [ecx], mm0
  psubb mm1, mm3
  movq [ecx+edx], mm1
%endmacro

align 16
interpolate8x8_halfpel_h_xmm:

  mov eax, [esp+16] ; rounding
  mov ecx, [esp+ 4] ; Dst
  test eax, eax
  mov eax, [esp+ 8] ; Src
  mov edx, [esp+12] ; stride

  jnz near .rounding1

  COPY_H_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND0
  ret

.rounding1
; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
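; (e.g. i=3, j=4: pavgb yields (3+4+1)>>1 = 4, (3^4)&1 = 1, and 4-1 = 3 = (3+4)>>1)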
  movq mm7, [mmx_one]
  COPY_H_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND1
  ret

;===========================================================================
;
; void interpolate8x8_halfpel_v_xmm(uint8_t * const dst,
;                                   const uint8_t * const src,
;                                   const uint32_t stride,
;                                   const uint32_t rounding);
;
;===========================================================================
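; In C terms this computes, for every pixel of the 8x8 block,
;   dst[y][x] = (src[y][x] + src[y+1][x] + 1 - rounding) >> 1
; with rounding being either 0 or 1.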

%macro COPY_V_SSE_RND0 0
  movq mm0, [eax]
  movq mm1, [eax+edx]
  pavgb mm0, mm1
  pavgb mm1, [eax+2*edx]
  lea eax, [eax+2*edx]
  movq [ecx], mm0
  movq [ecx+edx], mm1
%endmacro

%macro COPY_V_SSE_RND1 0
  movq mm0, mm2
  movq mm1, [eax]
  movq mm2, [eax+edx]
  lea eax, [eax+2*edx]
  movq mm4, mm0
  movq mm5, mm1
  pavgb mm0, mm1
  pxor mm4, mm1
  pavgb mm1, mm2
  pxor mm5, mm2
  pand mm4, mm7  ; lsb's of (i^j)...
  pand mm5, mm7  ; lsb's of (i^j)...
  psubb mm0, mm4 ; ...are subtracted from the result of pavgb
  movq [ecx], mm0
  psubb mm1, mm5 ; ...are subtracted from the result of pavgb
  movq [ecx+edx], mm1
%endmacro

align 16
interpolate8x8_halfpel_v_xmm:

  mov eax, [esp+16] ; rounding
  mov ecx, [esp+ 4] ; Dst
  test eax, eax
  mov eax, [esp+ 8] ; Src
  mov edx, [esp+12] ; stride

; we process 2 lines at a time

  jnz near .rounding1

  COPY_V_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND0
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND0
  ret

.rounding1
; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
  movq mm7, [mmx_one]
  movq mm2, [eax] ; loop invariant
  add eax, edx

  COPY_V_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND1
  lea ecx, [ecx+2*edx]
  COPY_V_SSE_RND1
  ret

;===========================================================================
;
; void interpolate8x8_halfpel_hv_xmm(uint8_t * const dst,
;                                    const uint8_t * const src,
;                                    const uint32_t stride,
;                                    const uint32_t rounding);
;
;
;===========================================================================
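; In C terms this computes, for every pixel of the 8x8 block,
;   dst[y][x] = (src[y][x]   + src[y][x+1]   +
;                src[y+1][x] + src[y+1][x+1] + 2 - rounding) >> 2
; with rounding being either 0 or 1.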

; The trick is to correct the result of 'pavgb' with some combination of the
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
; The boolean relations are:
;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
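; Only the lsb of each correction term matters (it is masked with mmx_one
; below). Quick check with i=1, j=2, k=3, l=4: s=2, t=4, st=6, ij=3, kl=7,
; (s+t+1)/2 = 3, so
;   (1+2+3+4+2)/4 = 3 = 3 - (((ij|kl)&st)&1)   [rounding=0, COPY_HV_SSE_RND0]
;   (1+2+3+4+1)/4 = 2 = 3 - (((ij&kl)|st)&1)   [rounding=1, COPY_HV_SSE_RND1]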

; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).

%macro COPY_HV_SSE_RND0 0
  lea eax, [eax+edx]

  movq mm0, [eax]
  movq mm1, [eax+1]

  movq mm6, mm0
  pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
  lea eax, [eax+edx]
  pxor mm1, mm6  ; mm1=(j^k).     preserved for next step

  por mm3, mm1   ; ij |= jk
  movq mm6, mm2
  pxor mm6, mm0  ; mm6 = s^t
  pand mm3, mm6  ; (ij|jk) &= st
  pavgb mm2, mm0 ; mm2 = (s+t+1)/2
  pand mm3, mm7  ; mask lsb
  psubb mm2, mm3 ; apply.

  movq [ecx], mm2

  movq mm2, [eax]
  movq mm3, [eax+1]
  movq mm6, mm2
  pavgb mm2, mm3 ; preserved for next iteration
  lea ecx, [ecx+edx]
  pxor mm3, mm6  ; preserved for next iteration

  por mm1, mm3
  movq mm6, mm0
  pxor mm6, mm2
  pand mm1, mm6
  pavgb mm0, mm2

  pand mm1, mm7
  psubb mm0, mm1

  movq [ecx], mm0
%endmacro

%macro COPY_HV_SSE_RND1 0
  lea eax, [eax+edx]

  movq mm0, [eax]
  movq mm1, [eax+1]

  movq mm6, mm0
  pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
  lea eax, [eax+edx]
  pxor mm1, mm6  ; mm1=(j^k).     preserved for next step

  pand mm3, mm1
  movq mm6, mm2
  pxor mm6, mm0
  por mm3, mm6
  pavgb mm2, mm0
  pand mm3, mm7
  psubb mm2, mm3

  movq [ecx], mm2

  movq mm2, [eax]
  movq mm3, [eax+1]
  movq mm6, mm2
  pavgb mm2, mm3 ; preserved for next iteration
  lea ecx, [ecx+edx]
  pxor mm3, mm6  ; preserved for next iteration

  pand mm1, mm3
  movq mm6, mm0
  pxor mm6, mm2
  por mm1, mm6
  pavgb mm0, mm2
  pand mm1, mm7
  psubb mm0, mm1

  movq [ecx], mm0
%endmacro

align 16
interpolate8x8_halfpel_hv_xmm:
  mov eax, [esp+16] ; rounding
  mov ecx, [esp+ 4] ; Dst
  test eax, eax
  mov eax, [esp+ 8] ; Src
  mov edx, [esp+12] ; stride

  movq mm7, [mmx_one]

; loop invariants: mm2 = (i+j+1)/2 and mm3 = i^j
  movq mm2, [eax]
  movq mm3, [eax+1]
  movq mm6, mm2
  pavgb mm2, mm3
  pxor mm3, mm6  ; mm2/mm3 ready

  jnz near .rounding1

  COPY_HV_SSE_RND0
  add ecx, edx
  COPY_HV_SSE_RND0
  add ecx, edx
  COPY_HV_SSE_RND0
  add ecx, edx
  COPY_HV_SSE_RND0
  ret

.rounding1
  COPY_HV_SSE_RND1
  add ecx, edx
  COPY_HV_SSE_RND1
  add ecx, edx
  COPY_HV_SSE_RND1
  add ecx, edx
  COPY_HV_SSE_RND1
  ret