Annotation of /branches/dev-api-4/xvidcore/src/image/x86_asm/interpolate8x8_xmm.asm

Revision 434 - (view) (download)
Original Path: trunk/xvidcore/src/image/x86_asm/interpolate8x8_xmm.asm

1 :	chl	434	;/*****************************************************************************
2 :	Isibaar	262	; *
3 :	chl	434	; * XVID MPEG-4 VIDEO CODEC
4 :			; * xmm 8x8 block-based halfpel interpolation
5 :	Isibaar	262	; *
6 :	chl	434	; * Copyright(C) 2002 Michael Militzer <michael@xvid.org>
7 :			; * Copyright(C) 2002 -Skal-
8 :	Isibaar	262	; *
9 :	chl	434	; * This program is an implementation of a part of one or more MPEG-4
10 :			; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
11 :			; * to use this software module in hardware or software products are
12 :			; * advised that its use may infringe existing patents or copyrights, and
13 :			; * any such use would be at such party's own risk. The original
14 :			; * developer of this software module and his/her company, and subsequent
15 :			; * editors and their companies, will have no liability for use of this
16 :			; * software or modifications or derivatives thereof.
17 :	Isibaar	262	; *
18 :	chl	434	; * This program is free software; you can redistribute it and/or modify
19 :			; * it under the terms of the GNU General Public License as published by
20 :			; * the Free Software Foundation; either version 2 of the License, or
21 :			; * (at your option) any later version.
22 :	Isibaar	262	; *
23 :	chl	434	; * This program is distributed in the hope that it will be useful,
24 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 :			; * GNU General Public License for more details.
27 :	Isibaar	262	; *
28 :	chl	434	; * You should have received a copy of the GNU General Public License
29 :			; * along with this program; if not, write to the Free Software
30 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31 :	Isibaar	262	; *
32 :	chl	434	; ****************************************************************************/
33 :	Isibaar	262
34 :			bits 32	; 32-bit code: the functions below read their arguments from [esp+N]
35 :
36 :			%macro cglobal 1
			; cglobal <name>: export <name> as a global symbol.
			; When PREFIX is defined the symbol is exported as _<name>, and <name>
			; is %define'd to _<name> so later uses of the plain name (including
			; the label that defines it) resolve to the underscored symbol.
			; Without PREFIX the name is exported unchanged.
37 :			%ifdef PREFIX
38 :			global _%1
39 :			%define %1 _%1	; alias: plain name -> underscored name
40 :			%else
41 :			global %1
42 :			%endif
43 :			%endmacro
44 :
45 :			section .data
46 :
47 :
48 :			align 16
49 :
50 :			mmx_one:	; 8 bytes of 0x01: per-byte LSB mask, loaded into mm7 by the rounding code paths
			; The label is terminated with a colon so NASM does not treat it as
			; an "orphan label" (a lone identifier that could be a mistyped
			; mnemonic) and emit a warning. The data is only ever read by the
			; code in this file.
51 :			times 8 db 1
52 :
53 :			section .text
54 :
			; Export the three entry points defined in this file (name gets a
			; leading underscore when PREFIX is defined, see cglobal).
55 :			cglobal interpolate8x8_halfpel_h_xmm	; horizontal half-pel
56 :			cglobal interpolate8x8_halfpel_v_xmm	; vertical half-pel
57 :			cglobal interpolate8x8_halfpel_hv_xmm	; diagonal (horizontal+vertical) half-pel
58 :
59 :			;===========================================================================
60 :			;
61 :			; void interpolate8x8_halfpel_h_xmm(uint8_t * const dst,
62 :			; const uint8_t * const src,
63 :			; const uint32_t stride,
64 :			; const uint32_t rounding);
65 :			;
66 :			;===========================================================================
67 :
68 :			%macro COPY_H_SSE_RND0 0
			; Horizontal half-pel for two rows, rounding up: dst[x] = (src[x]+src[x+1]+1)/2.
			; In:       eax = src row pointer, ecx = dst row pointer, edx = stride
			; Out:      eax advanced by 2*stride; ecx is NOT advanced (caller does it)
			; Clobbers: mm0, mm1
			; Reads 9 bytes from each source row (8 bytes at offset 0 and at offset 1).
69 :			movq mm0, [eax]	; row 0, pixels x..x+7
70 :			pavgb mm0, [eax+1]	; rounded-up average with the right-hand neighbours
71 :			movq mm1, [eax+edx]	; row 1
72 :			pavgb mm1, [eax+edx+1]
73 :			lea eax,[eax+2*edx]	; src += 2*stride
74 :			movq [ecx],mm0	; store row 0
75 :			movq [ecx+edx],mm1	; store row 1
76 :			%endmacro
77 :
78 :			%macro COPY_H_SSE_RND1 0
			; Horizontal half-pel for two rows, rounding down: dst[x] = (src[x]+src[x+1])/2.
			; pavgb always rounds up, so the LSB of (a^b) is subtracted afterwards:
			;   (a+b)/2 = (a+b+1)/2 - ((a^b)&1)
			; In:       eax = src row pointer, ecx = dst row pointer, edx = stride,
			;           mm7 = 0x01 in every byte (LSB mask)
			; Out:      eax advanced by 2*stride; ecx is NOT advanced (caller does it)
			; Clobbers: mm0-mm5
			; The two rows are interleaved instruction by instruction.
79 :			movq mm0, [eax]	; row 0: a
80 :			movq mm1, [eax+edx]	; row 1: a
81 :			movq mm4, mm0	; keep a copy of row 0 a for the xor
82 :			movq mm5, mm1	; keep a copy of row 1 a for the xor
83 :			movq mm2, [eax+1]	; row 0: b (right neighbours)
84 :			movq mm3, [eax+edx+1]	; row 1: b
85 :			pavgb mm0, mm2	; row 0: (a+b+1)/2
86 :			pxor mm2, mm4	; row 0: a^b
87 :			pavgb mm1, mm3	; row 1: (a+b+1)/2
88 :			lea eax,[eax+2*edx]	; src += 2*stride
89 :			pxor mm3, mm5	; row 1: a^b
90 :			pand mm2, mm7	; row 0: (a^b)&1
91 :			pand mm3, mm7	; row 1: (a^b)&1
92 :			psubb mm0, mm2	; row 0: undo the round-up where a+b was odd
93 :			movq [ecx], mm0
94 :			psubb mm1, mm3	; row 1: undo the round-up where a+b was odd
95 :			movq [ecx+edx], mm1
96 :			%endmacro
97 :
98 :			align 16
99 :			interpolate8x8_halfpel_h_xmm:
			; Horizontal half-pel interpolation of an 8x8 block.
			; Stack args (32-bit, read relative to esp on entry):
			;   [esp+ 4] dst, [esp+ 8] src, [esp+12] stride, [esp+16] rounding
			; rounding == 0 : dst[x] = (src[x] + src[x+1] + 1) / 2
			; rounding != 0 : dst[x] = (src[x] + src[x+1]) / 2
			; Reads 9 bytes from each of 8 source rows; writes 8 bytes to each of 8
			; destination rows. The same stride is used for src and dst.
			; Clobbers: eax, ecx, edx, flags, mm0-mm1 (plus mm2-mm5, mm7 when rounding != 0).
			; No emms is executed here.
100 :
101 :			mov eax, [esp+16]; rounding
102 :			mov ecx, [esp+ 4] ; Dst
103 :			test eax,eax	; ZF = (rounding == 0); the following movs leave the flags intact
104 :			mov eax, [esp+ 8] ; Src (eax is reused once rounding has been tested)
105 :			mov edx, [esp+12] ; stride
106 :
107 :			jnz near .rounding1
108 :
			; rounding == 0: four expansions of two rows each
109 :			COPY_H_SSE_RND0
110 :			lea ecx,[ecx+2*edx]	; dst += 2*stride (the macro only advances src)
111 :			COPY_H_SSE_RND0
112 :			lea ecx,[ecx+2*edx]
113 :			COPY_H_SSE_RND0
114 :			lea ecx,[ecx+2*edx]
115 :			COPY_H_SSE_RND0
116 :			ret
117 :
118 :			.rounding1:
119 :			; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
120 :			movq mm7, [mmx_one]	; per-byte LSB mask required by COPY_H_SSE_RND1
121 :			COPY_H_SSE_RND1
122 :			lea ecx, [ecx+2*edx]	; dst += 2*stride
123 :			COPY_H_SSE_RND1
124 :			lea ecx,[ecx+2*edx]
125 :			COPY_H_SSE_RND1
126 :			lea ecx,[ecx+2*edx]
127 :			COPY_H_SSE_RND1
128 :			ret
129 :
130 :			;===========================================================================
131 :			;
132 :			; void interpolate8x8_halfpel_v_xmm(uint8_t * const dst,
133 :			; const uint8_t * const src,
134 :			; const uint32_t stride,
135 :			; const uint32_t rounding);
136 :			;
137 :			;===========================================================================
138 :
139 :			%macro COPY_V_SSE_RND0 0
			; Vertical half-pel for two output rows, rounding up:
			;   dst[y] = (src[y] + src[y+1] + 1) / 2
			; In:       eax = src row pointer, ecx = dst row pointer, edx = stride
			; Out:      eax advanced by 2*stride; ecx is NOT advanced (caller does it)
			; Clobbers: mm0, mm1
			; Reads three source rows (eax, eax+stride, eax+2*stride).
140 :			movq mm0, [eax]	; row y
141 :			movq mm1, [eax+edx]	; row y+1 (used by both averages)
142 :			pavgb mm0, mm1	; out row 0 = avg(row y, row y+1)
143 :			pavgb mm1, [eax+2*edx]	; out row 1 = avg(row y+1, row y+2)
144 :			lea eax,[eax+2*edx]	; src += 2*stride
145 :			movq [ecx],mm0
146 :			movq [ecx+edx],mm1
147 :			%endmacro
148 :
149 :			%macro COPY_V_SSE_RND1 0
			; Vertical half-pel for two output rows, rounding down:
			;   dst[y] = (src[y] + src[y+1]) / 2
			; In:       mm2 = source row y (carried in from the caller / previous expansion),
			;           eax = pointer to source row y+1, ecx = dst row pointer,
			;           edx = stride, mm7 = 0x01 in every byte (LSB mask)
			; Out:      mm2 = source row y+2 (input for the next expansion),
			;           eax advanced by 2*stride; ecx is NOT advanced (caller does it)
			; Clobbers: mm0, mm1, mm4, mm5
150 :			movq mm0, mm2	; row y (from previous expansion)
151 :			movq mm1, [eax]	; row y+1
152 :			movq mm2, [eax+edx]	; row y+2, kept for the next expansion
153 :			lea eax,[eax+2*edx]	; src += 2*stride
154 :			movq mm4, mm0	; copy of row y for the xor
155 :			movq mm5, mm1	; copy of row y+1 for the xor
156 :			pavgb mm0, mm1	; rounded-up avg(row y, row y+1)
157 :			pxor mm4, mm1	; row y ^ row y+1
158 :			pavgb mm1, mm2	; rounded-up avg(row y+1, row y+2)
159 :			pxor mm5, mm2	; row y+1 ^ row y+2
160 :			pand mm4, mm7 ; lsb's of (i^j)...
161 :			pand mm5, mm7 ; lsb's of (i^j)...
162 :			psubb mm0, mm4 ; ...are subtracted from result of pavgb
163 :			movq [ecx], mm0
164 :			psubb mm1, mm5 ; ...are subtracted from result of pavgb
165 :			movq [ecx+edx], mm1
166 :			%endmacro
167 :
168 :			align 16
169 :			interpolate8x8_halfpel_v_xmm:
			; Vertical half-pel interpolation of an 8x8 block.
			; Stack args (32-bit, read relative to esp on entry):
			;   [esp+ 4] dst, [esp+ 8] src, [esp+12] stride, [esp+16] rounding
			; rounding == 0 : dst[y] = (src[y] + src[y+1] + 1) / 2
			; rounding != 0 : dst[y] = (src[y] + src[y+1]) / 2
			; Reads 8 bytes from each of 9 source rows; writes 8 bytes to each of 8
			; destination rows. The same stride is used for src and dst.
			; Clobbers: eax, ecx, edx, flags, mm0-mm1 (plus mm2, mm4, mm5, mm7 when rounding != 0).
			; No emms is executed here.
170 :
171 :			mov eax, [esp+16]; rounding
172 :			mov ecx, [esp+ 4] ; Dst
173 :			test eax,eax	; ZF = (rounding == 0); the following movs leave the flags intact
174 :			mov eax, [esp+ 8] ; Src (eax is reused once rounding has been tested)
175 :			mov edx, [esp+12] ; stride
176 :
177 :			; we process 2 lines at a time
178 :
179 :			jnz near .rounding1
180 :
181 :			COPY_V_SSE_RND0
182 :			lea ecx, [ecx+2*edx]	; dst += 2*stride (the macro only advances src)
183 :			COPY_V_SSE_RND0
184 :			lea ecx, [ecx+2*edx]
185 :			COPY_V_SSE_RND0
186 :			lea ecx, [ecx+2*edx]
187 :			COPY_V_SSE_RND0
188 :			ret
189 :
190 :			.rounding1:
191 :			; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
192 :			movq mm7, [mmx_one]	; per-byte LSB mask required by COPY_V_SSE_RND1
193 :			movq mm2, [eax] ; loop invariant: mm2 holds the row above the one eax points at
194 :			add eax, edx	; eax -> second source row, as COPY_V_SSE_RND1 expects
195 :
196 :			COPY_V_SSE_RND1
197 :			lea ecx,[ecx+2*edx]	; dst += 2*stride
198 :			COPY_V_SSE_RND1
199 :			lea ecx,[ecx+2*edx]
200 :			COPY_V_SSE_RND1
201 :			lea ecx,[ecx+2*edx]
202 :			COPY_V_SSE_RND1
203 :			ret
204 :
205 :			;===========================================================================
206 :			;
207 :			; void interpolate8x8_halfpel_hv_xmm(uint8_t * const dst,
208 :			; const uint8_t * const src,
209 :			; const uint32_t stride,
210 :			; const uint32_t rounding);
211 :			;
212 :			;
213 :			;===========================================================================
214 :
215 :			; The trick is to correct the result of 'pavgb' with some combination of the
216 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
217 :			; The boolean relations are:
218 :			; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
219 :			; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
220 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
221 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
222 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
223 :
224 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
225 :
226 :			%macro COPY_HV_SSE_RND0 0
			; Diagonal half-pel for two output rows, computing (i+j+k+l+2)/4 per pixel as
			;   pavgb(s,t) - (((i^j)|(k^l)) & (s^t) & 1)
			; where s and t are the rounded-up horizontal averages of two adjacent rows.
			; In:       eax = pointer to the source row whose average/xor are already in
			;           mm2/mm3 (the macro steps to the next row first),
			;           mm2 = horizontal pavgb of that row, mm3 = horizontal xor of that row,
			;           ecx = dst row pointer, edx = stride, mm7 = 0x01 in every byte
			; Out:      eax advanced by 2*stride, ecx advanced by ONE stride (the caller
			;           adds the second), mm2/mm3 = average/xor of the last row loaded
			; Clobbers: mm0, mm1, mm6
227 :			lea eax,[eax+edx]	; step to next source row
228 :
229 :			movq mm0, [eax]
230 :			movq mm1, [eax+1]
231 :
232 :			movq mm6, mm0
233 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
234 :			lea eax,[eax+edx]	; step to the following source row (loaded below)
235 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
236 :
237 :			por mm3, mm1 ; ij |= jk
238 :			movq mm6, mm2
239 :			pxor mm6, mm0 ; mm6 = s^t
240 :			pand mm3, mm6 ; (ij|jk) &= st
241 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
242 :			pand mm3, mm7 ; mask lsb
243 :			psubb mm2, mm3 ; apply.
244 :
245 :			movq [ecx], mm2	; first output row
246 :
247 :			movq mm2, [eax]
248 :			movq mm3, [eax+1]
249 :			movq mm6, mm2
250 :			pavgb mm2, mm3 ; preserved for next iteration
251 :			lea ecx,[ecx+edx]	; dst += stride
252 :			pxor mm3, mm6 ; preserved for next iteration
253 :
254 :			por mm1, mm3	; combine xor of this row with xor of the previous row
255 :			movq mm6, mm0
256 :			pxor mm6, mm2	; s^t for the second output row
257 :			pand mm1, mm6
258 :			pavgb mm0, mm2	; (s+t+1)/2 for the second output row
259 :
260 :			pand mm1, mm7	; mask lsb
261 :			psubb mm0, mm1	; apply correction
262 :
263 :			movq [ecx], mm0	; second output row
264 :			%endmacro
265 :
266 :			%macro COPY_HV_SSE_RND1 0
			; Diagonal half-pel for two output rows, computing (i+j+k+l+1)/4 per pixel as
			;   pavgb(s,t) - ((((i^j)&(k^l)) | (s^t)) & 1)
			; where s and t are the rounded-up horizontal averages of two adjacent rows.
			; Same register protocol as COPY_HV_SSE_RND0; only the and/or order of the
			; correction term differs.
			; In:       eax = pointer to the source row whose average/xor are already in
			;           mm2/mm3 (the macro steps to the next row first),
			;           mm2 = horizontal pavgb of that row, mm3 = horizontal xor of that row,
			;           ecx = dst row pointer, edx = stride, mm7 = 0x01 in every byte
			; Out:      eax advanced by 2*stride, ecx advanced by ONE stride (the caller
			;           adds the second), mm2/mm3 = average/xor of the last row loaded
			; Clobbers: mm0, mm1, mm6
267 :			lea eax,[eax+edx]	; step to next source row
268 :
269 :			movq mm0, [eax]
270 :			movq mm1, [eax+1]
271 :
272 :			movq mm6, mm0
273 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
274 :			lea eax,[eax+edx]	; step to the following source row (loaded below)
275 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
276 :
277 :			pand mm3, mm1	; ij & kl
278 :			movq mm6, mm2
279 :			pxor mm6, mm0	; mm6 = s^t
280 :			por mm3, mm6	; (ij & kl) | st
281 :			pavgb mm2, mm0	; mm2 = (s+t+1)/2
282 :			pand mm3, mm7	; mask lsb
283 :			psubb mm2, mm3	; apply correction
284 :
285 :			movq [ecx], mm2	; first output row
286 :
287 :			movq mm2, [eax]
288 :			movq mm3, [eax+1]
289 :			movq mm6, mm2
290 :			pavgb mm2, mm3 ; preserved for next iteration
291 :			lea ecx,[ecx+edx]	; dst += stride
292 :			pxor mm3, mm6 ; preserved for next iteration
293 :
294 :			pand mm1, mm3	; xor of previous row & xor of this row
295 :			movq mm6, mm0
296 :			pxor mm6, mm2	; s^t for the second output row
297 :			por mm1, mm6
298 :			pavgb mm0, mm2	; (s+t+1)/2 for the second output row
299 :			pand mm1, mm7	; mask lsb
300 :			psubb mm0, mm1	; apply correction
301 :
302 :			movq [ecx], mm0	; second output row
303 :			%endmacro
304 :
305 :			align 16
306 :			interpolate8x8_halfpel_hv_xmm:
			; Diagonal (horizontal + vertical) half-pel interpolation of an 8x8 block.
			; Stack args (32-bit, read relative to esp on entry):
			;   [esp+ 4] dst, [esp+ 8] src, [esp+12] stride, [esp+16] rounding
			; rounding == 0 : dst = (i + j + k + l + 2) / 4
			; rounding != 0 : dst = (i + j + k + l + 1) / 4
			; where i,j are horizontally adjacent pixels and k,l the pair one row below.
			; Reads 9 bytes from each of 9 source rows; writes 8 bytes to each of 8
			; destination rows. The same stride is used for src and dst.
			; Clobbers: eax, ecx, edx, flags, mm0-mm3, mm6, mm7. No emms is executed here.
307 :			mov eax, [esp+16] ; rounding
308 :			mov ecx, [esp+ 4] ; Dst
309 :			test eax,eax	; ZF = (rounding == 0); consumed by the jnz further down
310 :			mov eax, [esp+ 8] ; Src (eax is reused once rounding has been tested)
311 :			mov edx, [esp+12] ; stride
312 :
313 :			movq mm7, [mmx_one]	; per-byte LSB mask, needed by both rounding variants
314 :
315 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
316 :			movq mm2, [eax]
317 :			movq mm3, [eax+1]
318 :			movq mm6, mm2
319 :			pavgb mm2, mm3
320 :			pxor mm3, mm6 ; mm2/mm3 ready
321 :
			; The branch relies on none of the mov / movq / pavgb / pxor instructions
			; since the test having modified the flags.
322 :			jnz near .rounding1
323 :
324 :			COPY_HV_SSE_RND0
325 :			add ecx, edx	; the macro advanced dst by one stride; add the second
326 :			COPY_HV_SSE_RND0
327 :			add ecx, edx
328 :			COPY_HV_SSE_RND0
329 :			add ecx, edx
330 :			COPY_HV_SSE_RND0
331 :			ret
332 :
333 :			.rounding1:
334 :			COPY_HV_SSE_RND1
335 :			add ecx, edx	; the macro advanced dst by one stride; add the second
336 :			COPY_HV_SSE_RND1
337 :			add ecx, edx
338 :			COPY_HV_SSE_RND1
339 :			add ecx, edx
340 :			COPY_HV_SSE_RND1
341 :			ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4