Annotation of /trunk/xvidcore/src/image/x86_asm/interpolate8x8_xmm.asm

Revision 1382 - (view) (download)

1 :	edgomez	1382	;/*****************************************************************************
2 :	Isibaar	262	; *
3 :	edgomez	1382	; * XVID MPEG-4 VIDEO CODEC
4 :			; * - mmx 8x8 block-based halfpel interpolation -
5 :	Isibaar	262	; *
6 :	edgomez	1382	; * Copyright(C) 2002 Michael Militzer <isibaar@xvid.org>
7 :			; * 2002 Pascal Massimino <skal@planet-d.net>
8 :	Isibaar	262	; *
9 :	edgomez	1382	; * This program is free software ; you can redistribute it and/or modify
10 :			; * it under the terms of the GNU General Public License as published by
11 :			; * the Free Software Foundation ; either version 2 of the License, or
12 :			; * (at your option) any later version.
13 :	Isibaar	262	; *
14 :	edgomez	1382	; * This program is distributed in the hope that it will be useful,
15 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
16 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 :			; * GNU General Public License for more details.
18 :	Isibaar	262	; *
19 :	edgomez	1382	; * You should have received a copy of the GNU General Public License
20 :			; * along with this program ; if not, write to the Free Software
21 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 :	Isibaar	262	; *
23 :	edgomez	1382	; ****************************************************************************/
24 :	Isibaar	262
25 :	edgomez	1382	BITS 32
26 :	edgomez	851
27 :	edgomez	1382	%macro cglobal 1 ; declare the given symbol as global (exported from this object file)
28 :	Isibaar	262	%ifdef PREFIX ; PREFIX defined: the exported name gets a leading underscore
29 :	edgomez	1382	global _%1 ; export the underscore-prefixed name; the define below aliases the plain name to it
30 :	Isibaar	262	%define %1 _%1
31 :			%else
32 :			global %1 ; PREFIX not defined: export the name unchanged
33 :			%endif
34 :			%endmacro
35 :
36 :	edgomez	1382	;=============================================================================
37 :			; Read only data
38 :			;=============================================================================
39 :	Isibaar	262
40 :	edgomez	1382	%ifdef FORMAT_COFF
41 :			SECTION .rodata data ; FORMAT_COFF build: same section, declared without the align=16 qualifier
42 :			%else
43 :			SECTION .rodata data align=16
44 :			%endif
45 :	Isibaar	262
46 :	edgomez	1382	ALIGN 16 ; place the constant below on a 16-byte boundary
47 :			mmx_one: ; loaded into mm7 by the code below and used with pand to isolate bit 0 of every byte
48 :			times 8 db 1 ; eight bytes, each 0x01
49 :	Isibaar	262
50 :	edgomez	1382	SECTION .text
51 :	Isibaar	262
52 :			cglobal interpolate8x8_halfpel_h_xmm
53 :			cglobal interpolate8x8_halfpel_v_xmm
54 :			cglobal interpolate8x8_halfpel_hv_xmm
55 :
56 :			;===========================================================================
57 :			;
58 :			; void interpolate8x8_halfpel_h_xmm(uint8_t * const dst,
59 :			; const uint8_t * const src,
60 :			; const uint32_t stride,
61 :			; const uint32_t rounding);
62 :			;
63 :			;===========================================================================
64 :
65 :			%macro COPY_H_SSE_RND0 0 ; 2 rows of horizontal half-pel, rounding up; uses eax=src, ecx=dst, edx=stride
66 :			movq mm0, [eax] ; row 0, 8 pixels starting at x
67 :			pavgb mm0, [eax+1] ; mm0 = (src[x] + src[x+1] + 1) >> 1, per byte
68 :			movq mm1, [eax+edx] ; row 1, 8 pixels starting at x
69 :			pavgb mm1, [eax+edx+1] ; same rounded average for row 1
70 :			lea eax,[eax+2*edx] ; src += 2 rows (ecx is not advanced here; the invoking code does it)
71 :			movq [ecx],mm0 ; store row 0
72 :			movq [ecx+edx],mm1 ; store row 1
73 :			%endmacro
74 :
75 :			%macro COPY_H_SSE_RND1 0 ; 2 rows of horizontal half-pel, rounding down; needs mm7 = 0x01 in every byte
76 :			movq mm0, [eax] ; row 0: i
77 :			movq mm1, [eax+edx] ; row 1: i
78 :			movq mm4, mm0 ; copy of row 0 i, consumed by the pxor below
79 :			movq mm5, mm1 ; copy of row 1 i, consumed by the pxor below
80 :	edgomez	1382	movq mm2, [eax+1] ; row 0: j (right-hand neighbours)
81 :	Isibaar	262	movq mm3, [eax+edx+1] ; row 1: j
82 :			pavgb mm0, mm2 ; mm0 = (i+j+1)>>1
83 :			pxor mm2, mm4 ; mm2 = i^j
84 :			pavgb mm1, mm3 ; mm1 = (i+j+1)>>1 for row 1
85 :	edgomez	1382	lea eax, [eax+2*edx] ; src += 2 rows
86 :	Isibaar	262	pxor mm3, mm5 ; mm3 = i^j for row 1
87 :			pand mm2, mm7 ; (i^j)&1 is 1 exactly where i+j is odd, i.e. where pavgb rounded up
88 :			pand mm3, mm7 ; same mask for row 1
89 :			psubb mm0, mm2 ; remove the round-up: mm0 = (i+j)>>1
90 :			movq [ecx], mm0 ; store row 0
91 :			psubb mm1, mm3 ; mm1 = (i+j)>>1 for row 1
92 :	edgomez	1382	movq [ecx+edx], mm1 ; store row 1
93 :	Isibaar	262	%endmacro
94 :
95 :	edgomez	1382	ALIGN 16
96 :	Isibaar	262	interpolate8x8_halfpel_h_xmm: ; args are read from the stack: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding
97 :
98 :	edgomez	1382	mov eax, [esp+16] ; rounding
99 :			mov ecx, [esp+ 4] ; Dst
100 :	Isibaar	262	test eax,eax ; ZF = (rounding == 0); the two mov's below do not alter the flags
101 :	edgomez	1382	mov eax, [esp+ 8] ; Src
102 :			mov edx, [esp+12] ; stride
103 :	Isibaar	262
104 :			jnz near .rounding1 ; rounding != 0: take the round-down path
105 :
106 :			COPY_H_SSE_RND0 ; rows 0-1, dst = (i+j+1)>>1
107 :			lea ecx,[ecx+2*edx] ; dst += 2 rows (the macro only advances eax)
108 :			COPY_H_SSE_RND0 ; rows 2-3
109 :			lea ecx,[ecx+2*edx]
110 :			COPY_H_SSE_RND0 ; rows 4-5
111 :			lea ecx,[ecx+2*edx]
112 :			COPY_H_SSE_RND0 ; rows 6-7
113 :			ret ; returns with eax/ecx/edx and mm0/mm1 modified; no emms is executed here
114 :
115 :			.rounding1: ; explicit colon: a bare label on its own line triggers NASM's orphan-label warning
116 :	edgomez	1382	; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
117 :	Isibaar	262	movq mm7, [mmx_one] ; mm7 = 0x01 in every byte, the LSB mask the RND1 macro expects
118 :			COPY_H_SSE_RND1 ; rows 0-1, dst = (i+j)>>1
119 :			lea ecx, [ecx+2*edx] ; dst += 2 rows
120 :			COPY_H_SSE_RND1 ; rows 2-3
121 :			lea ecx,[ecx+2*edx]
122 :			COPY_H_SSE_RND1 ; rows 4-5
123 :			lea ecx,[ecx+2*edx]
124 :			COPY_H_SSE_RND1 ; rows 6-7
125 :			ret ; this path additionally modifies mm2-mm5 and mm7; no emms is executed here
126 :
127 :			;===========================================================================
128 :			;
129 :			; void interpolate8x8_halfpel_v_xmm(uint8_t * const dst,
130 :	edgomez	1382	; const uint8_t * const src,
131 :			; const uint32_t stride,
132 :			; const uint32_t rounding);
133 :	Isibaar	262	;
134 :			;===========================================================================
135 :
136 :			%macro COPY_V_SSE_RND0 0 ; 2 rows of vertical half-pel, rounding up; uses eax=src, ecx=dst, edx=stride
137 :	edgomez	1382	movq mm0, [eax] ; source row y
138 :			movq mm1, [eax+edx] ; source row y+1
139 :	Isibaar	262	pavgb mm0, mm1 ; output row y = (row y + row y+1 + 1)>>1
140 :			pavgb mm1, [eax+2*edx] ; output row y+1 = (row y+1 + row y+2 + 1)>>1
141 :	edgomez	1382	lea eax, [eax+2*edx] ; src += 2 rows; row y+2 is re-read from memory by the next invocation
142 :			movq [ecx], mm0 ; store output row y
143 :	Isibaar	262	movq [ecx+edx],mm1 ; store output row y+1 (ecx itself is advanced by the invoking code)
144 :			%endmacro
145 :
146 :			%macro COPY_V_SSE_RND1 0 ; 2 rows of vertical half-pel, rounding down; in: mm2 = source row y, eax -> row y+1, mm7 = 0x01 bytes
147 :			movq mm0, mm2 ; row y, carried in mm2 from the previous step
148 :			movq mm1, [eax] ; row y+1
149 :			movq mm2, [eax+edx] ; row y+2, left in mm2 for the next invocation
150 :			lea eax,[eax+2*edx] ; src += 2 rows
151 :			movq mm4, mm0 ; copy of row y for the xor
152 :			movq mm5, mm1 ; copy of row y+1 for the xor
153 :			pavgb mm0, mm1 ; (row y + row y+1 + 1)>>1
154 :	edgomez	1382	pxor mm4, mm1 ; row y ^ row y+1
155 :	Isibaar	262	pavgb mm1, mm2 ; (row y+1 + row y+2 + 1)>>1
156 :			pxor mm5, mm2 ; row y+1 ^ row y+2
157 :	edgomez	1382	pand mm4, mm7 ; lsb's of (i^j)...
158 :			pand mm5, mm7 ; lsb's of (i^j)...
159 :			psubb mm0, mm4 ; ...are subtracted from result of pavgb
160 :	Isibaar	262	movq [ecx], mm0 ; store output row y
161 :	edgomez	1382	psubb mm1, mm5 ; ...are subtracted from result of pavgb
162 :	Isibaar	262	movq [ecx+edx], mm1 ; store output row y+1
163 :			%endmacro
164 :
165 :	edgomez	1382	ALIGN 16
166 :	Isibaar	262	interpolate8x8_halfpel_v_xmm: ; args are read from the stack: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding
167 :
168 :			mov eax, [esp+16] ; rounding
169 :	edgomez	1382	mov ecx, [esp+ 4] ; Dst
170 :	Isibaar	262	test eax,eax ; ZF = (rounding == 0); the two mov's below do not alter the flags
171 :	edgomez	1382	mov eax, [esp+ 8] ; Src
172 :			mov edx, [esp+12] ; stride
173 :	Isibaar	262
174 :	edgomez	1382	; we process 2 lines at a time
175 :	Isibaar	262	jnz near .rounding1 ; rounding != 0: take the round-down path
176 :
177 :			COPY_V_SSE_RND0 ; output rows 0-1, dst = (a+b+1)>>1 of vertically adjacent pixels
178 :			lea ecx, [ecx+2*edx] ; dst += 2 rows (the macro only advances eax)
179 :			COPY_V_SSE_RND0 ; output rows 2-3
180 :			lea ecx, [ecx+2*edx]
181 :			COPY_V_SSE_RND0 ; output rows 4-5
182 :			lea ecx, [ecx+2*edx]
183 :			COPY_V_SSE_RND0 ; output rows 6-7 (reads source rows 6, 7 and 8)
184 :			ret ; returns with eax/ecx/edx and mm0/mm1 modified; no emms is executed here
185 :
186 :			.rounding1: ; explicit colon: a bare label on its own line triggers NASM's orphan-label warning
187 :	edgomez	1382	; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
188 :	Isibaar	262	movq mm7, [mmx_one] ; mm7 = 0x01 in every byte, the LSB mask the RND1 macro expects
189 :	edgomez	1382	movq mm2, [eax] ; loop invariant: mm2 holds the source row preceding the one eax points to
190 :	Isibaar	262	add eax, edx ; eax -> source row 1, as the RND1 macro expects
191 :
192 :			COPY_V_SSE_RND1 ; output rows 0-1, dst = (a+b)>>1
193 :			lea ecx,[ecx+2*edx] ; dst += 2 rows
194 :			COPY_V_SSE_RND1 ; output rows 2-3
195 :			lea ecx,[ecx+2*edx]
196 :			COPY_V_SSE_RND1 ; output rows 4-5
197 :			lea ecx,[ecx+2*edx]
198 :			COPY_V_SSE_RND1 ; output rows 6-7
199 :			ret ; this path additionally modifies mm2, mm4, mm5 and mm7; no emms is executed here
200 :
201 :			;===========================================================================
202 :			;
203 :			; void interpolate8x8_halfpel_hv_xmm(uint8_t * const dst,
204 :	edgomez	1382	; const uint8_t * const src,
205 :			; const uint32_t stride,
206 :			; const uint32_t rounding);
207 :	Isibaar	262	;
208 :			;
209 :			;===========================================================================
210 :
211 :			; The trick is to correct the result of 'pavgb' with some combination of the
212 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
213 :			; The boolean relations are:
214 :	edgomez	1382	; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
215 :	Isibaar	262	; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
216 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
217 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
218 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
219 :
220 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
221 :
222 :			%macro COPY_HV_SSE_RND0 0 ; 2 output rows using the (ij|kl)&st correction; in: mm2/mm3 = pavgb/xor of previous source row, mm7 = 0x01 bytes
223 :	edgomez	1382	lea eax, [eax+edx] ; step to the next source row
224 :	Isibaar	262
225 :	edgomez	1382	movq mm0, [eax] ; next row, pixels at x
226 :			movq mm1, [eax+1] ; next row, pixels at x+1
227 :	Isibaar	262
228 :	edgomez	1382	movq mm6, mm0 ; copy for the xor below
229 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
230 :			lea eax, [eax+edx] ; step to the source row used by the second half
231 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
232 :	Isibaar	262
233 :	edgomez	1382	por mm3, mm1 ; ij |= jk
234 :			movq mm6, mm2 ; mm6 = s (horizontal average of the previous row)
235 :			pxor mm6, mm0 ; mm6 = s^t
236 :			pand mm3, mm6 ; (ij|jk) &= st
237 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
238 :			pand mm3, mm7 ; mask lsb
239 :			psubb mm2, mm3 ; apply.
240 :	Isibaar	262
241 :	edgomez	1382	movq [ecx], mm2 ; store first output row
242 :	Isibaar	262
243 :	edgomez	1382	movq mm2, [eax] ; following source row, pixels at x
244 :			movq mm3, [eax+1] ; following source row, pixels at x+1
245 :			movq mm6, mm2 ; copy for the xor below
246 :			pavgb mm2, mm3 ; preserved for next iteration
247 :			lea ecx,[ecx+edx] ; dst += 1 row; the invoking code adds the second row step after the macro
248 :			pxor mm3, mm6 ; preserved for next iteration
249 :	Isibaar	262
250 :	edgomez	1382	por mm1, mm3 ; OR of the two rows' xor terms
251 :			movq mm6, mm0
252 :			pxor mm6, mm2 ; xor of the two rows' horizontal averages (st)
253 :			pand mm1, mm6 ; (ij|kl) & st
254 :			pavgb mm0, mm2 ; (s+t+1)/2
255 :	Isibaar	262
256 :	edgomez	1382	pand mm1, mm7 ; mask lsb
257 :			psubb mm0, mm1 ; apply the correction
258 :	Isibaar	262
259 :	edgomez	1382	movq [ecx], mm0 ; store second output row
260 :	Isibaar	262	%endmacro
261 :
262 :			%macro COPY_HV_SSE_RND1 0 ; 2 output rows using the (ij&kl)|st correction; in: mm2/mm3 = pavgb/xor of previous source row, mm7 = 0x01 bytes
263 :	edgomez	1382	lea eax, [eax+edx] ; step to the next source row
264 :	Isibaar	262
265 :	edgomez	1382	movq mm0, [eax] ; next row, pixels at x
266 :			movq mm1, [eax+1] ; next row, pixels at x+1
267 :	Isibaar	262
268 :	edgomez	1382	movq mm6, mm0 ; copy for the xor below
269 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
270 :			lea eax, [eax+edx] ; step to the source row used by the second half
271 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
272 :	Isibaar	262
273 :	edgomez	1382	pand mm3, mm1 ; ij & kl
274 :			movq mm6, mm2 ; mm6 = s (horizontal average of the previous row)
275 :			pxor mm6, mm0 ; mm6 = s^t
276 :			por mm3, mm6 ; (ij&kl) | st
277 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
278 :			pand mm3, mm7 ; mask lsb
279 :			psubb mm2, mm3 ; apply the correction
280 :	Isibaar	262
281 :	edgomez	1382	movq [ecx], mm2 ; store first output row
282 :	Isibaar	262
283 :	edgomez	1382	movq mm2, [eax] ; following source row, pixels at x
284 :			movq mm3, [eax+1] ; following source row, pixels at x+1
285 :			movq mm6, mm2 ; copy for the xor below
286 :			pavgb mm2, mm3 ; preserved for next iteration
287 :			lea ecx,[ecx+edx] ; dst += 1 row; the invoking code adds the second row step after the macro
288 :			pxor mm3, mm6 ; preserved for next iteration
289 :	Isibaar	262
290 :	edgomez	1382	pand mm1, mm3 ; AND of the two rows' xor terms
291 :			movq mm6, mm0
292 :			pxor mm6, mm2 ; xor of the two rows' horizontal averages (st)
293 :			por mm1, mm6 ; (ij&kl) | st
294 :			pavgb mm0, mm2 ; (s+t+1)/2
295 :			pand mm1, mm7 ; mask lsb
296 :			psubb mm0, mm1 ; apply the correction
297 :	Isibaar	262
298 :	edgomez	1382	movq [ecx], mm0 ; store second output row
299 :	Isibaar	262	%endmacro
300 :
301 :	edgomez	1382	ALIGN 16
302 :	Isibaar	262	interpolate8x8_halfpel_hv_xmm: ; args are read from the stack: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding
303 :	edgomez	1382	mov eax, [esp+16] ; rounding
304 :			mov ecx, [esp+ 4] ; Dst
305 :			test eax, eax ; ZF = (rounding == 0); consumed by the jnz below (mov and MMX ops leave EFLAGS alone)
306 :			mov eax, [esp+ 8] ; Src
307 :			mov edx, [esp+12] ; stride
308 :	Isibaar	262
309 :			movq mm7, [mmx_one] ; mm7 = 0x01 in every byte, the LSB mask both macros expect
310 :
311 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
312 :			movq mm2, [eax] ; source row 0, pixels at x
313 :			movq mm3, [eax+1] ; source row 0, pixels at x+1
314 :			movq mm6, mm2 ; copy for the xor below
315 :			pavgb mm2, mm3
316 :	edgomez	1382	pxor mm3, mm6 ; mm2/mm3 ready
317 :	Isibaar	262
318 :			jnz near .rounding1 ; rounding != 0: use the (ij&kl)|st correction
319 :
320 :			COPY_HV_SSE_RND0 ; output rows 0-1
321 :			add ecx, edx ; the macro advanced dst by one row; add the second
322 :			COPY_HV_SSE_RND0 ; output rows 2-3
323 :			add ecx, edx
324 :			COPY_HV_SSE_RND0 ; output rows 4-5
325 :			add ecx, edx
326 :			COPY_HV_SSE_RND0 ; output rows 6-7
327 :			ret ; returns with eax/ecx/edx, mm0-mm3, mm6 and mm7 modified; no emms is executed here
328 :
329 :			.rounding1: ; explicit colon: a bare label on its own line triggers NASM's orphan-label warning
330 :			COPY_HV_SSE_RND1 ; output rows 0-1
331 :			add ecx, edx ; the macro advanced dst by one row; add the second
332 :			COPY_HV_SSE_RND1 ; output rows 2-3
333 :			add ecx, edx
334 :			COPY_HV_SSE_RND1 ; output rows 4-5
335 :			add ecx, edx
336 :			COPY_HV_SSE_RND1 ; output rows 6-7
337 :	edgomez	1382	ret ; returns with eax/ecx/edx, mm0-mm3, mm6 and mm7 modified; no emms is executed here

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4