Annotation of /branches/dev-api-4/xvidcore/src/image/x86_asm/interpolate8x8_xmm.asm

Revision 1192 - (view) (download)

1 :	edgomez	1192	;/*****************************************************************************
2 :	Isibaar	262	; *
3 :	edgomez	1192	; * XVID MPEG-4 VIDEO CODEC
4 :			; * - mmx 8x8 block-based halfpel interpolation -
5 :	Isibaar	262	; *
6 :	edgomez	1192	; * Copyright(C) 2002 Michael Militzer <isibaar@xvid.org>
7 :			; * 2002 Pascal Massimino <skal@planet-d.net>
8 :	Isibaar	262	; *
9 :	edgomez	1192	; * This program is free software ; you can redistribute it and/or modify
10 :			; * it under the terms of the GNU General Public License as published by
11 :			; * the Free Software Foundation ; either version 2 of the License, or
12 :			; * (at your option) any later version.
13 :	Isibaar	262	; *
14 :	edgomez	1192	; * This program is distributed in the hope that it will be useful,
15 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
16 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 :			; * GNU General Public License for more details.
18 :	Isibaar	262	; *
19 :	edgomez	1192	; * You should have received a copy of the GNU General Public License
20 :			; * along with this program ; if not, write to the Free Software
21 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 :	Isibaar	262	; *
23 :	edgomez	1192	; ****************************************************************************/
24 :	Isibaar	262
25 :	edgomez	1192	BITS 32
26 :	edgomez	851
27 :	edgomez	1192	%macro cglobal 1
			; cglobal <name>: declare <name> as a global (exported) symbol.
			; When PREFIX is defined the exported symbol is _<name>, and <name> is
			; %define'd to _<name> so that later uses of the bare name in this file
			; (including the label definition) resolve to the underscored symbol.
28 :	Isibaar	262	%ifdef PREFIX
29 :	edgomez	1192	global _%1
30 :	Isibaar	262	%define %1 _%1
31 :			%else
32 :			global %1
33 :			%endif
34 :			%endmacro
35 :
36 :	edgomez	1192	;=============================================================================
37 :			; Read only data
38 :			;=============================================================================
39 :	Isibaar	262
40 :	edgomez	1192	SECTION .rodata
41 :	Isibaar	262
42 :	edgomez	1192	ALIGN 16
			; mmx_one: eight bytes, each 0x01. The routines below load it into mm7 and
			; use it with pand to keep only bit 0 of every byte lane.
43 :			mmx_one:
44 :			times 8 db 1
45 :	Isibaar	262
46 :	edgomez	1192	SECTION .text
47 :	Isibaar	262
			; Export the three entry points defined in this file (through the cglobal
			; macro, so an underscore is prepended when PREFIX is defined).
48 :			cglobal interpolate8x8_halfpel_h_xmm
49 :			cglobal interpolate8x8_halfpel_v_xmm
50 :			cglobal interpolate8x8_halfpel_hv_xmm
51 :
52 :			;===========================================================================
53 :			;
54 :			; void interpolate8x8_halfpel_h_xmm(uint8_t * const dst,
55 :			; const uint8_t * const src,
56 :			; const uint32_t stride,
57 :			; const uint32_t rounding);
58 :			;
59 :			;===========================================================================
60 :
61 :			%macro COPY_H_SSE_RND0 0
			; Horizontal half-pel, rounding-up average, two rows per expansion.
			; In:  eax = source row, ecx = destination row, edx = stride.
			; Out: eax advanced by 2*stride. ecx is NOT advanced here; the caller
			;      steps it between expansions.
			; Clobbers: mm0, mm1.
62 :			movq mm0, [eax] ; row 0: bytes x .. x+7
63 :			pavgb mm0, [eax+1] ; per byte: (p[x] + p[x+1] + 1) >> 1
64 :			movq mm1, [eax+edx] ; row 1: bytes x .. x+7
65 :			pavgb mm1, [eax+edx+1] ; same average for row 1
66 :			lea eax,[eax+2*edx] ; source += 2 rows
67 :			movq [ecx],mm0 ; store row 0
68 :			movq [ecx+edx],mm1 ; store row 1
69 :			%endmacro
70 :
71 :			%macro COPY_H_SSE_RND1 0
			; Horizontal half-pel, truncating average, two rows per expansion.
			; pavgb always rounds up, so the lsb of (a^b) is subtracted afterwards:
			;   (a+b)>>1 = ((a+b+1)>>1) - ((a^b)&1)
			; In:  eax = source row, ecx = destination row, edx = stride,
			;      mm7 = 0x01 in every byte.
			; Out: eax advanced by 2*stride. ecx is NOT advanced here.
			; Clobbers: mm0-mm5.
72 :			movq mm0, [eax] ; row 0: a = p[x]
73 :			movq mm1, [eax+edx] ; row 1: a = p[x]
74 :			movq mm4, mm0 ; keep a copy of row 0 'a' for the xor
75 :			movq mm5, mm1 ; keep a copy of row 1 'a' for the xor
76 :	edgomez	1192	movq mm2, [eax+1] ; row 0: b = p[x+1]
77 :	Isibaar	262	movq mm3, [eax+edx+1] ; row 1: b = p[x+1]
78 :			pavgb mm0, mm2 ; row 0: (a+b+1)>>1
79 :			pxor mm2, mm4 ; row 0: a^b
80 :			pavgb mm1, mm3 ; row 1: (a+b+1)>>1
81 :	edgomez	1192	lea eax, [eax+2*edx] ; source += 2 rows
82 :	Isibaar	262	pxor mm3, mm5 ; row 1: a^b
83 :			pand mm2, mm7 ; row 0: (a^b)&1
84 :			pand mm3, mm7 ; row 1: (a^b)&1
85 :			psubb mm0, mm2 ; row 0: undo the round-up where a+b is odd
86 :			movq [ecx], mm0 ; store row 0
87 :			psubb mm1, mm3 ; row 1: undo the round-up where a+b is odd
88 :	edgomez	1192	movq [ecx+edx], mm1 ; store row 1
89 :	Isibaar	262	%endmacro
90 :
91 :	edgomez	1192	ALIGN 16
			; Horizontal half-pel interpolation of an 8x8 block.
			; Arguments are read from the stack (32-bit code):
			;   [esp+ 4] dst, [esp+ 8] src, [esp+12] stride, [esp+16] rounding
			; For each of 8 rows, writes 8 bytes to dst:
			;   rounding == 0 : (src[x] + src[x+1] + 1) >> 1
			;   rounding != 0 : (src[x] + src[x+1]) >> 1
			; Reads 9 bytes per source row. dst and src share the same stride.
			; Uses eax, ecx, edx, flags and mm0-mm5, mm7. No emms is executed here,
			; so the MMX state is still live on return.
92 :	Isibaar	262	interpolate8x8_halfpel_h_xmm:
93 :
94 :	edgomez	1192	mov eax, [esp+16] ; rounding
95 :			mov ecx, [esp+ 4] ; Dst
96 :	Isibaar	262	test eax,eax ; ZF = (rounding == 0); eax is reused for Src right after
97 :	edgomez	1192	mov eax, [esp+ 8] ; Src
98 :			mov edx, [esp+12] ; stride
99 :	Isibaar	262
100 :			jnz near .rounding1 ; flags still come from the test: mov does not modify them
101 :
			; rounding == 0: 4 expansions x 2 rows; the macro advances eax only,
			; so ecx is stepped by two rows between expansions.
102 :			COPY_H_SSE_RND0
103 :			lea ecx,[ecx+2*edx]
104 :			COPY_H_SSE_RND0
105 :			lea ecx,[ecx+2*edx]
106 :			COPY_H_SSE_RND0
107 :			lea ecx,[ecx+2*edx]
108 :			COPY_H_SSE_RND0
109 :			ret
110 :
111 :			.rounding1:
112 :	edgomez	1192	; we use: (i+j)/2 = ( i+j+1 )/2 - ((i^j)&1)
113 :	Isibaar	262	movq mm7, [mmx_one] ; mm7 = 0x01 in every byte: lsb mask for the correction
114 :			COPY_H_SSE_RND1
115 :			lea ecx, [ecx+2*edx]
116 :			COPY_H_SSE_RND1
117 :			lea ecx,[ecx+2*edx]
118 :			COPY_H_SSE_RND1
119 :			lea ecx,[ecx+2*edx]
120 :			COPY_H_SSE_RND1
121 :			ret
122 :
123 :			;===========================================================================
124 :			;
125 :			; void interpolate8x8_halfpel_v_xmm(uint8_t * const dst,
126 :	edgomez	1192	; const uint8_t * const src,
127 :			; const uint32_t stride,
128 :			; const uint32_t rounding);
129 :	Isibaar	262	;
130 :			;===========================================================================
131 :
132 :			%macro COPY_V_SSE_RND0 0
			; Vertical half-pel, rounding-up average, two output rows per expansion.
			; In:  eax = source row, ecx = destination row, edx = stride.
			; Out: eax advanced by 2*stride. ecx is NOT advanced here.
			; Reads three source rows (eax, eax+stride, eax+2*stride); the third row
			; is read again as the first row of the next expansion.
			; Clobbers: mm0, mm1.
133 :	edgomez	1192	movq mm0, [eax] ; row n
134 :			movq mm1, [eax+edx] ; row n+1
135 :	Isibaar	262	pavgb mm0, mm1 ; (row n + row n+1 + 1) >> 1
136 :			pavgb mm1, [eax+2*edx] ; (row n+1 + row n+2 + 1) >> 1
137 :	edgomez	1192	lea eax, [eax+2*edx] ; source += 2 rows
138 :			movq [ecx], mm0 ; store output row n
139 :	Isibaar	262	movq [ecx+edx],mm1 ; store output row n+1
140 :			%endmacro
141 :
142 :			%macro COPY_V_SSE_RND1 0
			; Vertical half-pel, truncating average, two output rows per expansion.
			; In:  mm2 = source row n (loaded by the caller or left by the previous
			;      expansion), eax = source row n+1, ecx = destination row,
			;      edx = stride, mm7 = 0x01 in every byte.
			; Out: eax advanced by 2*stride, mm2 = source row n+2 (carried over to
			;      the next expansion). ecx is NOT advanced here.
			; Clobbers: mm0, mm1, mm4, mm5.
143 :			movq mm0, mm2 ; row n, carried over in mm2
144 :			movq mm1, [eax] ; row n+1
145 :			movq mm2, [eax+edx] ; row n+2, kept for the next expansion
146 :			lea eax,[eax+2*edx] ; source += 2 rows
147 :			movq mm4, mm0 ; copy of row n for the xor
148 :			movq mm5, mm1 ; copy of row n+1 for the xor
149 :			pavgb mm0, mm1 ; (row n + row n+1 + 1) >> 1
150 :	edgomez	1192	pxor mm4, mm1 ; row n ^ row n+1
151 :	Isibaar	262	pavgb mm1, mm2 ; (row n+1 + row n+2 + 1) >> 1
152 :			pxor mm5, mm2 ; row n+1 ^ row n+2
153 :	edgomez	1192	pand mm4, mm7 ; lsb's of (i^j)...
154 :			pand mm5, mm7 ; lsb's of (i^j)...
155 :			psubb mm0, mm4 ; ...are subtracted from result of pavgb
156 :	Isibaar	262	movq [ecx], mm0 ; store output row n
157 :	edgomez	1192	psubb mm1, mm5 ; ...are subtracted from result of pavgb
158 :	Isibaar	262	movq [ecx+edx], mm1 ; store output row n+1
159 :			%endmacro
160 :
161 :	edgomez	1192	ALIGN 16
			; Vertical half-pel interpolation of an 8x8 block.
			; Arguments are read from the stack (32-bit code):
			;   [esp+ 4] dst, [esp+ 8] src, [esp+12] stride, [esp+16] rounding
			; For each of 8 rows, writes 8 bytes to dst:
			;   rounding == 0 : (src[x] + src[x+stride] + 1) >> 1
			;   rounding != 0 : (src[x] + src[x+stride]) >> 1
			; Reads 9 source rows of 8 bytes. dst and src share the same stride.
			; Uses eax, ecx, edx, flags and mm0-mm2, mm4, mm5, mm7. No emms is
			; executed here, so the MMX state is still live on return.
162 :	Isibaar	262	interpolate8x8_halfpel_v_xmm:
163 :
164 :			mov eax, [esp+16]; rounding
165 :	edgomez	1192	mov ecx, [esp+ 4] ; Dst
166 :	Isibaar	262	test eax,eax ; ZF = (rounding == 0); eax is reused for Src right after
167 :	edgomez	1192	mov eax, [esp+ 8] ; Src
168 :			mov edx, [esp+12] ; stride
169 :	Isibaar	262
170 :	edgomez	1192	; we process 2 lines at a time
171 :	Isibaar	262	jnz near .rounding1 ; flags still come from the test: mov does not modify them
172 :
			; rounding == 0: the macro advances eax only, so ecx is stepped by two
			; rows between expansions.
173 :			COPY_V_SSE_RND0
174 :			lea ecx, [ecx+2*edx]
175 :			COPY_V_SSE_RND0
176 :			lea ecx, [ecx+2*edx]
177 :			COPY_V_SSE_RND0
178 :			lea ecx, [ecx+2*edx]
179 :			COPY_V_SSE_RND0
180 :			ret
181 :
182 :			.rounding1:
183 :	edgomez	1192	; we use: (i+j)/2 = ( i+j+1 )/2 - ((i^j)&1)
184 :	Isibaar	262	movq mm7, [mmx_one] ; mm7 = 0x01 in every byte: lsb mask for the correction
185 :	edgomez	1192	movq mm2, [eax] ; preload row 0; mm2 carries the last row read between expansions
186 :	Isibaar	262	add eax, edx ; eax -> row 1, as COPY_V_SSE_RND1 expects
187 :
188 :			COPY_V_SSE_RND1
189 :			lea ecx,[ecx+2*edx]
190 :			COPY_V_SSE_RND1
191 :			lea ecx,[ecx+2*edx]
192 :			COPY_V_SSE_RND1
193 :			lea ecx,[ecx+2*edx]
194 :			COPY_V_SSE_RND1
195 :			ret
196 :
197 :			;===========================================================================
198 :			;
199 :			; void interpolate8x8_halfpel_hv_xmm(uint8_t * const dst,
200 :	edgomez	1192	; const uint8_t * const src,
201 :			; const uint32_t stride,
202 :			; const uint32_t rounding);
203 :	Isibaar	262	;
204 :			;
205 :			;===========================================================================
206 :
207 :			; The trick is to correct the result of 'pavgb' with some combination of the
208 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
209 :			; The boolean relations are:
210 :	edgomez	1192	; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
211 :	Isibaar	262	; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
212 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
213 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
214 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t; only the lsb of the boolean term is subtracted.
215 :
216 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
217 :
218 :			%macro COPY_HV_SSE_RND0 0
			; Diagonal (h+v) half-pel, (i+j+k+l+2)/4 variant, two output rows per expansion.
			; In:  eax = previous source row, ecx = destination row, edx = stride,
			;      mm2 = pavgb(prev[x], prev[x+1]), mm3 = prev[x] ^ prev[x+1],
			;      mm7 = 0x01 in every byte.
			; Out: eax advanced by 2*stride, ecx advanced by ONE stride (the caller
			;      adds the second one), mm2/mm3 refreshed for the last row read.
			; Clobbers: mm0, mm1, mm6.
219 :	edgomez	1192	lea eax, [eax+edx] ; step to the next source row
220 :	Isibaar	262
221 :	edgomez	1192	movq mm0, [eax]
222 :			movq mm1, [eax+1]
223 :	Isibaar	262
224 :	edgomez	1192	movq mm6, mm0
225 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
226 :			lea eax, [eax+edx] ; step to the following source row (read in the second half)
227 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
228 :	Isibaar	262
229 :	edgomez	1192	por mm3, mm1 ; ij |= jk
230 :			movq mm6, mm2
231 :			pxor mm6, mm0 ; mm6 = s^t
232 :			pand mm3, mm6 ; (ij|jk) &= st
233 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
234 :			pand mm3, mm7 ; mask lsb
235 :			psubb mm2, mm3 ; apply.
236 :	Isibaar	262
237 :	edgomez	1192	movq [ecx], mm2 ; store first output row
238 :	Isibaar	262
			; second output row: same computation with the register roles swapped
			; (mm0/mm1 now hold the previous row, mm2/mm3 receive the new one)
239 :	edgomez	1192	movq mm2, [eax]
240 :			movq mm3, [eax+1]
241 :			movq mm6, mm2
242 :			pavgb mm2, mm3 ; preserved for next iteration
243 :			lea ecx,[ecx+edx] ; destination += 1 row
244 :			pxor mm3, mm6 ; preserved for next iteration
245 :	Isibaar	262
246 :	edgomez	1192	por mm1, mm3 ; OR of the two rows' horizontal xors
247 :			movq mm6, mm0
248 :			pxor mm6, mm2 ; mm6 = s^t
249 :			pand mm1, mm6 ; (xor0|xor1) & st
250 :			pavgb mm0, mm2 ; mm0 = (s+t+1)/2
251 :	Isibaar	262
252 :	edgomez	1192	pand mm1, mm7 ; mask lsb
253 :			psubb mm0, mm1 ; apply.
254 :	Isibaar	262
255 :	edgomez	1192	movq [ecx], mm0 ; store second output row
256 :	Isibaar	262	%endmacro
257 :
258 :			%macro COPY_HV_SSE_RND1 0
			; Diagonal (h+v) half-pel, (i+j+k+l+1)/4 variant, two output rows per expansion.
			; Same structure as COPY_HV_SSE_RND0, but the correction term is
			; ((ij & kl) | st) & 1 instead of ((ij | kl) & st) & 1.
			; In:  eax = previous source row, ecx = destination row, edx = stride,
			;      mm2 = pavgb(prev[x], prev[x+1]), mm3 = prev[x] ^ prev[x+1],
			;      mm7 = 0x01 in every byte.
			; Out: eax advanced by 2*stride, ecx advanced by ONE stride (the caller
			;      adds the second one), mm2/mm3 refreshed for the last row read.
			; Clobbers: mm0, mm1, mm6.
259 :	edgomez	1192	lea eax, [eax+edx] ; step to the next source row
260 :	Isibaar	262
261 :	edgomez	1192	movq mm0, [eax]
262 :			movq mm1, [eax+1]
263 :	Isibaar	262
264 :	edgomez	1192	movq mm6, mm0
265 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
266 :			lea eax, [eax+edx] ; step to the following source row (read in the second half)
267 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
268 :	Isibaar	262
269 :	edgomez	1192	pand mm3, mm1 ; AND of the two rows' horizontal xors
270 :			movq mm6, mm2
271 :			pxor mm6, mm0 ; mm6 = s^t
272 :			por mm3, mm6 ; (xor0&xor1) | st
273 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
274 :			pand mm3, mm7 ; mask lsb
275 :			psubb mm2, mm3 ; apply.
276 :	Isibaar	262
277 :	edgomez	1192	movq [ecx], mm2 ; store first output row
278 :	Isibaar	262
			; second output row: same computation with the register roles swapped
			; (mm0/mm1 now hold the previous row, mm2/mm3 receive the new one)
279 :	edgomez	1192	movq mm2, [eax]
280 :			movq mm3, [eax+1]
281 :			movq mm6, mm2
282 :			pavgb mm2, mm3 ; preserved for next iteration
283 :			lea ecx,[ecx+edx] ; destination += 1 row
284 :			pxor mm3, mm6 ; preserved for next iteration
285 :	Isibaar	262
286 :	edgomez	1192	pand mm1, mm3 ; AND of the two rows' horizontal xors
287 :			movq mm6, mm0
288 :			pxor mm6, mm2 ; mm6 = s^t
289 :			por mm1, mm6 ; (xor0&xor1) | st
290 :			pavgb mm0, mm2 ; mm0 = (s+t+1)/2
291 :			pand mm1, mm7 ; mask lsb
292 :			psubb mm0, mm1 ; apply.
293 :	Isibaar	262
294 :	edgomez	1192	movq [ecx], mm0 ; store second output row
295 :	Isibaar	262	%endmacro
296 :
297 :	edgomez	1192	ALIGN 16
			; Diagonal (horizontal + vertical) half-pel interpolation of an 8x8 block.
			; Arguments are read from the stack (32-bit code):
			;   [esp+ 4] dst, [esp+ 8] src, [esp+12] stride, [esp+16] rounding
			; For each of 8 rows, writes 8 bytes to dst, each derived from the four
			; neighbours i=src[x], j=src[x+1], k=src[x+stride], l=src[x+stride+1]:
			;   rounding == 0 : (i+j+k+l+2)/4
			;   rounding != 0 : (i+j+k+l+1)/4
			; Reads 9 source rows of 9 bytes. dst and src share the same stride.
			; Uses eax, ecx, edx, flags and mm0-mm3, mm6, mm7. No emms is executed
			; here, so the MMX state is still live on return.
298 :	Isibaar	262	interpolate8x8_halfpel_hv_xmm:
299 :	edgomez	1192	mov eax, [esp+16] ; rounding
300 :			mov ecx, [esp+ 4] ; Dst
301 :			test eax, eax ; ZF = (rounding == 0); consumed by the jnz below
302 :			mov eax, [esp+ 8] ; Src
303 :			mov edx, [esp+12] ; stride
304 :	Isibaar	262
305 :			movq mm7, [mmx_one] ; mm7 = 0x01 in every byte: lsb mask for the correction
306 :
307 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
308 :			movq mm2, [eax]
309 :			movq mm3, [eax+1]
310 :			movq mm6, mm2
311 :			pavgb mm2, mm3
312 :	edgomez	1192	pxor mm3, mm6 ; mm2/mm3 ready
313 :	Isibaar	262
314 :			jnz near .rounding1 ; flags still valid: mov and the MMX ops above do not modify EFLAGS
315 :
			; each expansion emits two rows but advances ecx by one stride only,
			; hence the extra 'add ecx, edx' between expansions.
316 :			COPY_HV_SSE_RND0
317 :			add ecx, edx
318 :			COPY_HV_SSE_RND0
319 :			add ecx, edx
320 :			COPY_HV_SSE_RND0
321 :			add ecx, edx
322 :			COPY_HV_SSE_RND0
323 :			ret
324 :
325 :			.rounding1:
326 :			COPY_HV_SSE_RND1
327 :			add ecx, edx
328 :			COPY_HV_SSE_RND1
329 :			add ecx, edx
330 :			COPY_HV_SSE_RND1
331 :			add ecx, edx
332 :			COPY_HV_SSE_RND1
333 :	edgomez	1192	ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4