Annotation of /trunk/xvidcore/src/image/x86_asm/interpolate8x8_xmm.asm

Revision 262 - (view) (download)

1 :	Isibaar	262	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * xmm 8x8 block-based halfpel interpolation
5 :			; *
6 :			; * This program is free software; you can redistribute it and/or modify
7 :			; * it under the terms of the GNU General Public License as published by
8 :			; * the Free Software Foundation; either version 2 of the License, or
9 :			; * (at your option) any later version.
10 :			; *
11 :			; * This program is distributed in the hope that it will be useful,
12 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 :			; * GNU General Public License for more details.
15 :			; *
16 :			; * You should have received a copy of the GNU General Public License
17 :			; * along with this program; if not, write to the Free Software
18 :			; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 :			; *
20 :			; *************************************************************************/
21 :
22 :			;/**************************************************************************
23 :			; *
24 :			; * History:
25 :			; *
26 :			; * 04.06.2002 rewrote some funcs, mostly XMM. -Skal-
27 :			; * Heavily tuned for overlap and AGI-stalls avoidance
28 :			; * 04.02.2002 initial version (Isibaar)
29 :			; *
30 :			; *************************************************************************/
31 :
32 :
33 :			bits 32 ; generate 32-bit code
34 :			; cglobal NAME: declare NAME global; if PREFIX is defined, export it as _NAME and alias NAME to _NAME
35 :			%macro cglobal 1
36 :			%ifndef PREFIX
37 :			global %1 ; no PREFIX: export the symbol name exactly as written
38 :			%else
39 :			global _%1 ; PREFIX: export with a leading underscore, then alias the plain name to it
40 :			%define %1 _%1
41 :			%endif
42 :			%endmacro
43 :
44 :			section .data
45 :			; mmx_one: eight bytes of 0x01. The routines in this file load it into mm7
46 :			; and use it as a per-byte mask that keeps only the least-significant bit.
47 :			align 16
48 :
49 :			mmx_one:
50 :			times 8 db 1
51 :
52 :			section .text
53 :			; entry points made global through cglobal (which adds a leading '_' when PREFIX is defined)
54 :			cglobal interpolate8x8_halfpel_h_xmm
55 :			cglobal interpolate8x8_halfpel_v_xmm
56 :			cglobal interpolate8x8_halfpel_hv_xmm
57 :
58 :			;===========================================================================
59 :			;
60 :			; void interpolate8x8_halfpel_h_xmm(uint8_t * const dst,
61 :			; const uint8_t * const src,
62 :			; const uint32_t stride,
63 :			; const uint32_t rounding);
64 :			;
65 :			;===========================================================================
66 :
67 :			%macro COPY_H_SSE_RND0 0 ; two rows of horizontal halfpel, rounding up: dst[x] = (src[x]+src[x+1]+1)/2. In: eax=src, ecx=dst, edx=stride. Out: eax += 2*stride (ecx is not advanced). Uses mm0, mm1.
68 :			movq mm0, [eax] ; row 0: src[0..7]
69 :			pavgb mm0, [eax+1] ; row 0: rounded-up byte average with src[1..8]
70 :			movq mm1, [eax+edx] ; row 1: src[0..7]
71 :			pavgb mm1, [eax+edx+1] ; row 1: rounded-up byte average with src[1..8]
72 :			lea eax,[eax+2*edx] ; src += 2*stride
73 :			movq [ecx],mm0 ; store row 0
74 :			movq [ecx+edx],mm1 ; store row 1
75 :			%endmacro
76 :
77 :			%macro COPY_H_SSE_RND1 0 ; two rows of horizontal halfpel, rounding down: dst[x] = (src[x]+src[x+1])/2. In: eax=src, ecx=dst, edx=stride, mm7=0x01 in every byte. Out: eax += 2*stride (ecx is not advanced). Uses mm0-mm5.
78 :			movq mm0, [eax] ; row 0: i = src[0..7]
79 :			movq mm1, [eax+edx] ; row 1: i = src[0..7]
80 :			movq mm4, mm0 ; row 0: copy of i, needed for i^j after pavgb overwrites mm0
81 :			movq mm5, mm1 ; row 1: copy of i
82 :			movq mm2, [eax+1] ; row 0: j = src[1..8]
83 :			movq mm3, [eax+edx+1] ; row 1: j = src[1..8]
84 :			pavgb mm0, mm2 ; row 0: (i+j+1)/2
85 :			pxor mm2, mm4 ; row 0: i^j
86 :			pavgb mm1, mm3 ; row 1: (i+j+1)/2
87 :			lea eax,[eax+2*edx] ; src += 2*stride
88 :			pxor mm3, mm5 ; row 1: i^j
89 :			pand mm2, mm7 ; row 0: (i^j)&1
90 :			pand mm3, mm7 ; row 1: (i^j)&1
91 :			psubb mm0, mm2 ; row 0: (i+j+1)/2 - (i^j)&1 = (i+j)/2
92 :			movq [ecx], mm0 ; store row 0
93 :			psubb mm1, mm3 ; row 1: (i+j)/2
94 :			movq [ecx+edx], mm1 ; store row 1
95 :			%endmacro
96 :
97 :			align 16
98 :			interpolate8x8_halfpel_h_xmm: ; 8 rows of 8 pixels, each the average of src[x] and src[x+1]: (a+b+1)/2 if rounding == 0, (a+b)/2 otherwise
99 :			; stack args: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding. Uses eax, ecx, edx, mm0-mm5, mm7. NOTE(review): no emms before ret - presumably left to the caller, confirm.
100 :			mov eax, [esp+16]; rounding
101 :			mov ecx, [esp+ 4] ; Dst
102 :			test eax,eax ; ZF = (rounding == 0); the mov instructions below do not change flags
103 :			mov eax, [esp+ 8] ; Src
104 :			mov edx, [esp+12] ; stride
105 :
106 :			jnz near .rounding1
107 :			; rounding == 0: four expansions of two rows each; the macro advances eax, ecx is advanced here
108 :			COPY_H_SSE_RND0
109 :			lea ecx,[ecx+2*edx]
110 :			COPY_H_SSE_RND0
111 :			lea ecx,[ecx+2*edx]
112 :			COPY_H_SSE_RND0
113 :			lea ecx,[ecx+2*edx]
114 :			COPY_H_SSE_RND0
115 :			ret
116 :
117 :			.rounding1:
118 :			; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
119 :			movq mm7, [mmx_one] ; mm7 = 0x01 in every byte, the LSB mask expected by COPY_H_SSE_RND1
120 :			COPY_H_SSE_RND1
121 :			lea ecx, [ecx+2*edx]
122 :			COPY_H_SSE_RND1
123 :			lea ecx,[ecx+2*edx]
124 :			COPY_H_SSE_RND1
125 :			lea ecx,[ecx+2*edx]
126 :			COPY_H_SSE_RND1
127 :			ret
128 :
129 :			;===========================================================================
130 :			;
131 :			; void interpolate8x8_halfpel_v_xmm(uint8_t * const dst,
132 :			; const uint8_t * const src,
133 :			; const uint32_t stride,
134 :			; const uint32_t rounding);
135 :			;
136 :			;===========================================================================
137 :
138 :			%macro COPY_V_SSE_RND0 0 ; two rows of vertical halfpel, rounding up: dst row = (src row + next src row + 1)/2. In: eax=src, ecx=dst, edx=stride. Out: eax += 2*stride (ecx is not advanced). Uses mm0, mm1.
139 :			movq mm0, [eax] ; source row y
140 :			movq mm1, [eax+edx] ; source row y+1
141 :			pavgb mm0, mm1 ; output row 0 = rounded-up average of rows y and y+1
142 :			pavgb mm1, [eax+2*edx] ; output row 1 = rounded-up average of rows y+1 and y+2
143 :			lea eax,[eax+2*edx] ; src += 2*stride (row y+2 is loaded again as row y by the next expansion)
144 :			movq [ecx],mm0 ; store output row 0
145 :			movq [ecx+edx],mm1 ; store output row 1
146 :			%endmacro
147 :
148 :			%macro COPY_V_SSE_RND1 0 ; two rows of vertical halfpel, rounding down: (a+b)/2. In: mm2=previous source row, eax=next source row, ecx=dst, edx=stride, mm7=0x01 in every byte. Out: eax += 2*stride, mm2=last row loaded. Uses mm0-mm2, mm4, mm5.
149 :			movq mm0, mm2 ; row y, carried in mm2 from the caller or the previous expansion
150 :			movq mm1, [eax] ; row y+1
151 :			movq mm2, [eax+edx] ; row y+2, left in mm2 for the next expansion
152 :			lea eax,[eax+2*edx] ; src += 2*stride
153 :			movq mm4, mm0 ; copy of row y for the xor
154 :			movq mm5, mm1 ; copy of row y+1 for the xor
155 :			pavgb mm0, mm1 ; output row 0: rounded-up average of rows y and y+1
156 :			pxor mm4, mm1 ; row y ^ row y+1
157 :			pavgb mm1, mm2 ; output row 1: rounded-up average of rows y+1 and y+2
158 :			pxor mm5, mm2 ; row y+1 ^ row y+2
159 :			pand mm4, mm7 ; lsb's of (i^j)...
160 :			pand mm5, mm7 ; lsb's of (i^j)...
161 :			psubb mm0, mm4 ; ...are subtracted from result of pavgb
162 :			movq [ecx], mm0 ; store output row 0
163 :			psubb mm1, mm5 ; ...are subtracted from result of pavgb
164 :			movq [ecx+edx], mm1 ; store output row 1
165 :			%endmacro
166 :
167 :			align 16
168 :			interpolate8x8_halfpel_v_xmm: ; 8 rows of 8 pixels, each the average of a source pixel and the one a stride below it: (a+b+1)/2 if rounding == 0, (a+b)/2 otherwise
169 :			; stack args: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding. Uses eax, ecx, edx, mm0-mm2, mm4, mm5, mm7. NOTE(review): no emms before ret - presumably left to the caller, confirm.
170 :			mov eax, [esp+16]; rounding
171 :			mov ecx, [esp+ 4] ; Dst
172 :			test eax,eax ; ZF = (rounding == 0); the mov instructions below do not change flags
173 :			mov eax, [esp+ 8] ; Src
174 :			mov edx, [esp+12] ; stride
175 :
176 :			; we process 2 lines at a time
177 :
178 :			jnz near .rounding1
179 :			; rounding == 0: four expansions of two rows each; the macro advances eax, ecx is advanced here
180 :			COPY_V_SSE_RND0
181 :			lea ecx, [ecx+2*edx]
182 :			COPY_V_SSE_RND0
183 :			lea ecx, [ecx+2*edx]
184 :			COPY_V_SSE_RND0
185 :			lea ecx, [ecx+2*edx]
186 :			COPY_V_SSE_RND0
187 :			ret
188 :
189 :			.rounding1:
190 :			; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
191 :			movq mm7, [mmx_one] ; mm7 = 0x01 in every byte, the LSB mask expected by COPY_V_SSE_RND1
192 :			movq mm2, [eax] ; first source row; COPY_V_SSE_RND1 expects the previous row in mm2
193 :			add eax, edx ; eax -> second source row
194 :
195 :			COPY_V_SSE_RND1
196 :			lea ecx,[ecx+2*edx]
197 :			COPY_V_SSE_RND1
198 :			lea ecx,[ecx+2*edx]
199 :			COPY_V_SSE_RND1
200 :			lea ecx,[ecx+2*edx]
201 :			COPY_V_SSE_RND1
202 :			ret
203 :
204 :			;===========================================================================
205 :			;
206 :			; void interpolate8x8_halfpel_hv_xmm(uint8_t * const dst,
207 :			; const uint8_t * const src,
208 :			; const uint32_t stride,
209 :			; const uint32_t rounding);
210 :			;
211 :			;
212 :			;===========================================================================
213 :
214 :			; The trick is to correct the result of 'pavgb' with some combination of the
215 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
216 :			; The boolean relations are:
217 :			; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
218 :			; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
219 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
220 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
221 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
222 :
223 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
224 :
225 :			%macro COPY_HV_SSE_RND0 0 ; two rows of 2x2 (h+v) halfpel using the (ij|kl)&st correction, i.e. (i+j+k+l+2)/4. In: eax=source row whose horizontal average / xor are already in mm2 / mm3, ecx=dst, edx=stride, mm7=0x01 in every byte. Out: eax += 2*stride, ecx += stride, mm2/mm3 hold average/xor of the last row read. Uses mm0, mm1, mm6.
226 :			lea eax,[eax+edx] ; src -> next row
227 :
228 :			movq mm0, [eax] ; this row, pixels 0..7
229 :			movq mm1, [eax+1] ; this row, pixels 1..8
230 :
231 :			movq mm6, mm0 ; copy for the xor below
232 :			pavgb mm0, mm1 ; mm0 = t = horizontal pavgb of this row. preserved for next step
233 :			lea eax,[eax+edx] ; src -> next row (read in the second half)
234 :			pxor mm1, mm6 ; mm1 = kl = xor of this row's pixel pair. preserved for next step
235 :
236 :			por mm3, mm1 ; mm3 = ij|kl
237 :			movq mm6, mm2
238 :			pxor mm6, mm0 ; mm6 = s^t
239 :			pand mm3, mm6 ; (ij|kl) &= st
240 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
241 :			pand mm3, mm7 ; mask lsb
242 :			psubb mm2, mm3 ; apply.
243 :
244 :			movq [ecx], mm2 ; store first output row
245 :			; second output row: same steps with mm0/mm1 as the upper row and mm2/mm3 reloaded from the new lower row
246 :			movq mm2, [eax]
247 :			movq mm3, [eax+1]
248 :			movq mm6, mm2
249 :			pavgb mm2, mm3 ; preserved for next iteration
250 :			lea ecx,[ecx+edx] ; dst -> second output row
251 :			pxor mm3, mm6 ; preserved for next iteration
252 :
253 :			por mm1, mm3 ; xor of upper pair | xor of lower pair
254 :			movq mm6, mm0
255 :			pxor mm6, mm2 ; s^t for this row pair
256 :			pand mm1, mm6
257 :			pavgb mm0, mm2 ; (s+t+1)/2
258 :
259 :			pand mm1, mm7 ; mask lsb
260 :			psubb mm0, mm1 ; apply
261 :
262 :			movq [ecx], mm0 ; store second output row
263 :			%endmacro
264 :
265 :			%macro COPY_HV_SSE_RND1 0 ; two rows of 2x2 (h+v) halfpel using the (ij&kl)|st correction, i.e. (i+j+k+l+1)/4. In: eax=source row whose horizontal average / xor are already in mm2 / mm3, ecx=dst, edx=stride, mm7=0x01 in every byte. Out: eax += 2*stride, ecx += stride, mm2/mm3 hold average/xor of the last row read. Uses mm0, mm1, mm6.
266 :			lea eax,[eax+edx] ; src -> next row
267 :
268 :			movq mm0, [eax] ; this row, pixels 0..7
269 :			movq mm1, [eax+1] ; this row, pixels 1..8
270 :
271 :			movq mm6, mm0 ; copy for the xor below
272 :			pavgb mm0, mm1 ; mm0 = t = horizontal pavgb of this row. preserved for next step
273 :			lea eax,[eax+edx] ; src -> next row (read in the second half)
274 :			pxor mm1, mm6 ; mm1 = kl = xor of this row's pixel pair. preserved for next step
275 :
276 :			pand mm3, mm1 ; mm3 = ij&kl
277 :			movq mm6, mm2
278 :			pxor mm6, mm0 ; mm6 = s^t
279 :			por mm3, mm6 ; (ij&kl) |= st
280 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
281 :			pand mm3, mm7 ; mask lsb
282 :			psubb mm2, mm3 ; apply
283 :
284 :			movq [ecx], mm2 ; store first output row
285 :			; second output row: same steps with mm0/mm1 as the upper row and mm2/mm3 reloaded from the new lower row
286 :			movq mm2, [eax]
287 :			movq mm3, [eax+1]
288 :			movq mm6, mm2
289 :			pavgb mm2, mm3 ; preserved for next iteration
290 :			lea ecx,[ecx+edx] ; dst -> second output row
291 :			pxor mm3, mm6 ; preserved for next iteration
292 :
293 :			pand mm1, mm3 ; xor of upper pair & xor of lower pair
294 :			movq mm6, mm0
295 :			pxor mm6, mm2 ; s^t for this row pair
296 :			por mm1, mm6
297 :			pavgb mm0, mm2 ; (s+t+1)/2
298 :			pand mm1, mm7 ; mask lsb
299 :			psubb mm0, mm1 ; apply
300 :
301 :			movq [ecx], mm0 ; store second output row
302 :			%endmacro
303 :
304 :			align 16
305 :			interpolate8x8_halfpel_hv_xmm: ; 8 rows of 8 pixels, each the average of a 2x2 source neighbourhood: (i+j+k+l+2)/4 if rounding == 0, (i+j+k+l+1)/4 otherwise. Stack args: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding. Uses eax, ecx, edx, mm0-mm3, mm6, mm7. NOTE(review): no emms before ret - presumably left to the caller, confirm.
306 :			mov eax, [esp+16] ; rounding
307 :			mov ecx, [esp+ 4] ; Dst
308 :			test eax,eax ; ZF = (rounding == 0); consumed by the jnz below
309 :			mov eax, [esp+ 8] ; Src
310 :			mov edx, [esp+12] ; stride
311 :
312 :			movq mm7, [mmx_one] ; mm7 = 0x01 in every byte, the LSB mask used by both macros
313 :
314 :			; state carried between macro expansions: mm2=(i+j+1)/2 and mm3= i^j of the current source row
315 :			movq mm2, [eax]
316 :			movq mm3, [eax+1]
317 :			movq mm6, mm2
318 :			pavgb mm2, mm3
319 :			pxor mm3, mm6 ; mm2/mm3 ready
320 :			; none of the mov/MMX instructions since 'test' modify EFLAGS, so ZF still reflects rounding
321 :			jnz near .rounding1
322 :			; rounding == 0: each expansion writes two rows and advances ecx by one stride; the second stride is added here
323 :			COPY_HV_SSE_RND0
324 :			add ecx, edx
325 :			COPY_HV_SSE_RND0
326 :			add ecx, edx
327 :			COPY_HV_SSE_RND0
328 :			add ecx, edx
329 :			COPY_HV_SSE_RND0
330 :			ret
331 :
332 :			.rounding1:
333 :			COPY_HV_SSE_RND1
334 :			add ecx, edx
335 :			COPY_HV_SSE_RND1
336 :			add ecx, edx
337 :			COPY_HV_SSE_RND1
338 :			add ecx, edx
339 :			COPY_HV_SSE_RND1
340 :			ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4