Annotation of /trunk/xvidcore/src/image/x86_asm/interpolate8x8_3dn.asm

Revision 434 - (view) (download)

1 :	chl	434	;/*****************************************************************************
2 :	Isibaar	262	; *
3 :	chl	434	; * XVID MPEG-4 VIDEO CODEC
4 :			; * 3dnow 8x8 block-based halfpel interpolation
5 :	Isibaar	262	; *
6 :	chl	434	; * Copyright(C) 2002 Peter Ross <pross@xvid.org>
7 :			; * Copyright(C) 2002 Michael Militzer <michael@xvid.org>
8 :	Isibaar	262	; *
9 :	chl	434	; * This program is an implementation of a part of one or more MPEG-4
10 :			; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
11 :			; * to use this software module in hardware or software products are
12 :			; * advised that its use may infringe existing patents or copyrights, and
13 :			; * any such use would be at such party's own risk. The original
14 :			; * developer of this software module and his/her company, and subsequent
15 :			; * editors and their companies, will have no liability for use of this
16 :			; * software or modifications or derivatives thereof.
17 :	Isibaar	262	; *
18 :	chl	434	; * This program is free software; you can redistribute it and/or modify
19 :			; * it under the terms of the GNU General Public License as published by
20 :			; * the Free Software Foundation; either version 2 of the License, or
21 :			; * (at your option) any later version.
22 :	Isibaar	262	; *
23 :	chl	434	; * This program is distributed in the hope that it will be useful,
24 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 :			; * GNU General Public License for more details.
27 :	Isibaar	262	; *
28 :	chl	434	; * You should have received a copy of the GNU General Public License
29 :			; * along with this program; if not, write to the Free Software
30 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31 :	Isibaar	262	; *
32 :	chl	434	; ****************************************************************************/
33 :	Isibaar	262
34 :			bits 32
35 :
36 :			%macro cglobal 1 ; cglobal <name>: export <name> as a global symbol
37 :			%ifdef PREFIX
38 :			global _%1 ; PREFIX defined: export the underscore-prefixed name...
39 :			%define %1 _%1
40 :			%else
41 :			global %1 ; PREFIX not defined: export the name unchanged
42 :			%endif
43 :			%endmacro
44 :
45 :			section .data
46 :
47 :
48 :			align 16 ; keep the 8-byte constant from straddling a 16-byte boundary
49 :
50 :			mmx_one: ; 8 bytes of 0x01: per-byte lsb mask used by the rounding corrections
51 :			times 8 db 1
52 :
53 :			section .text
54 :
55 :			cglobal interpolate8x8_halfpel_h_3dn ; horizontal half-pel
56 :			cglobal interpolate8x8_halfpel_v_3dn ; vertical half-pel
57 :			cglobal interpolate8x8_halfpel_hv_3dn ; horizontal+vertical half-pel
58 :
59 :
60 :			;===========================================================================
61 :			;
62 :			; void interpolate8x8_halfpel_h_3dn(uint8_t * const dst,
63 :			; const uint8_t * const src,
64 :			; const uint32_t stride,
65 :			; const uint32_t rounding);
66 :			;
67 :			;===========================================================================
68 :
69 :			%macro COPY_H_3DN_RND0 0 ; two rows of horizontal averaging, rounding up; in: eax=src, ecx=dst, edx=stride
70 :			movq mm0, [eax] ; row 0, pixels 0..7
71 :			pavgusb mm0, [eax+1] ; mm0 = (p[x] + p[x+1] + 1)/2 per byte
72 :			movq mm1, [eax+edx] ; row 1, pixels 0..7
73 :			pavgusb mm1, [eax+edx+1] ; same average for row 1
74 :			lea eax,[eax+2*edx] ; src += 2*stride (dst is advanced by the caller)
75 :			movq [ecx],mm0 ; store row 0
76 :			movq [ecx+edx],mm1 ; store row 1
77 :			%endmacro
78 :
79 :			%macro COPY_H_3DN_RND1 0 ; two rows of horizontal averaging, rounding down; in: eax=src, ecx=dst, edx=stride, mm7=0x01 in every byte
80 :			movq mm0, [eax] ; row 0, pixels 0..7
81 :			movq mm1, [eax+edx] ; row 1, pixels 0..7
82 :			movq mm4, mm0 ; keep a copy of row 0 for the xor below
83 :			movq mm5, mm1 ; keep a copy of row 1 for the xor below
84 :			movq mm2, [eax+1] ; row 0, pixels 1..8
85 :			movq mm3, [eax+edx+1] ; row 1, pixels 1..8
86 :			pavgusb mm0, mm2 ; mm0 = (i+j+1)/2 for row 0
87 :			pxor mm2, mm4 ; mm2 = i^j for row 0
88 :			pavgusb mm1, mm3 ; mm1 = (i+j+1)/2 for row 1
89 :			lea eax,[eax+2*edx] ; src += 2*stride (dst is advanced by the caller)
90 :			pxor mm3, mm5 ; mm3 = i^j for row 1
91 :			pand mm2, mm7 ; keep only the lsb of each byte: (i^j)&1
92 :			pand mm3, mm7
93 :			psubb mm0, mm2 ; (i+j)/2 = (i+j+1)/2 - ((i^j)&1)
94 :			movq [ecx], mm0 ; store row 0
95 :			psubb mm1, mm3
96 :			movq [ecx+edx], mm1 ; store row 1
97 :			%endmacro
98 :
99 :			align 16
100 :			interpolate8x8_halfpel_h_3dn: ; 8x8 horizontal half-pel: dst[x] = avg(src[x], src[x+1]); stack args, uses eax/ecx/edx and mm0-mm5, mm7
101 :
102 :			mov eax, [esp+16] ; rounding
103 :			mov ecx, [esp+ 4] ; Dst
104 :			test eax,eax ; ZF=1 when rounding == 0; the mov's below do not alter flags
105 :			mov eax, [esp+ 8] ; Src
106 :			mov edx, [esp+12] ; stride
107 :
108 :			jnz near .rounding1 ; rounding != 0: use the round-down variant
109 :
110 :			COPY_H_3DN_RND0 ; rows 0-1 (each macro call advances src by 2*stride)
111 :			lea ecx,[ecx+2*edx] ; dst += 2*stride
112 :			COPY_H_3DN_RND0 ; rows 2-3
113 :			lea ecx,[ecx+2*edx]
114 :			COPY_H_3DN_RND0 ; rows 4-5
115 :			lea ecx,[ecx+2*edx]
116 :			COPY_H_3DN_RND0 ; rows 6-7
117 :			ret ; NOTE(review): no emms/femms here - presumably issued by the caller; confirm
118 :
119 :			.rounding1:
120 :			; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
121 :			movq mm7, [mmx_one] ; mm7 = per-byte lsb mask expected by COPY_H_3DN_RND1
122 :			COPY_H_3DN_RND1 ; rows 0-1
123 :			lea ecx, [ecx+2*edx] ; dst += 2*stride
124 :			COPY_H_3DN_RND1 ; rows 2-3
125 :			lea ecx,[ecx+2*edx]
126 :			COPY_H_3DN_RND1 ; rows 4-5
127 :			lea ecx,[ecx+2*edx]
128 :			COPY_H_3DN_RND1 ; rows 6-7
129 :			ret
130 :
131 :
132 :			;===========================================================================
133 :			;
134 :			; void interpolate8x8_halfpel_v_3dn(uint8_t * const dst,
135 :			; const uint8_t * const src,
136 :			; const uint32_t stride,
137 :			; const uint32_t rounding);
138 :			;
139 :			;===========================================================================
140 :
141 :			%macro COPY_V_3DN_RND0 0 ; two rows of vertical averaging, rounding up; in: eax=src, ecx=dst, edx=stride
142 :			movq mm0, [eax] ; row n
143 :			movq mm1, [eax+edx] ; row n+1
144 :			pavgusb mm0, mm1 ; mm0 = (row n + row n+1 + 1)/2 per byte
145 :			pavgusb mm1, [eax+2*edx] ; mm1 = (row n+1 + row n+2 + 1)/2 per byte
146 :			lea eax,[eax+2*edx] ; src += 2*stride (dst is advanced by the caller)
147 :			movq [ecx],mm0 ; store output row n
148 :			movq [ecx+edx],mm1 ; store output row n+1
149 :			%endmacro
150 :
151 :			%macro COPY_V_3DN_RND1 0 ; two rows of vertical averaging, rounding down; in: eax=src row n+1, ecx=dst, edx=stride, mm2=src row n, mm7=0x01 in every byte
152 :			movq mm0, mm2 ; row n, carried over from the previous step
153 :			movq mm1, [eax] ; row n+1
154 :			movq mm2, [eax+edx] ; row n+2, kept in mm2 for the next invocation
155 :			lea eax,[eax+2*edx] ; src += 2*stride (dst is advanced by the caller)
156 :			movq mm4, mm0 ; copies for the xor terms
157 :			movq mm5, mm1
158 :			pavgusb mm0, mm1 ; mm0 = (row n + row n+1 + 1)/2
159 :			pxor mm4, mm1 ; mm4 = row n ^ row n+1
160 :			pavgusb mm1, mm2 ; mm1 = (row n+1 + row n+2 + 1)/2
161 :			pxor mm5, mm2 ; mm5 = row n+1 ^ row n+2
162 :			pand mm4, mm7 ; lsb's of (i^j)...
163 :			pand mm5, mm7 ; lsb's of (i^j)...
164 :			psubb mm0, mm4 ; ...are subtracted from result of pavgusb
165 :			movq [ecx], mm0 ; store output row n
166 :			psubb mm1, mm5 ; ...are subtracted from result of pavgusb
167 :			movq [ecx+edx], mm1 ; store output row n+1
168 :			%endmacro
169 :
170 :			align 16
171 :			interpolate8x8_halfpel_v_3dn: ; 8x8 vertical half-pel: dst row y = avg(src row y, src row y+1); stack args, uses eax/ecx/edx and mm0-mm2, mm4, mm5, mm7
172 :
173 :			mov eax, [esp+16] ; rounding
174 :			mov ecx, [esp+ 4] ; Dst
175 :			test eax,eax ; ZF=1 when rounding == 0; the mov's below do not alter flags
176 :			mov eax, [esp+ 8] ; Src
177 :			mov edx, [esp+12] ; stride
178 :
179 :			; we process 2 lines at a time
180 :
181 :			jnz near .rounding1 ; rounding != 0: use the round-down variant
182 :
183 :			COPY_V_3DN_RND0 ; rows 0-1 (each macro call advances src by 2*stride)
184 :			lea ecx, [ecx+2*edx] ; dst += 2*stride
185 :			COPY_V_3DN_RND0 ; rows 2-3
186 :			lea ecx, [ecx+2*edx]
187 :			COPY_V_3DN_RND0 ; rows 4-5
188 :			lea ecx, [ecx+2*edx]
189 :			COPY_V_3DN_RND0 ; rows 6-7
190 :			ret ; NOTE(review): no emms/femms here - presumably issued by the caller; confirm
191 :
192 :			.rounding1:
193 :			; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
194 :			movq mm7, [mmx_one] ; mm7 = per-byte lsb mask expected by COPY_V_3DN_RND1
195 :			movq mm2, [eax] ; loop invariant
196 :			add eax, edx ; src now points at row 1; row 0 is held in mm2
197 :
198 :			COPY_V_3DN_RND1 ; rows 0-1
199 :			lea ecx,[ecx+2*edx] ; dst += 2*stride
200 :			COPY_V_3DN_RND1 ; rows 2-3
201 :			lea ecx,[ecx+2*edx]
202 :			COPY_V_3DN_RND1 ; rows 4-5
203 :			lea ecx,[ecx+2*edx]
204 :			COPY_V_3DN_RND1 ; rows 6-7
205 :			ret
206 :
207 :
208 :			;===========================================================================
209 :			;
210 :			; void interpolate8x8_halfpel_hv_3dn(uint8_t * const dst,
211 :			; const uint8_t * const src,
212 :			; const uint32_t stride,
213 :			; const uint32_t rounding);
214 :			;
215 :			;
216 :			;===========================================================================
217 :
218 :			; The trick is to correct the result of 'pavgusb' with some combination of the
219 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgusb' (s and t).
220 :			; The boolean relations are:
221 :			; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
222 :			; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
223 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
224 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
225 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
226 :
227 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
228 :
229 :			%macro COPY_HV_3DN_RND0 0 ; two output rows of (i+j+k+l+2)/4; in: eax=src row n, ecx=dst, edx=stride, mm2=s and mm3=i^j of row n, mm7=0x01 in every byte
230 :			lea eax,[eax+edx] ; src -> row n+1
231 :
232 :			movq mm0, [eax] ; row n+1, pixels 0..7
233 :			movq mm1, [eax+1] ; row n+1, pixels 1..8
234 :
235 :			movq mm6, mm0
236 :			pavgusb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
237 :			lea eax,[eax+edx] ; src -> row n+2
238 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
239 :
240 :			por mm3, mm1 ; ij |= jk
241 :			movq mm6, mm2
242 :			pxor mm6, mm0 ; mm6 = s^t
243 :			pand mm3, mm6 ; (ij|jk) &= st
244 :			pavgusb mm2, mm0 ; mm2 = (s+t+1)/2
245 :			pand mm3, mm7 ; mask lsb
246 :			psubb mm2, mm3 ; apply.
247 :
248 :			movq [ecx], mm2 ; store first output row
249 :
250 :			movq mm2, [eax] ; row n+2, pixels 0..7
251 :			movq mm3, [eax+1] ; row n+2, pixels 1..8
252 :			movq mm6, mm2
253 :			pavgusb mm2, mm3 ; preserved for next iteration
254 :			lea ecx,[ecx+edx] ; dst -> second output row
255 :			pxor mm3, mm6 ; preserved for next iteration
256 :
257 :			por mm1, mm3 ; same correction as above, with rows n+1 and n+2
258 :			movq mm6, mm0
259 :			pxor mm6, mm2 ; mm6 = s^t
260 :			pand mm1, mm6
261 :			pavgusb mm0, mm2 ; mm0 = (s+t+1)/2
262 :
263 :			pand mm1, mm7 ; mask lsb
264 :			psubb mm0, mm1 ; apply.
265 :
266 :			movq [ecx], mm0 ; store second output row (the caller advances dst to the next one)
267 :			%endmacro
268 :
269 :			%macro COPY_HV_3DN_RND1 0 ; two output rows of (i+j+k+l+1)/4; in: eax=src row n, ecx=dst, edx=stride, mm2=s and mm3=i^j of row n, mm7=0x01 in every byte
270 :			lea eax,[eax+edx] ; src -> row n+1
271 :
272 :			movq mm0, [eax] ; row n+1, pixels 0..7
273 :			movq mm1, [eax+1] ; row n+1, pixels 1..8
274 :
275 :			movq mm6, mm0
276 :			pavgusb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
277 :			lea eax,[eax+edx] ; src -> row n+2
278 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
279 :
280 :			pand mm3, mm1 ; ij &= jk
281 :			movq mm6, mm2
282 :			pxor mm6, mm0 ; mm6 = s^t
283 :			por mm3, mm6 ; (ij&jk) |= st
284 :			pavgusb mm2, mm0 ; mm2 = (s+t+1)/2
285 :			pand mm3, mm7 ; mask lsb
286 :			psubb mm2, mm3 ; apply.
287 :
288 :			movq [ecx], mm2 ; store first output row
289 :
290 :			movq mm2, [eax] ; row n+2, pixels 0..7
291 :			movq mm3, [eax+1] ; row n+2, pixels 1..8
292 :			movq mm6, mm2
293 :			pavgusb mm2, mm3 ; preserved for next iteration
294 :			lea ecx,[ecx+edx] ; dst -> second output row
295 :			pxor mm3, mm6 ; preserved for next iteration
296 :
297 :			pand mm1, mm3 ; same correction as above, with rows n+1 and n+2
298 :			movq mm6, mm0
299 :			pxor mm6, mm2 ; mm6 = s^t
300 :			por mm1, mm6
301 :			pavgusb mm0, mm2 ; mm0 = (s+t+1)/2
302 :			pand mm1, mm7 ; mask lsb
303 :			psubb mm0, mm1 ; apply.
304 :
305 :			movq [ecx], mm0 ; store second output row (the caller advances dst to the next one)
306 :			%endmacro
307 :
308 :			align 16
309 :			interpolate8x8_halfpel_hv_3dn: ; 8x8 diagonal half-pel: average of the 2x2 neighbourhood; stack args, uses eax/ecx/edx and mm0-mm3, mm6, mm7
310 :			mov eax, [esp+16] ; rounding
311 :			mov ecx, [esp+ 4] ; Dst
312 :			test eax,eax ; ZF=1 when rounding == 0; consumed by the jnz below
313 :			mov eax, [esp+ 8] ; Src
314 :			mov edx, [esp+12] ; stride
315 :
316 :			movq mm7, [mmx_one] ; per-byte lsb mask used by both macro variants
317 :
318 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
319 :			movq mm2, [eax] ; row 0, pixels 0..7
320 :			movq mm3, [eax+1] ; row 0, pixels 1..8
321 :			movq mm6, mm2
322 :			pavgusb mm2, mm3
323 :			pxor mm3, mm6 ; mm2/mm3 ready
324 :
325 :			jnz near .rounding1 ; flags still come from the test above: mov and MMX/3DNow! ops do not write EFLAGS
326 :
327 :			COPY_HV_3DN_RND0 ; rows 0-1 (the macro leaves dst on its second row)
328 :			add ecx, edx ; dst -> next row pair
329 :			COPY_HV_3DN_RND0 ; rows 2-3
330 :			add ecx, edx
331 :			COPY_HV_3DN_RND0 ; rows 4-5
332 :			add ecx, edx
333 :			COPY_HV_3DN_RND0 ; rows 6-7
334 :			ret ; NOTE(review): no emms/femms here - presumably issued by the caller; confirm
335 :
336 :			.rounding1:
337 :			COPY_HV_3DN_RND1 ; rows 0-1
338 :			add ecx, edx ; dst -> next row pair
339 :			COPY_HV_3DN_RND1 ; rows 2-3
340 :			add ecx, edx
341 :			COPY_HV_3DN_RND1 ; rows 4-5
342 :			add ecx, edx
343 :			COPY_HV_3DN_RND1 ; rows 6-7
344 :			ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4