; Source: /trunk/xvidcore/src/image/x86_asm/interpolate8x8_3dn.asm
;
; Revision 1382 (text captured from a ViewVC annotate view)

;/*****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - 3dnow 8x8 block-based halfpel interpolation -
; *
; *  Copyright(C) 2001 Peter Ross <pross@xvid.org>
; *               2002 Michael Militzer <isibaar@xvid.org>
; *               2002 Pascal Massimino <skal@planet-d.net>
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; ****************************************************************************/

BITS 32

; cglobal <symbol>
; Declares <symbol> as a global (exported) label.
; When PREFIX is defined, the exported name is _<symbol> and <symbol> is
; %define'd to that underscored name, so every later use of the bare name
; (including the label definition itself) resolves to the prefixed symbol.
%macro cglobal 1
    %ifdef PREFIX
        global _%1
        %define %1 _%1
    %else
        global %1
    %endif
%endmacro

;=============================================================================
; Read Only data
;=============================================================================

; The COFF variant of the section declaration carries no align= qualifier;
; every other output format requests 16-byte section alignment.
%ifdef FORMAT_COFF
SECTION .rodata data
%else
SECTION .rodata data align=16
%endif

; mmx_one: eight bytes of 0x01. Loaded into mm7 by the rounding paths and
; used with pand to keep only the least-significant bit of every byte lane.
ALIGN 16
mmx_one:
        times 8 db 1

;=============================================================================
; Code
;=============================================================================

SECTION .text

; Entry points exported by this file (see the cglobal macro for the optional
; underscore prefix).
cglobal interpolate8x8_halfpel_h_3dn
cglobal interpolate8x8_halfpel_v_3dn
cglobal interpolate8x8_halfpel_hv_3dn

;-----------------------------------------------------------------------------
;
; void interpolate8x8_halfpel_h_3dn(uint8_t * const dst,
;                                   const uint8_t * const src,
;                                   const uint32_t stride,
;                                   const uint32_t rounding);
;
;-----------------------------------------------------------------------------

; COPY_H_3DN_RND0
; Horizontal half-pel average of two consecutive rows, rounded up:
;   dst[x] = (src[x] + src[x+1] + 1) / 2      for x = 0..7
;
; In:    eax = source row, ecx = destination row, edx = stride
; Out:   eax advanced by two rows; ecx is NOT advanced (the caller does it)
; Uses:  mm0, mm1
%macro COPY_H_3DN_RND0 0
        movq    mm0, [eax]              ; row 0, bytes 0..7
        pavgusb mm0, [eax+1]            ; averaged with bytes 1..8
        movq    mm1, [eax+edx]          ; row 1, bytes 0..7
        pavgusb mm1, [eax+edx+1]        ; averaged with bytes 1..8
        lea     eax, [eax+2*edx]        ; src += 2 rows
        movq    [ecx], mm0
        movq    [ecx+edx], mm1
%endmacro

; COPY_H_3DN_RND1
; Horizontal half-pel average of two consecutive rows, rounded down:
;   dst[x] = (src[x] + src[x+1]) / 2          for x = 0..7
; pavgusb rounds up, so the lsb of (src[x] ^ src[x+1]) is subtracted from
; its result:  (i+j)/2 = (i+j+1)/2 - ((i^j) & 1).
;
; In:    eax = source row, ecx = destination row, edx = stride
;        mm7 = 0x01 in every byte
; Out:   eax advanced by two rows; ecx is NOT advanced (the caller does it)
; Uses:  mm0-mm5
%macro COPY_H_3DN_RND1 0
        movq    mm0, [eax]              ; i, row 0
        movq    mm1, [eax+edx]          ; i, row 1
        movq    mm4, mm0                ; keep i (row 0) for the xor
        movq    mm5, mm1                ; keep i (row 1) for the xor
        movq    mm2, [eax+1]            ; j, row 0
        movq    mm3, [eax+edx+1]        ; j, row 1
        pavgusb mm0, mm2                ; (i+j+1)/2, row 0
        pxor    mm2, mm4                ; i^j, row 0
        pavgusb mm1, mm3                ; (i+j+1)/2, row 1
        lea     eax, [eax+2*edx]        ; src += 2 rows
        pxor    mm3, mm5                ; i^j, row 1
        pand    mm2, mm7                ; keep only the lsb of i^j
        pand    mm3, mm7
        psubb   mm0, mm2                ; undo the round-up where i+j is odd
        movq    [ecx], mm0
        psubb   mm1, mm3
        movq    [ecx+edx], mm1
%endmacro

; interpolate8x8_halfpel_h_3dn(dst, src, stride, rounding)
;
; Fills an 8x8 block at dst with the average of each source byte and its
; right-hand neighbour. rounding == 0 rounds the average up (pavgusb alone);
; any non-zero rounding rounds it down.
;
; In:    [esp+ 4] = dst, [esp+ 8] = src, [esp+12] = stride, [esp+16] = rounding
; Reads: 9 bytes from each of 8 source rows
; Clobb: eax, ecx, edx, flags, mm0, mm1 (plus mm2-mm5 and mm7 on the
;        rounding path). No emms/femms is executed here.
ALIGN 16
interpolate8x8_halfpel_h_3dn:

        mov     eax, [esp+16]           ; rounding
        mov     ecx, [esp+ 4]           ; dst
        test    eax, eax                ; ZF = (rounding == 0)
        mov     eax, [esp+ 8]           ; src   (mov does not touch the flags)
        mov     edx, [esp+12]           ; stride

        jnz     near .rounding1

        ; rounding == 0: four passes of two rows each
        COPY_H_3DN_RND0
        lea     ecx, [ecx+2*edx]
        COPY_H_3DN_RND0
        lea     ecx, [ecx+2*edx]
        COPY_H_3DN_RND0
        lea     ecx, [ecx+2*edx]
        COPY_H_3DN_RND0
        ret

.rounding1:
        ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
        movq    mm7, [mmx_one]          ; per-byte lsb mask
        COPY_H_3DN_RND1
        lea     ecx, [ecx+2*edx]
        COPY_H_3DN_RND1
        lea     ecx, [ecx+2*edx]
        COPY_H_3DN_RND1
        lea     ecx, [ecx+2*edx]
        COPY_H_3DN_RND1
        ret


;-----------------------------------------------------------------------------
;
; void interpolate8x8_halfpel_v_3dn(uint8_t * const dst,
;                                   const uint8_t * const src,
;                                   const uint32_t stride,
;                                   const uint32_t rounding);
;
;-----------------------------------------------------------------------------

; COPY_V_3DN_RND0
; Vertical half-pel average producing two output rows, rounded up:
;   dst row 0 = (src row 0 + src row 1 + 1) / 2
;   dst row 1 = (src row 1 + src row 2 + 1) / 2
;
; In:    eax = source row 0, ecx = destination row, edx = stride
; Out:   eax advanced by two rows; ecx is NOT advanced (the caller does it)
; Uses:  mm0, mm1
%macro COPY_V_3DN_RND0 0
        movq    mm0, [eax]              ; row 0
        movq    mm1, [eax+edx]          ; row 1
        pavgusb mm0, mm1                ; avg(row 0, row 1)
        pavgusb mm1, [eax+2*edx]        ; avg(row 1, row 2)
        lea     eax, [eax+2*edx]        ; src += 2 rows
        movq    [ecx], mm0
        movq    [ecx+edx], mm1
%endmacro

; COPY_V_3DN_RND1
; Vertical half-pel average producing two output rows, rounded down:
;   dst = (upper + lower) / 2 = pavgusb(upper, lower) - ((upper ^ lower) & 1)
;
; In:    mm2 = source row above the one eax points to (carried between
;              invocations, so each source row is loaded only once)
;        eax = next source row, ecx = destination row, edx = stride
;        mm7 = 0x01 in every byte
; Out:   eax advanced by two rows, mm2 = last source row loaded;
;        ecx is NOT advanced (the caller does it)
; Uses:  mm0, mm1, mm4, mm5
%macro COPY_V_3DN_RND1 0
        movq    mm0, mm2                ; row n   (carried over)
        movq    mm1, [eax]              ; row n+1
        movq    mm2, [eax+edx]          ; row n+2 (carried to the next pass)
        lea     eax, [eax+2*edx]        ; src += 2 rows
        movq    mm4, mm0
        movq    mm5, mm1
        pavgusb mm0, mm1                ; rounded-up avg(row n,   row n+1)
        pxor    mm4, mm1                ; row n   ^ row n+1
        pavgusb mm1, mm2                ; rounded-up avg(row n+1, row n+2)
        pxor    mm5, mm2                ; row n+1 ^ row n+2
        pand    mm4, mm7                ; lsb's of (i^j)...
        pand    mm5, mm7                ; lsb's of (i^j)...
        psubb   mm0, mm4                ; ...are subtracted from result of pavgusb
        movq    [ecx], mm0
        psubb   mm1, mm5                ; ...are subtracted from result of pavgusb
        movq    [ecx+edx], mm1
%endmacro

; interpolate8x8_halfpel_v_3dn(dst, src, stride, rounding)
;
; Fills an 8x8 block at dst with the average of each source byte and the
; byte one row below it. rounding == 0 rounds the average up (pavgusb
; alone); any non-zero rounding rounds it down.
;
; In:    [esp+ 4] = dst, [esp+ 8] = src, [esp+12] = stride, [esp+16] = rounding
; Reads: 8 bytes from each of 9 source rows
; Clobb: eax, ecx, edx, flags, mm0, mm1 (plus mm2, mm4, mm5 and mm7 on the
;        rounding path). No emms/femms is executed here.
ALIGN 16
interpolate8x8_halfpel_v_3dn:

        mov     eax, [esp+16]           ; rounding
        mov     ecx, [esp+ 4]           ; dst
        test    eax, eax                ; ZF = (rounding == 0)
        mov     eax, [esp+ 8]           ; src   (mov does not touch the flags)
        mov     edx, [esp+12]           ; stride

        ; we process 2 lines at a time

        jnz     near .rounding1

        COPY_V_3DN_RND0
        lea     ecx, [ecx+2*edx]
        COPY_V_3DN_RND0
        lea     ecx, [ecx+2*edx]
        COPY_V_3DN_RND0
        lea     ecx, [ecx+2*edx]
        COPY_V_3DN_RND0
        ret

.rounding1:
        ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
        movq    mm7, [mmx_one]          ; per-byte lsb mask
        movq    mm2, [eax]              ; loop invariant: mm2 = previous row
        add     eax, edx                ; eax -> row 1

        COPY_V_3DN_RND1
        lea     ecx, [ecx+2*edx]
        COPY_V_3DN_RND1
        lea     ecx, [ecx+2*edx]
        COPY_V_3DN_RND1
        lea     ecx, [ecx+2*edx]
        COPY_V_3DN_RND1
        ret


;-----------------------------------------------------------------------------
;
; void interpolate8x8_halfpel_hv_3dn(uint8_t * const dst,
;                                    const uint8_t * const src,
;                                    const uint32_t stride,
;                                    const uint32_t rounding);
;
;
;-----------------------------------------------------------------------------

; The trick is to correct the result of 'pavgusb' with some combination of the
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgusb' (s and t).
; The boolean relations are:
;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
; Only the least-significant bit of each correction term is subtracted.

; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).

; COPY_HV_3DN_RND0
; Produces two output rows of the 2x2 half-pel average rounded as
; (i+j+k+l+2)/4, computed as  (s+t+1)/2 - ((ij|kl) & st & 1)  where
; i,j are horizontal neighbours in the upper row and k,l in the lower row.
;
; In:    eax = last source row already consumed
;        ecx = first destination row, edx = stride
;        mm2 = pavgusb(row[x], row[x+1]) of that source row    (s)
;        mm3 = row[x] ^ row[x+1]         of that source row    (ij)
;        mm7 = 0x01 in every byte
; Out:   eax advanced by two rows, ecx advanced by ONE row (the caller adds
;        the second), mm2/mm3 refreshed for the newest source row
; Uses:  mm0, mm1, mm6
%macro COPY_HV_3DN_RND0 0
        lea     eax, [eax+edx]          ; next source row

        movq    mm0, [eax]              ; k
        movq    mm1, [eax+1]            ; l

        movq    mm6, mm0
        pavgusb mm0, mm1                ; mm0 = t = (k+l+1)/2. preserved for next step
        lea     eax, [eax+edx]
        pxor    mm1, mm6                ; mm1 = kl = k^l.      preserved for next step

        por     mm3, mm1                ; ij |= kl
        movq    mm6, mm2
        pxor    mm6, mm0                ; mm6 = s^t
        pand    mm3, mm6                ; (ij|kl) &= st
        pavgusb mm2, mm0                ; mm2 = (s+t+1)/2
        pand    mm3, mm7                ; mask lsb
        psubb   mm2, mm3                ; apply.

        movq    [ecx], mm2

        ; second row: same computation with the register roles swapped
        ; (mm0/mm1 now hold the upper row, mm2/mm3 receive the lower row)
        movq    mm2, [eax]
        movq    mm3, [eax+1]
        movq    mm6, mm2
        pavgusb mm2, mm3                ; preserved for next iteration
        lea     ecx, [ecx+edx]
        pxor    mm3, mm6                ; preserved for next iteration

        por     mm1, mm3
        movq    mm6, mm0
        pxor    mm6, mm2
        pand    mm1, mm6
        pavgusb mm0, mm2

        pand    mm1, mm7
        psubb   mm0, mm1

        movq    [ecx], mm0
%endmacro

; COPY_HV_3DN_RND1
; Produces two output rows of the 2x2 half-pel average rounded as
; (i+j+k+l+1)/4, computed as  (s+t+1)/2 - (((ij&kl) | st) & 1)  where
; i,j are horizontal neighbours in the upper row and k,l in the lower row.
;
; In:    eax = last source row already consumed
;        ecx = first destination row, edx = stride
;        mm2 = pavgusb(row[x], row[x+1]) of that source row    (s)
;        mm3 = row[x] ^ row[x+1]         of that source row    (ij)
;        mm7 = 0x01 in every byte
; Out:   eax advanced by two rows, ecx advanced by ONE row (the caller adds
;        the second), mm2/mm3 refreshed for the newest source row
; Uses:  mm0, mm1, mm6
%macro COPY_HV_3DN_RND1 0
        lea     eax, [eax+edx]          ; next source row

        movq    mm0, [eax]              ; k
        movq    mm1, [eax+1]            ; l

        movq    mm6, mm0
        pavgusb mm0, mm1                ; mm0 = t = (k+l+1)/2. preserved for next step
        lea     eax, [eax+edx]
        pxor    mm1, mm6                ; mm1 = kl = k^l.      preserved for next step

        pand    mm3, mm1                ; ij &= kl
        movq    mm6, mm2
        pxor    mm6, mm0                ; mm6 = s^t
        por     mm3, mm6                ; (ij&kl) |= st
        pavgusb mm2, mm0                ; mm2 = (s+t+1)/2
        pand    mm3, mm7                ; mask lsb
        psubb   mm2, mm3                ; apply.

        movq    [ecx], mm2

        ; second row: same computation with the register roles swapped
        ; (mm0/mm1 now hold the upper row, mm2/mm3 receive the lower row)
        movq    mm2, [eax]
        movq    mm3, [eax+1]
        movq    mm6, mm2
        pavgusb mm2, mm3                ; preserved for next iteration
        lea     ecx, [ecx+edx]
        pxor    mm3, mm6                ; preserved for next iteration

        pand    mm1, mm3
        movq    mm6, mm0
        pxor    mm6, mm2
        por     mm1, mm6
        pavgusb mm0, mm2
        pand    mm1, mm7
        psubb   mm0, mm1

        movq    [ecx], mm0
%endmacro

; interpolate8x8_halfpel_hv_3dn(dst, src, stride, rounding)
;
; Fills an 8x8 block at dst with the average of each 2x2 group of source
; bytes (the byte, its right neighbour, and the two bytes below them).
; rounding == 0 yields (i+j+k+l+2)/4; any non-zero rounding yields
; (i+j+k+l+1)/4.
;
; In:    [esp+ 4] = dst, [esp+ 8] = src, [esp+12] = stride, [esp+16] = rounding
; Reads: 9 bytes from each of 9 source rows
; Clobb: eax, ecx, edx, flags, mm0-mm3, mm6, mm7. No emms/femms is
;        executed here.
ALIGN 16
interpolate8x8_halfpel_hv_3dn:
        mov     eax, [esp+16]           ; rounding
        mov     ecx, [esp+ 4]           ; dst
        test    eax, eax                ; ZF = (rounding == 0); consumed by jnz below
        mov     eax, [esp+ 8]           ; src
        mov     edx, [esp+12]           ; stride

        movq    mm7, [mmx_one]          ; per-byte lsb mask

        ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
        ; (mov and the MMX/3DNow! instructions below leave the flags set by
        ; 'test' untouched, so the branch can be deferred until after setup)
        movq    mm2, [eax]
        movq    mm3, [eax+1]
        movq    mm6, mm2
        pavgusb mm2, mm3
        pxor    mm3, mm6                ; mm2/mm3 ready

        jnz     near .rounding1

        ; each macro writes two rows but advances ecx by one row only,
        ; hence the extra 'add ecx, edx' between invocations
        COPY_HV_3DN_RND0
        add     ecx, edx
        COPY_HV_3DN_RND0
        add     ecx, edx
        COPY_HV_3DN_RND0
        add     ecx, edx
        COPY_HV_3DN_RND0
        ret

.rounding1:
        COPY_HV_3DN_RND1
        add     ecx, edx
        COPY_HV_3DN_RND1
        add     ecx, edx
        COPY_HV_3DN_RND1
        add     ecx, edx
        COPY_HV_3DN_RND1
        ret

; No admin address has been configured - ViewVC Help
; Powered by ViewVC 1.0.4