; these 3dne functions are compatible with iSSE, but are optimized specifically
; for K7 pipelines

%include "nasm.inc"

; NOTE(review): the former "BITS 32" directive and the file-local cglobal
; macro were dropped in this port; nasm.inc presumably supplies cglobal /
; ENDFUNC and the portable aliases (_EAX, _ESI, _EBP, TMP0, TMP1,
; prm1..prm4, PTR_TYPE, SECTION_ALIGN) used throughout this file — confirm
; against nasm.inc.
;=============================================================================
; Read only data
;=============================================================================

DATA

; 8 bytes of 0x01 — LSB mask used by the rounding==1 paths
; ((i+j)/2 = (i+j+1)/2 - (i^j)&1)
ALIGN SECTION_ALIGN
mmx_one:
	times 8 db 1

; two dwords of -1 (all bits set), loaded via a register pointer below
ALIGN SECTION_ALIGN
mm_minusone:
	dd -1,-1

; NOTE(review): original lines 43-51 (start of the "Macros" banner and any
; intervening comments) are not present in this extract.
; Macros
;=============================================================================

SECTION .rotext align=SECTION_ALIGN
cglobal interpolate8x8_halfpel_h_3dne
cglobal interpolate8x8_halfpel_v_3dne
cglobal interpolate8x8_halfpel_hv_3dne
cglobal interpolate8x4_halfpel_h_3dne
cglobal interpolate8x4_halfpel_v_3dne
cglobal interpolate8x4_halfpel_hv_3dne
;-----------------------------------------------------------------------------
;
; void interpolate8x8_halfpel_h_3dne(uint8_t * const dst,
; NOTE(review): original lines 68-72 (remainder of the prototype comment)
; are not present in this extract.

; Copy two rows, averaging each pixel with its right neighbour (round-up).
; %1 != 0: source known aligned to this iteration, plain [_EAX] load.
; Clobbers mm0/mm1; advances _EAX by 2 strides; writes [TMP0], [TMP0+TMP1].
%macro COPY_H_SSE_RND0 1
%if (%1)
  movq mm0, [_EAX]
%else
  movq mm0, [_EAX+0]
  ; ---
  ; nasm >0.99.x rejects the original statement:
  ;  movq mm0, [dword _EAX]
  ; as it is ambiguous. for this statement nasm <0.99.x would
  ; generate "movq mm0,[_EAX+0]"
  ; ---
%endif
  pavgb mm0, [_EAX+1]
  movq mm1, [_EAX+TMP1]
  pavgb mm1, [_EAX+TMP1+1]
  lea _EAX, [_EAX+2*TMP1]
  movq [TMP0], mm0
  movq [TMP0+TMP1], mm1
%endmacro
; Copy two rows, averaging with the right neighbour using round-down:
; (i+j)/2 = (i+j+1)/2 - (i^j)&1, with mm7 preloaded from mmx_one.
; Clobbers mm0-mm5; advances _EAX by 2 strides; writes [TMP0], [TMP0+TMP1].
%macro COPY_H_SSE_RND1 0
  movq mm0, [_EAX]
  movq mm1, [_EAX+TMP1]
  movq mm4, mm0
  movq mm5, mm1
  movq mm2, [_EAX+1]
  movq mm3, [_EAX+TMP1+1]
  pavgb mm0, mm2
  pxor mm2, mm4
  pavgb mm1, mm3
  lea _EAX, [_EAX+2*TMP1]
  pxor mm3, mm5
  pand mm2, mm7
  pand mm3, mm7
  psubb mm0, mm2
  movq [TMP0], mm0
  psubb mm1, mm3
  movq [TMP0+TMP1], mm1
%endmacro
ALIGN SECTION_ALIGN
interpolate8x8_halfpel_h_3dne:

  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; stride
  dec PTR_TYPE prm4; rounding

  jz near .rounding1
  mov TMP0, prm1 ; Dst

  COPY_H_SSE_RND0 0
  lea TMP0,[TMP0+2*TMP1]
  COPY_H_SSE_RND0 1
  lea TMP0,[TMP0+2*TMP1]
  COPY_H_SSE_RND0 1
  lea TMP0,[TMP0+2*TMP1]
  COPY_H_SSE_RND0 1
  ret

.rounding1:
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
  mov TMP0, prm1 ; Dst
  movq mm7, [mmx_one]
  COPY_H_SSE_RND1
  lea TMP0, [TMP0+2*TMP1]
  COPY_H_SSE_RND1
  lea TMP0,[TMP0+2*TMP1]
  COPY_H_SSE_RND1
  lea TMP0,[TMP0+2*TMP1]
  COPY_H_SSE_RND1
  ret
ENDFUNC
;-----------------------------------------------------------------------------
;
; NOTE(review): original lines 149-152 (the prototype comment for
; interpolate8x8_halfpel_v_3dne) are not present in this extract.
;
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
interpolate8x8_halfpel_v_3dne:

  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; stride
  dec PTR_TYPE prm4; rounding

  ; we process 2 line at a time

  jz near .rounding1
  pxor mm2,mm2
  movq mm0, [_EAX]
  movq mm1, [_EAX+TMP1]
  por mm2, [_EAX+2*TMP1]
  mov TMP0, prm1 ; Dst
  lea _EAX, [_EAX+2*TMP1]
  pxor mm4, mm4
  pavgb mm0, mm1
  pavgb mm1, mm2
  movq [byte TMP0], mm0
  movq [TMP0+TMP1], mm1
  pxor mm6, mm6
  add _EAX, TMP1
  lea TMP0, [TMP0+2*TMP1]
  movq mm3, [byte _EAX]
  por mm4, [_EAX+TMP1]
  lea _EAX, [_EAX+2*TMP1]
  pavgb mm2, mm3
  pavgb mm3, mm4
  movq [TMP0], mm2
  movq [TMP0+TMP1], mm3
  lea TMP0, [byte TMP0+2*TMP1]
  movq mm5, [byte _EAX]
  por mm6, [_EAX+TMP1]
  lea _EAX, [_EAX+2*TMP1]
  pavgb mm4, mm5
  pavgb mm5, mm6
  movq [TMP0], mm4
  movq [TMP0+TMP1], mm5
  lea TMP0, [TMP0+2*TMP1]
  movq mm7, [_EAX]
  movq mm0, [_EAX+TMP1]
  pavgb mm6, mm7
  pavgb mm7, mm0
  movq [TMP0], mm6
  movq [TMP0+TMP1], mm7
  ret

ALIGN SECTION_ALIGN
.rounding1:
  ; round-down path: avg via complements, i.e. ~((~i + ~j + 1)/2) = (i+j)/2
  pcmpeqb mm0, mm0
  psubusb mm0, [_EAX]
  add _EAX, TMP1
  mov TMP0, prm1 ; Dst
  push _ESI
  pcmpeqb mm1, mm1
  pcmpeqb mm2, mm2
  mov _ESI, mm_minusone
  psubusb mm1, [byte _EAX]
  psubusb mm2, [_EAX+TMP1]
  lea _EAX, [_EAX+2*TMP1]
  movq mm6, [_ESI]
  movq mm7, [_ESI]
  pavgb mm0, mm1
  pavgb mm1, mm2
  psubusb mm6, mm0
  psubusb mm7, mm1
  movq [TMP0], mm6
  movq [TMP0+TMP1], mm7
  lea TMP0, [TMP0+2*TMP1]
  pcmpeqb mm3, mm3
  pcmpeqb mm4, mm4
  psubusb mm3, [_EAX]
  psubusb mm4, [_EAX+TMP1]
  lea _EAX, [_EAX+2*TMP1]
  pavgb mm2, mm3
  pavgb mm3, mm4
  movq mm0, [_ESI]
  movq mm1, [_ESI]
  psubusb mm0, mm2
  psubusb mm1, mm3
  movq [TMP0], mm0
  movq [TMP0+TMP1], mm1
  lea TMP0,[TMP0+2*TMP1]

  pcmpeqb mm5, mm5
  pcmpeqb mm6, mm6
  psubusb mm5, [_EAX]
  psubusb mm6, [_EAX+TMP1]
  lea _EAX, [_EAX+2*TMP1]
  pavgb mm4, mm5
  pavgb mm5, mm6
  movq mm2, [_ESI]
  movq mm3, [_ESI]
  psubusb mm2, mm4
  psubusb mm3, mm5
  movq [TMP0], mm2
  movq [TMP0+TMP1], mm3
  lea TMP0, [TMP0+2*TMP1]
  pcmpeqb mm7, mm7
  pcmpeqb mm0, mm0
  psubusb mm7, [_EAX]
  psubusb mm0, [_EAX+TMP1]
  pavgb mm6, mm7
  pavgb mm7, mm0
  movq mm4, [_ESI]
  movq mm5, [_ESI]
  psubusb mm4, mm6
  pop _ESI
  psubusb mm5, mm7
  movq [TMP0], mm4
  movq [TMP0+TMP1], mm5
  ret
ENDFUNC
;-----------------------------------------------------------------------------
;
; NOTE(review): original lines 273-290 (prototype comment for
; interpolate8x8_halfpel_hv_3dne) are not present in this extract.

; One 2-row step of the h+v half-pel filter, round-up variant.
; Loop invariants on entry: mm2=(i+j+1)/2, mm3=i^j; mm7 holds mmx_one.
%macro COPY_HV_SSE_RND0 0

  movq mm0, [_EAX+TMP1]
  movq mm1, [_EAX+TMP1+1]

  movq mm6, mm0
  pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
  lea _EAX, [_EAX+2*TMP1]
  pxor mm1, mm6 ; mm1=(j^k). preserved for next step

  por mm3, mm1 ; ij |= jk
  ; NOTE(review): original line 303 is not present in this extract.
  pxor mm6, mm0 ; mm6 = s^t
  pand mm3, mm6 ; (ij|jk) &= st
  pavgb mm2, mm0 ; mm2 = (s+t+1)/2
  movq mm6, [_EAX]
  pand mm3, mm7 ; mask lsb
  psubb mm2, mm3 ; apply.

  movq [TMP0], mm2

  movq mm2, [_EAX]
  movq mm3, [_EAX+1]
  pavgb mm2, mm3 ; preserved for next iteration
  pxor mm3, mm6 ; preserved for next iteration

  ; NOTE(review): original lines 318-323 are not present in this extract.
  pand mm1, mm7
  psubb mm0, mm1

  movq [TMP0+TMP1], mm0
%endmacro
; One 2-row step of the h+v half-pel filter, round-down variant.
; Loop invariants on entry: mm2=(i+j+1)/2, mm3=i^j; mm7 holds mmx_one.
%macro COPY_HV_SSE_RND1 0
  movq mm0, [_EAX+TMP1]
  movq mm1, [_EAX+TMP1+1]

  movq mm6, mm0
  pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
  lea _EAX,[_EAX+2*TMP1]
  pxor mm1, mm6 ; mm1=(j^k). preserved for next step

  pand mm3, mm1
  ; NOTE(review): original line 340 is not present in this extract.
  pxor mm6, mm0
  por mm3, mm6
  pavgb mm2, mm0
  movq mm6, [_EAX]
  pand mm3, mm7
  psubb mm2, mm3

  movq [TMP0], mm2

  movq mm2, [_EAX]
  movq mm3, [_EAX+1]
  pavgb mm2, mm3 ; preserved for next iteration
  pxor mm3, mm6 ; preserved for next iteration

  ; NOTE(review): original lines 355-358 are not present in this extract.
  pavgb mm0, mm2
  pand mm1, mm7
  psubb mm0, mm1
  movq [TMP0+TMP1], mm0
%endmacro
ALIGN SECTION_ALIGN
interpolate8x8_halfpel_hv_3dne:
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; stride
  dec PTR_TYPE prm4 ; rounding

  ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
  movq mm2, [_EAX]
  movq mm3, [_EAX+1]
  movq mm6, mm2
  pavgb mm2, mm3
  pxor mm3, mm6 ; mm2/mm3 ready
  mov TMP0, prm1 ; Dst
  movq mm7, [mmx_one]

  jz near .rounding1
  lea _EBP,[byte _EBP]
  COPY_HV_SSE_RND0
  lea TMP0,[TMP0+2*TMP1]
  COPY_HV_SSE_RND0
  lea TMP0,[TMP0+2*TMP1]
  COPY_HV_SSE_RND0
  lea TMP0,[TMP0+2*TMP1]
  COPY_HV_SSE_RND0
  ret

ALIGN SECTION_ALIGN
.rounding1:
  COPY_HV_SSE_RND1
  lea TMP0,[TMP0+2*TMP1]
  COPY_HV_SSE_RND1
  lea TMP0,[TMP0+2*TMP1]
  COPY_HV_SSE_RND1
  lea TMP0,[TMP0+2*TMP1]
  COPY_HV_SSE_RND1
  ret
ENDFUNC
;-----------------------------------------------------------------------------
;
; void interpolate8x4_halfpel_h_3dne(uint8_t * const dst,
;                                    const uint8_t * const src,
;                                    const uint32_t stride,
;                                    const uint32_t rounding);
;
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
interpolate8x4_halfpel_h_3dne:

  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; stride
  dec PTR_TYPE prm4; rounding

  jz .rounding1
  mov TMP0, prm1 ; Dst

  COPY_H_SSE_RND0 0
  lea TMP0,[TMP0+2*TMP1]
  COPY_H_SSE_RND0 1
  ret

.rounding1:
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
  mov TMP0, prm1 ; Dst
  movq mm7, [mmx_one]
  COPY_H_SSE_RND1
  lea TMP0, [TMP0+2*TMP1]
  COPY_H_SSE_RND1
  ret
ENDFUNC
;-----------------------------------------------------------------------------
;
; void interpolate8x4_halfpel_v_3dne(uint8_t * const dst,
;                                    const uint8_t * const src,
;                                    const uint32_t stride,
;                                    const uint32_t rounding);
;
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
interpolate8x4_halfpel_v_3dne:

  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; stride
  dec PTR_TYPE prm4; rounding

  ; we process 2 line at a time

  jz .rounding1
  pxor mm2,mm2
  movq mm0, [_EAX]
  movq mm1, [_EAX+TMP1]
  por mm2, [_EAX+2*TMP1] ; Something like preload (pipelining)
  mov TMP0, prm1 ; Dst
  lea _EAX, [_EAX+2*TMP1]
  pxor mm4, mm4
  pavgb mm0, mm1
  pavgb mm1, mm2
  movq [byte TMP0], mm0
  movq [TMP0+TMP1], mm1

  pxor mm6, mm6
  add _EAX, TMP1
  lea TMP0, [TMP0+2*TMP1]
  movq mm3, [byte _EAX]
  por mm4, [_EAX+TMP1]
  lea _EAX, [_EAX+2*TMP1]
  pavgb mm2, mm3
  pavgb mm3, mm4
  movq [TMP0], mm2
  movq [TMP0+TMP1], mm3

  ret

ALIGN SECTION_ALIGN
.rounding1:
  pcmpeqb mm0, mm0
  psubusb mm0, [_EAX] ; _EAX==line0
  add _EAX, TMP1 ; _EAX==line1
  mov TMP0, prm1 ; Dst

  push _ESI

  pcmpeqb mm1, mm1
  pcmpeqb mm2, mm2
  mov _ESI, mm_minusone
  psubusb mm1, [byte _EAX] ; line1
  psubusb mm2, [_EAX+TMP1] ; line2
  lea _EAX, [_EAX+2*TMP1] ; _EAX==line3
  movq mm6, [_ESI]
  movq mm7, [_ESI]
  pavgb mm0, mm1
  pavgb mm1, mm2
  psubusb mm6, mm0
  psubusb mm7, mm1
  movq [TMP0], mm6 ; store line0
  movq [TMP0+TMP1], mm7 ; store line1

  lea TMP0, [TMP0+2*TMP1]
  pcmpeqb mm3, mm3
  pcmpeqb mm4, mm4
  psubusb mm3, [_EAX] ; line3
  psubusb mm4, [_EAX+TMP1] ; line4
  lea _EAX, [_EAX+2*TMP1] ; _EAX==line 5
  pavgb mm2, mm3
  pavgb mm3, mm4
  movq mm0, [_ESI]
  movq mm1, [_ESI]
  psubusb mm0, mm2
  psubusb mm1, mm3
  movq [TMP0], mm0
  movq [TMP0+TMP1], mm1

  pop _ESI

  ret

ENDFUNC
;-----------------------------------------------------------------------------
;
; void interpolate8x4_halfpel_hv_3dne(uint8_t * const dst,
;                                     const uint8_t * const src,
;                                     const uint32_t stride,
;                                     const uint32_t rounding);
;
;
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
interpolate8x4_halfpel_hv_3dne:
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; stride
  dec PTR_TYPE prm4 ; rounding

  ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
  movq mm2, [_EAX]
  movq mm3, [_EAX+1]
  movq mm6, mm2
  pavgb mm2, mm3
  pxor mm3, mm6 ; mm2/mm3 ready
  mov TMP0, prm1 ; Dst
  movq mm7, [mmx_one]

  jz near .rounding1
  lea _EBP,[byte _EBP]
  COPY_HV_SSE_RND0
  lea TMP0,[TMP0+2*TMP1]
  COPY_HV_SSE_RND0
  ret

ALIGN SECTION_ALIGN
.rounding1:
  COPY_HV_SSE_RND1
  lea TMP0,[TMP0+2*TMP1]
  COPY_HV_SSE_RND1
  ret

ENDFUNC
; mark the stack non-executable on ELF targets
%ifidn __OUTPUT_FORMAT__,elf
section ".note.GNU-stack" noalloc noexec nowrite progbits
%endif