; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: sad_3dne.asm,v 1.10 2008-11-26 01:04:34 Isibaar Exp $
; *
; ***************************************************************************/

; these 3dne functions are compatible with iSSE, but are optimized specifically
; for K7 pipelines

%include "nasm.inc"
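
; nasm.inc is expected to supply the portability layer used throughout this
; file: cglobal/ENDFUNC, the _EAX/_EBX/_ESP and TMP0/TMP1 register aliases,
; the prm1..prm4 argument accessors, and the PTR_SIZE and SECTION_ALIGN
; constants, so the same source assembles for 32-bit and 64-bit targets.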
|
;=============================================================================
; Read only data
;=============================================================================

DATA

ALIGN SECTION_ALIGN
mmx_one:
  times 4 dw 1
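
; four packed 16-bit ones, presumably kept as a pmaddwd helper for folding
; 16-bit partial sums into dwords.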

;; %1 block number (0..3)
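;; Each invocation handles four 16-pixel rows: psadbw sums the absolute
;; byte differences of each 8-byte half against the reference row. Block 0
;; reserves 12 bytes of stack; blocks 1..3 spill the previous block's
;; subtotal there, and sad16_3dne adds the three dwords back at the end.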
%macro SAD_16x16_SSE 1
  movq mm7, [_EAX]
  movq mm6, [_EAX+8]
  psadbw mm7, [TMP1]
  psadbw mm6, [TMP1+8]
%if (%1)
  paddd mm1, mm5
%endif
  movq mm5, [_EAX+TMP0]
  movq mm4, [_EAX+TMP0+8]
  psadbw mm5, [TMP1+TMP0]
  psadbw mm4, [TMP1+TMP0+8]
  movq mm3, [_EAX+2*TMP0]
  movq mm2, [_EAX+2*TMP0+8]
  psadbw mm3, [TMP1+2*TMP0]
  psadbw mm2, [TMP1+2*TMP0+8]
%if (%1)
  movd [_ESP+4*(%1-1)], mm1
%else
  sub _ESP, byte 12
%endif
  movq mm1, [_EAX+_EBX]
  movq mm0, [_EAX+_EBX+8]
  psadbw mm1, [TMP1+_EBX]
  psadbw mm0, [TMP1+_EBX+8]
  lea _EAX, [_EAX+4*TMP0]
  lea TMP1, [TMP1+4*TMP0]
  paddd mm7, mm6
  paddd mm5, mm4
  paddd mm3, mm2
  paddd mm7, mm3
  paddd mm1, mm0
  paddd mm1, mm7 ; rows 0,2,3; row 1 (mm5) is folded in by the next call
%endmacro
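
; SADBI = SAD against the average of two references: pavgb forms the
; rounded byte average of the two reference rows and psadbw accumulates
; the absolute differences to the source. SADBI_16x16_SSE0 handles rows
; 0-1; each following SADBI_16x16_SSE call handles one more row.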
%macro SADBI_16x16_SSE0 0
  movq mm2, [TMP1]
  movq mm3, [TMP1+8]

  movq mm5, [byte _EAX]
  movq mm6, [_EAX+8]
  pavgb mm2, [byte _EBX]
  pavgb mm3, [_EBX+8]

  add TMP1, TMP0
  psadbw mm5, mm2
  psadbw mm6, mm3

  add _EAX, TMP0
  add _EBX, TMP0
  movq mm2, [byte TMP1]

  movq mm3, [TMP1+8]
  movq mm0, [byte _EAX]

  movq mm1, [_EAX+8]
  pavgb mm2, [byte _EBX]

  pavgb mm3, [_EBX+8]
  add TMP1, TMP0
  add _EAX, TMP0

  add _EBX, TMP0
  psadbw mm0, mm2
  psadbw mm1, mm3

%endmacro

%macro SADBI_16x16_SSE 0
  movq mm2, [byte TMP1]
  movq mm3, [TMP1+8]
  paddusw mm5, mm0
  paddusw mm6, mm1
  movq mm0, [_EAX]
  movq mm1, [_EAX+8]
  pavgb mm2, [_EBX]
  pavgb mm3, [_EBX+8]
  add TMP1, TMP0
  add _EAX, TMP0
  add _EBX, TMP0
  psadbw mm0, mm2
  psadbw mm1, mm3
%endmacro
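
; 8-pixel-wide variant: each invocation folds the previous two row SADs
; into mm5/mm6 and starts psadbw for the next two averaged rows.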
%macro SADBI_8x8_3dne 0
  movq mm2, [TMP1]
  movq mm3, [TMP1+TMP0]
  pavgb mm2, [_EAX]
  pavgb mm3, [_EAX+TMP0]
  lea TMP1, [TMP1+2*TMP0]
  lea _EAX, [_EAX+2*TMP0]
  paddusw mm5, mm0
  paddusw mm6, mm1
  movq mm0, [_EBX]
  movq mm1, [_EBX+TMP0]
  lea _EBX, [_EBX+2*TMP0]
  psadbw mm0, mm2
  psadbw mm1, mm3
%endmacro
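
; psadbw against mm4: dev16_3dne zeroes mm4 first, so these "SADs" are
; plain pixel sums from which the block mean is derived; %1 selects the
; prologue (0), steady-state (1) or epilogue (2) step of the unrolled loop.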
%macro ABS_16x16_SSE 1
%if (%1 == 0)
  movq mm7, [_EAX]
  psadbw mm7, mm4
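  ; mov esi, esi is a do-nothing 2-byte instruction, apparently kept as
  ; padding to align the following ops for the K7 decoders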
  mov esi, esi
  movq mm6, [_EAX+8]
  movq mm5, [_EAX+TMP0]
  movq mm3, [_EAX+TMP0+8]
  psadbw mm6, mm4

  movq mm2, [byte _EAX+2*TMP0]
  psadbw mm5, mm4
  movq mm1, [_EAX+2*TMP0+8]
  psadbw mm3, mm4

  movq mm0, [_EAX+TMP1+0]
  psadbw mm2, mm4
  add _EAX, TMP1
  psadbw mm1, mm4
%endif
%if (%1 == 1)
  psadbw mm0, mm4
  paddd mm7, mm0
  movq mm0, [_EAX+8]
  psadbw mm0, mm4
  paddd mm6, mm0

  movq mm0, [byte _EAX+TMP0]
  psadbw mm0, mm4

  paddd mm5, mm0
  movq mm0, [_EAX+TMP0+8]

  psadbw mm0, mm4
  paddd mm3, mm0
  movq mm0, [_EAX+2*TMP0]
  psadbw mm0, mm4
  paddd mm2, mm0

  movq mm0, [_EAX+2*TMP0+8]
  add _EAX, TMP1
  psadbw mm0, mm4
  paddd mm1, mm0
  movq mm0, [_EAX]
%endif
%if (%1 == 2)
  psadbw mm0, mm4
  paddd mm7, mm0
  movq mm0, [_EAX+8]
  psadbw mm0, mm4
  paddd mm6, mm0
%endif
%endmacro

;=============================================================================
; Code
;=============================================================================

SECTION .rotext align=SECTION_ALIGN

cglobal sad16_3dne
cglobal sad8_3dne

; optimization: 21% faster

ALIGN SECTION_ALIGN
sad16_3dne:
  mov _EAX, prm1 ; Src1
  mov TMP1, prm2 ; Src2
  mov TMP0, prm3 ; Stride

  push _EBX
  lea _EBX, [2*TMP0+TMP0]

  SAD_16x16_SSE 0
  SAD_16x16_SSE 1
  SAD_16x16_SSE 2
  SAD_16x16_SSE 3
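
  ; row 1 of the last block (mm5) is still pending; fold it in, then add
  ; the three subtotals spilled by blocks 1..3 before releasing the
  ; 12-byte scratch area and restoring _EBX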
  paddd mm1, mm5
  movd eax, mm1
  add eax, dword [_ESP]
  add eax, dword [_ESP+4]
  add eax, dword [_ESP+8]
  mov _EBX, [_ESP+12]
  add _ESP, byte PTR_SIZE+12

  ret
ENDFUNC
;
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
sad8_3dne:

  mov _EAX, prm1 ; Src1
  mov TMP0, prm3 ; Stride
  mov TMP1, prm2 ; Src2
  push _EBX
  lea _EBX, [TMP0+2*TMP0]

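  ; fully unrolled 8x8 SAD; row addresses are formed from stride multiples:
  ; _EBX = 3*stride (later 7*stride) and TMP0 is rescaled to 5*stride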
  movq mm0, [byte _EAX] ;0
  psadbw mm0, [byte TMP1]
  movq mm1, [_EAX+TMP0] ;1
  psadbw mm1, [TMP1+TMP0]

  movq mm2, [_EAX+2*TMP0] ;2
  psadbw mm2, [TMP1+2*TMP0]
  movq mm3, [_EAX+_EBX] ;3
  psadbw mm3, [TMP1+_EBX]

  paddd mm0, mm1

  movq mm4, [byte _EAX+4*TMP0] ;4
  psadbw mm4, [TMP1+4*TMP0]
  movq mm5, [_EAX+2*_EBX] ;6
  psadbw mm5, [TMP1+2*_EBX]

  paddd mm2, mm3
  paddd mm0, mm2

  lea _EBX, [_EBX+4*TMP0] ;3+4=7
  lea TMP0, [TMP0+4*TMP0] ; 5
  movq mm6, [_EAX+TMP0] ;5
  psadbw mm6, [TMP1+TMP0]
  movq mm7, [_EAX+_EBX] ;7
  psadbw mm7, [TMP1+_EBX]
  paddd mm4, mm5
  paddd mm6, mm7
  paddd mm0, mm4
  mov _EBX, [_ESP]
  add _ESP, byte PTR_SIZE
  paddd mm0, mm6
  movd eax, mm0

  ret
ENDFUNC

;-----------------------------------------------------------------------------
; optimization: 14% faster

ALIGN SECTION_ALIGN
sad16bi_3dne:
  mov _EAX, prm1 ; Src
  mov TMP1, prm2 ; Ref1
  mov TMP0, prm4 ; Stride

  push _EBX
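  ; Ref2 is the third argument: on x86-64 it is already in a register
  ; (prm3), while on ia32 it must be loaded from the stack, past the
  ; 4 bytes of the just-pushed _EBX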
%ifdef ARCH_IS_X86_64
  mov _EBX, prm3
%else
  mov _EBX, [_ESP+4+12] ; Ref2
%endif

  SADBI_16x16_SSE0
  SADBI_16x16_SSE ; rows 2..15, one row per invocation
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  SADBI_16x16_SSE
  paddusw mm5, mm0
  paddusw mm6, mm1

  pop _EBX
  paddusw mm6, mm5
  movd eax, mm6

  ret
ENDFUNC

;
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
sad8bi_3dne:
  mov _EAX, prm3 ; Ref2
  mov TMP1, prm2 ; Ref1
  mov TMP0, prm4 ; Stride

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm1
%else
  mov _EBX, [_ESP+4+ 4] ; Src
%endif

  movq mm2, [TMP1]
  movq mm3, [TMP1+TMP0]
  pavgb mm2, [_EAX]
  pavgb mm3, [_EAX+TMP0]
  lea TMP1, [TMP1+2*TMP0]
  lea _EAX, [_EAX+2*TMP0]
  movq mm5, [_EBX]
  movq mm6, [_EBX+TMP0]
  lea _EBX, [_EBX+2*TMP0]
  psadbw mm5, mm2
  psadbw mm6, mm3

  movq mm2, [TMP1]
  movq mm3, [TMP1+TMP0]
  pavgb mm2, [_EAX]
  pavgb mm3, [_EAX+TMP0]
  lea TMP1, [TMP1+2*TMP0]
  lea _EAX, [_EAX+2*TMP0]
  movq mm0, [_EBX]
  movq mm1, [_EBX+TMP0]
  lea _EBX, [_EBX+2*TMP0]
  psadbw mm0, mm2
  psadbw mm1, mm3

  movq mm2, [TMP1]
  movq mm3, [TMP1+TMP0]
  pavgb mm2, [_EAX]
  pavgb mm3, [_EAX+TMP0]
  lea TMP1, [TMP1+2*TMP0]
  lea _EAX, [_EAX+2*TMP0]
  paddusw mm5, mm0
  paddusw mm6, mm1
  movq mm0, [_EBX]
  movq mm1, [_EBX+TMP0]
  lea _EBX, [_EBX+2*TMP0]
  psadbw mm0, mm2
  psadbw mm1, mm3

  movq mm2, [TMP1]
  movq mm3, [TMP1+TMP0]
  pavgb mm2, [_EAX]
  pavgb mm3, [_EAX+TMP0]
  paddusw mm5, mm0
  paddusw mm6, mm1
  movq mm0, [_EBX]
  movq mm1, [_EBX+TMP0]
  psadbw mm0, mm2
  psadbw mm1, mm3
  paddusw mm5, mm0
  paddusw mm6, mm1

  paddusw mm6, mm5
  mov _EBX, [_ESP]
  add _ESP, byte PTR_SIZE
  movd eax, mm6

  ret
ENDFUNC

;===========================================================================
; optimization: 25% faster

ALIGN SECTION_ALIGN
dev16_3dne:

  mov _EAX, prm1 ; Src
  mov TMP0, prm2 ; Stride
  lea TMP1, [TMP0+2*TMP0]
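
  ; first pass: mm4 = 0, so every psadbw in ABS_16x16_SSE produces a plain
  ; 8-pixel sum; the accumulated total is turned into the mean below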
  pxor mm4, mm4

ALIGN SECTION_ALIGN
  ABS_16x16_SSE 0
  ABS_16x16_SSE 1
  ABS_16x16_SSE 1

  paddd mm7, mm6
  paddd mm1, mm3
  mov _EAX, prm1 ; Src
  paddd mm7, mm1
  punpcklbw mm7, mm7 ;xxyyaazz
  pshufw mm4, mm7, 055h ; mm4 contains the mean
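  ; the mean of the 256 pixels is sum >> 8, i.e. byte 1 of the dword sum in
  ; mm7; punpcklbw doubles each byte and pshufw 055h replicates word 1, so
  ; all eight bytes of mm4 hold the mean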