Parent Directory | Revision Log
Revision 1764 - (view) (download)
1 : | edgomez | 1382 | ;/**************************************************************************** |
2 : | Isibaar | 262 | ; * |
3 : | edgomez | 1382 | ; * XVID MPEG-4 VIDEO CODEC |
4 : | ; * - SSE2 optimized SAD operators - | ||
5 : | Isibaar | 262 | ; * |
6 : | edgomez | 1382 | ; * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net> |
7 : | Isibaar | 262 | ; * |
8 : | ; * | ||
9 : | edgomez | 1382 | ; * This program is free software; you can redistribute it and/or modify it |
10 : | ; * under the terms of the GNU General Public License as published by | ||
11 : | ; * the Free Software Foundation; either version 2 of the License, or | ||
12 : | ; * (at your option) any later version. | ||
13 : | Isibaar | 262 | ; * |
14 : | edgomez | 1382 | ; * This program is distributed in the hope that it will be useful, |
15 : | ; * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 : | ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
17 : | ; * GNU General Public License for more details. | ||
18 : | Isibaar | 262 | ; * |
19 : | edgomez | 1382 | ; * You should have received a copy of the GNU General Public License |
20 : | ; * along with this program; if not, write to the Free Software | ||
21 : | ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
22 : | Isibaar | 262 | ; * |
23 : | Isibaar | 1764 | ; * $Id: sad_sse2.asm,v 1.13 2006-12-06 19:55:07 Isibaar Exp $ |
24 : | edgomez | 652 | ; * |
25 : | edgomez | 1382 | ; ***************************************************************************/ |
26 : | Isibaar | 262 | |
27 : | edgomez | 1382 | BITS 32 |
28 : | Isibaar | 262 | |
;-----------------------------------------------------------------------------
; cglobal <name>
;
; Exports <name> as a global symbol.
;
;   PREFIX     : the exported symbol is _<name>; <name> is then %define'd to
;                _<name> so that the later "<name>:" label and any reference
;                to <name> resolve to the underscored symbol.
;   MARK_FUNCS : the symbol is declared with the ":function <size>" extension,
;                where <size> is the distance from the function label to its
;                local ".endfunc" label.
;-----------------------------------------------------------------------------
%macro cglobal 1
  %ifdef PREFIX
    %ifdef MARK_FUNCS
      ; The label that actually gets defined is _<name>, so the local
      ; .endfunc label belongs to _<name>: size = _<name>.endfunc - _<name>.
      global _%1:function _%1.endfunc-_%1
    %else
      global _%1
    %endif
    ; Alias to the bare prefixed name only. Putting the ":function ..."
    ; suffix into the alias would paste it into every "<name>:" label
    ; definition and break assembly.
    %define %1 _%1
  %else
    %ifdef MARK_FUNCS
      global %1:function %1.endfunc-%1
    %else
      global %1
    %endif
  %endif
%endmacro
46 : | |||
47 : | edgomez | 1382 | ;============================================================================= |
48 : | ; Read only data | ||
49 : | ;============================================================================= | ||
50 : | Isibaar | 262 | |
; The align=16 section qualifier is only emitted when FORMAT_COFF is not
; defined.
%ifndef FORMAT_COFF
SECTION .rodata align=16
%else
SECTION .rodata
%endif

ALIGN 64
; Four zero dwords (16 bytes). Not referenced by the code visible in this
; file.
zero:
  dd 0, 0, 0, 0
59 : | |||
60 : | edgomez | 1382 | ;============================================================================= |
61 : | ; Code | ||
62 : | ;============================================================================= | ||
63 : | Isibaar | 262 | |
SECTION .text

; Exported entry points. Each variant pair differs only in the instruction
; used for the loads that are not assumed to be aligned (movdqu vs. lddqu).

; movdqu variants
cglobal sad16_sse2
cglobal dev16_sse2

; lddqu variants
cglobal sad16_sse3
cglobal dev16_sse3
71 : | |||
72 : | edgomez | 1382 | ;----------------------------------------------------------------------------- |
73 : | ; uint32_t sad16_sse2 (const uint8_t * const cur, <- assumed aligned! | ||
74 : | ; const uint8_t * const ref, | ||
75 : | ; const uint32_t stride, | ||
76 : | ; const uint32_t /*ignored*/); | ||
77 : | ;----------------------------------------------------------------------------- | ||
78 : | Isibaar | 262 | |
79 : | |||
;-----------------------------------------------------------------------------
; SAD_16x16_SSE2 <load>
;
; Processes two 16-byte rows and advances both pointers by two rows.
;
;   <load> : instruction used to fetch the rows addressed by edx
;   in     : eax = first pointer  (read with movdqa, so it must be 16-byte
;                                  aligned)
;            edx = second pointer (read with <load>)
;            ecx = stride in bytes
;   in/out : xmm6 = running accumulator (saturating 16-bit adds)
;   scratch: xmm0, xmm1, xmm2, xmm3
;-----------------------------------------------------------------------------
%macro SAD_16x16_SSE2 1
  ; fetch the row pair from each source
  movdqa  xmm2, [eax]
  movdqa  xmm3, [eax+ecx]
  %1      xmm0, [edx]
  %1      xmm1, [edx+ecx]

  ; step both pointers down by two rows
  lea     eax, [eax+ecx*2]
  lea     edx, [edx+ecx*2]

  ; per-row sums of absolute byte differences, folded into the accumulator
  psadbw  xmm0, xmm2
  psadbw  xmm1, xmm3
  paddusw xmm6, xmm0
  paddusw xmm6, xmm1
%endmacro
92 : | |||
;-----------------------------------------------------------------------------
; SAD16_SSE2_SSE3 <load>
;
; Complete function body (ends with ret) computing the sum of absolute
; differences over 16 rows of 16 bytes.
;
;   <load>  : load instruction forwarded to SAD_16x16_SSE2 for the ref rows
;   stack   : [esp+4]  = cur (assumed 16-byte aligned)
;             [esp+8]  = ref
;             [esp+12] = stride
;   returns : eax = word 0 of the folded accumulator
;   clobbers: ecx, edx, xmm0-xmm3, xmm5, xmm6
;-----------------------------------------------------------------------------
%macro SAD16_SSE2_SSE3 1
  mov     eax, [esp+ 4]         ; cur (assumed aligned)
  mov     edx, [esp+ 8]         ; ref
  mov     ecx, [esp+12]         ; stride

  pxor    xmm6, xmm6            ; clear the accumulator

  ; 8 row pairs = 16 rows
  %rep 8
  SAD_16x16_SSE2 %1
  %endrep

  ; bring dword 2 of the accumulator down to dword 0 and add it in, so that
  ; word 0 holds the combined total
  pshufd  xmm5, xmm6, 2
  paddusw xmm6, xmm5
  pextrw  eax, xmm6, 0
  ret
%endmacro
114 : | |||
; sad16_sse2: SAD16_SSE2_SSE3 body instantiated with movdqu for the ref loads.
ALIGN 16
sad16_sse2:
  SAD16_SSE2_SSE3 movdqu
.endfunc:                       ; end marker used by cglobal's MARK_FUNCS size expression
119 : | Isibaar | 262 | |
120 : | |||
; sad16_sse3: SAD16_SSE2_SSE3 body instantiated with lddqu for the ref loads.
ALIGN 16
sad16_sse3:
  SAD16_SSE2_SSE3 lddqu
.endfunc:                       ; end marker used by cglobal's MARK_FUNCS size expression
125 : | |||
126 : | |||
127 : | edgomez | 1382 | ;----------------------------------------------------------------------------- |
128 : | ; uint32_t dev16_sse2(const uint8_t * const cur, const uint32_t stride); | ||
129 : | ;----------------------------------------------------------------------------- | ||
130 : | Isibaar | 262 | |
;-----------------------------------------------------------------------------
; MEAN_16x16_SSE2 <load>
;
; Processes two 16-byte rows against the reference vector in xmm7 and
; advances the source pointer by two rows.
;
;   <load> : instruction used to fetch the rows
;   in     : eax  = source pointer
;            ecx  = stride in bytes
;            xmm7 = reference vector (all zero, or the replicated mean)
;   in/out : xmm6 = running accumulator (saturating 16-bit adds)
;   scratch: xmm0, xmm1
;-----------------------------------------------------------------------------
%macro MEAN_16x16_SSE2 1
  %1      xmm0, [eax]
  %1      xmm1, [eax+ecx]

  ; sum of absolute differences of each row against xmm7
  psadbw  xmm0, xmm7
  psadbw  xmm1, xmm7

  lea     eax, [eax+ecx*2]      ; + 2*stride

  paddusw xmm6, xmm0
  paddusw xmm6, xmm1
%endmacro
140 : | |||
141 : | |||
;-----------------------------------------------------------------------------
; MEAN16_SSE2_SSE3 <load>
;
; Complete function body (ends with ret) that walks 16 rows of 16 bytes
; twice:
;   pass 1 - sums every byte (SAD against zero) and derives mean = sum >> 8
;   pass 2 - sums |byte - mean| over the same rows
;
;   <load>  : load instruction forwarded to MEAN_16x16_SSE2
;   stack   : [esp+4] = src
;             [esp+8] = stride
;   returns : eax = word 0 of the folded pass-2 accumulator
;   clobbers: ecx, xmm0, xmm1, xmm6, xmm7
;-----------------------------------------------------------------------------
%macro MEAN16_SSE2_SSE3 1
  mov     eax, [esp+ 4]         ; src
  mov     ecx, [esp+ 8]         ; stride

  pxor    xmm7, xmm7            ; reference = zero, so pass 1 sums raw bytes
  pxor    xmm6, xmm6            ; accumulator

  ; ---- pass 1: 8 row pairs = 16 rows ----
  %rep 8
  MEAN_16x16_SSE2 %1
  %endrep

  ; fold dword 2 of the accumulator onto dword 0, then turn the total into
  ; the mean and broadcast it to all 16 bytes of xmm7
  pshufd   xmm7, xmm6, 2
  paddusw  xmm7, xmm6
  psrlw    xmm7, 8              ; total / 256 => mean
  pshuflw  xmm7, xmm7, 0        ; copy word 0 across the low four words
  packuswb xmm7, xmm7           ; words -> bytes
  pshufd   xmm7, xmm7, 0        ; dword 0 -> every dword

  pxor    xmm6, xmm6            ; restart the accumulator
  mov     eax, [esp+ 4]         ; rewind to the first row of src

  ; ---- pass 2: same 16 rows, now against the mean ----
  %rep 8
  MEAN_16x16_SSE2 %1
  %endrep

  ; fold the accumulator as before and return word 0
  pshufd  xmm7, xmm6, 2
  paddusw xmm7, xmm6
  pextrw  eax, xmm7, 0
  ret
%endmacro
184 : | |||
; dev16_sse2: MEAN16_SSE2_SSE3 body instantiated with movdqu row loads.
ALIGN 16
dev16_sse2:
  MEAN16_SSE2_SSE3 movdqu
.endfunc:                       ; end marker used by cglobal's MARK_FUNCS size expression
189 : | |||
; dev16_sse3: MEAN16_SSE2_SSE3 body instantiated with lddqu row loads.
ALIGN 16
dev16_sse3:
  MEAN16_SSE2_SSE3 lddqu
.endfunc:                       ; end marker used by cglobal's MARK_FUNCS size expression
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |