12 |
; ARG1 argument passed to FUNC |
; ARG1 argument passed to FUNC |
13 |
; |
; |
14 |
; throughout the FUNC the registers mean: |
; throughout the FUNC the registers mean: |
|
; eax y_stride |
|
|
; ebx u_ptr |
|
|
; ecx v_ptr |
|
|
; edx x_stride |
|
|
; esi y_ptr |
|
|
; edi x_ptr |
|
|
; ebp width |
|
|
; |
|
15 |
;------------------------------------------------------------------------------ |
;------------------------------------------------------------------------------ |
16 |
|
|
17 |
|
%define y_stride _EAX |
18 |
|
%define u_ptr _EBX |
19 |
|
%define v_ptr _ECX |
20 |
|
%define x_stride _EDX |
21 |
|
%define x_stride_d edx |
22 |
|
%define y_ptr _ESI |
23 |
|
%define x_ptr _EDI |
24 |
|
%define width _EBP |
25 |
|
|
26 |
%macro MAKE_COLORSPACE 8 |
%macro MAKE_COLORSPACE 8 |
27 |
%define NAME %1 |
%define NAME %1 |
28 |
%define STACK %2 |
%define STACK %2 |
33 |
%define ARG1 %7 |
%define ARG1 %7 |
34 |
%define ARG2 %8 |
%define ARG2 %8 |
35 |
; --- define function global/symbol |
; --- define function global/symbol |
36 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
37 |
cglobal NAME |
cglobal NAME |
38 |
NAME: |
NAME: |
39 |
; --- init stack --- |
; --- init stack --- |
40 |
|
|
41 |
%define pushsize 16 |
push _EBX ; esp + localsize + 16 |
|
%define localsize 20 + STACK |
|
42 |
|
|
43 |
%define vflip esp + localsize + pushsize + 40 |
%ifdef ARCH_IS_X86_64 |
|
%define height esp + localsize + pushsize + 36 |
|
|
%define width esp + localsize + pushsize + 32 |
|
|
%define uv_stride esp + localsize + pushsize + 28 |
|
|
%define y_stride esp + localsize + pushsize + 24 |
|
|
%define v_ptr esp + localsize + pushsize + 20 |
|
|
%define u_ptr esp + localsize + pushsize + 16 |
|
|
%define y_ptr esp + localsize + pushsize + 12 |
|
|
%define x_stride esp + localsize + pushsize + 8 |
|
|
%define x_ptr esp + localsize + pushsize + 4 |
|
|
%define _ip esp + localsize + pushsize + 0 |
|
|
|
|
|
push ebx ; esp + localsize + 16 |
|
|
push esi ; esp + localsize + 8 |
|
|
push edi ; esp + localsize + 4 |
|
|
push ebp ; esp + localsize + 0 |
|
|
|
|
|
%define x_dif esp + localsize - 4 |
|
|
%define y_dif esp + localsize - 8 |
|
|
%define uv_dif esp + localsize - 12 |
|
|
%define fixed_width esp + localsize - 16 |
|
|
%define tmp_height esp + localsize - 20 |
|
44 |
|
|
45 |
sub esp, localsize |
%define localsize 2*PTR_SIZE + STACK |
46 |
|
%ifndef WINDOWS |
47 |
|
%define pushsize 2*PTR_SIZE |
48 |
|
%define shadow 0 |
49 |
|
%else |
50 |
|
%define pushsize 4*PTR_SIZE |
51 |
|
%define shadow 32 + 16 |
52 |
|
%endif |
53 |
|
|
54 |
|
%define prm_vflip dword [_ESP + localsize + pushsize + shadow + 4*PTR_SIZE] |
55 |
|
%define prm_height dword [_ESP + localsize + pushsize + shadow + 3*PTR_SIZE] |
56 |
|
%define prm_width dword [_ESP + localsize + pushsize + shadow + 2*PTR_SIZE] |
57 |
|
%define prm_uv_stride dword [_ESP + localsize + pushsize + shadow + 1*PTR_SIZE] |
58 |
|
|
59 |
|
%ifdef WINDOWS |
60 |
|
%define prm_y_stride dword [_ESP + localsize + pushsize + shadow - 1*PTR_SIZE] |
61 |
|
%define prm_v_ptr [_ESP + localsize + pushsize + shadow - 2*PTR_SIZE] |
62 |
|
|
63 |
|
push _ESI ; esp + localsize + 8 |
64 |
|
push _EDI ; esp + localsize + 4 |
65 |
|
|
66 |
|
%else |
67 |
|
%define prm_y_stride prm6d |
68 |
|
%define prm_v_ptr prm5 |
69 |
|
%endif |
70 |
|
|
71 |
|
%define prm_u_ptr prm4 |
72 |
|
%define prm_y_ptr prm3 |
73 |
|
%define prm_x_stride prm2d |
74 |
|
%define prm_x_ptr prm1 |
75 |
|
%define _ip _ESP + localsize + pushsize + 0 |
76 |
|
|
77 |
|
%define x_dif TMP0 |
78 |
|
|
79 |
|
%else |
80 |
|
|
81 |
|
%define localsize 5*PTR_SIZE + STACK |
82 |
|
%define pushsize 4*PTR_SIZE |
83 |
|
|
84 |
|
%define prm_vflip [_ESP + localsize + pushsize + 10*PTR_SIZE] |
85 |
|
%define prm_height [_ESP + localsize + pushsize + 9*PTR_SIZE] |
86 |
|
%define prm_width [_ESP + localsize + pushsize + 8*PTR_SIZE] |
87 |
|
%define prm_uv_stride [_ESP + localsize + pushsize + 7*PTR_SIZE] |
88 |
|
%define prm_y_stride [_ESP + localsize + pushsize + 6*PTR_SIZE] |
89 |
|
%define prm_v_ptr [_ESP + localsize + pushsize + 5*PTR_SIZE] |
90 |
|
%define prm_u_ptr [_ESP + localsize + pushsize + 4*PTR_SIZE] |
91 |
|
%define prm_y_ptr [_ESP + localsize + pushsize + 3*PTR_SIZE] |
92 |
|
%define prm_x_stride [_ESP + localsize + pushsize + 2*PTR_SIZE] |
93 |
|
%define prm_x_ptr [_ESP + localsize + pushsize + 1*PTR_SIZE] |
94 |
|
%define _ip _ESP + localsize + pushsize + 0 |
95 |
|
|
96 |
|
%define x_dif dword [_ESP + localsize - 5*4] |
97 |
|
|
98 |
|
push _ESI ; esp + localsize + 8 |
99 |
|
push _EDI ; esp + localsize + 4 |
100 |
|
|
101 |
|
%endif |
102 |
|
|
103 |
|
push _EBP ; esp + localsize + 0 |
104 |
|
|
105 |
|
%define y_dif dword [_ESP + localsize - 1*4] |
106 |
|
%define uv_dif dword [_ESP + localsize - 2*4] |
107 |
|
%define fixed_width dword [_ESP + localsize - 3*4] |
108 |
|
%define tmp_height dword [_ESP + localsize - 4*4] |
109 |
|
|
110 |
|
sub _ESP, localsize |
111 |
|
|
112 |
; --- init variables --- |
; --- init variables --- |
113 |
|
|
114 |
mov eax, [width] ; fixed width |
mov eax, prm_width ; fixed width |
115 |
add eax, 15 ; |
add eax, 15 ; |
116 |
and eax, ~15 ; |
and eax, ~15 ; |
117 |
mov [fixed_width],eax ; |
mov fixed_width, eax ; |
118 |
|
|
119 |
mov ebx, [x_stride] ; |
mov ebx, prm_x_stride ; |
120 |
%rep BYTES |
%rep BYTES |
121 |
sub ebx, eax ; |
sub _EBX, _EAX ; |
122 |
%endrep |
%endrep |
123 |
mov [x_dif], ebx ; x_dif = x_stride - BYTES*fixed_width |
mov x_dif, _EBX ; x_dif = x_stride - BYTES*fixed_width |
124 |
|
|
125 |
mov ebx, [y_stride] ; |
mov ebx, prm_y_stride ; |
126 |
sub ebx, eax ; |
sub ebx, eax ; |
127 |
mov [y_dif], ebx ; y_dif = y_stride - fixed_width |
mov y_dif, ebx ; y_dif = y_stride - fixed_width |
128 |
|
|
129 |
mov ebx, [uv_stride] ; |
mov ebx, prm_uv_stride ; |
130 |
mov ecx, eax ; |
mov TMP1, _EAX ; |
131 |
shr ecx, 1 ; |
shr TMP1, 1 ; |
132 |
sub ebx, ecx ; |
sub _EBX, TMP1 ; |
133 |
mov [uv_dif], ebx ; uv_dif = uv_stride - fixed_width/2 |
mov uv_dif, ebx ; uv_dif = uv_stride - fixed_width/2 |
134 |
|
|
135 |
mov esi, [y_ptr] ; $esi$ = y_ptr |
%ifdef ARCH_IS_X86_64 |
136 |
mov edi, [x_ptr] ; $edi$ = x_ptr |
%ifndef WINDOWS |
137 |
mov edx, [x_stride] ; $edx$ = x_stride |
mov TMP1d, prm_x_stride |
138 |
mov ebp, [height] ; $ebp$ = height |
mov _ESI, prm_y_ptr |
139 |
|
mov _EDX, TMP1 |
140 |
|
%else |
141 |
|
mov _ESI, prm_y_ptr |
142 |
|
mov _EDI, prm_x_ptr |
143 |
|
%endif |
144 |
|
%else |
145 |
|
mov _ESI, prm_y_ptr ; $esi$ = y_ptr |
146 |
|
mov _EDI, prm_x_ptr ; $edi$ = x_ptr |
147 |
|
mov edx, prm_x_stride ; $edx$ = x_stride |
148 |
|
%endif |
149 |
|
|
150 |
|
mov ebp, prm_height ; $ebp$ = height |
151 |
|
|
152 |
mov ebx, [vflip] |
mov ebx, prm_vflip |
153 |
or ebx, ebx |
or _EBX, _EBX |
154 |
jz .dont_flip |
jz .dont_flip |
155 |
|
|
156 |
; --- do flipping --- |
; --- do flipping --- |
157 |
|
|
158 |
xor ebx,ebx |
xor _EBX,_EBX |
159 |
%rep BYTES |
%rep BYTES |
160 |
sub ebx, eax |
sub _EBX, _EAX |
161 |
%endrep |
%endrep |
162 |
sub ebx, edx |
sub _EBX, _EDX |
163 |
mov [x_dif], ebx ; x_dif = -BYTES*fixed_width - x_stride |
mov x_dif, _EBX ; x_dif = -BYTES*fixed_width - x_stride |
164 |
|
|
165 |
mov eax, ebp |
mov _EAX, _EBP |
166 |
sub eax, 1 |
sub _EAX, 1 |
167 |
push edx |
%ifdef ARCH_IS_X86_64 |
168 |
|
mov TMP1, _EDX |
169 |
mul edx |
mul edx |
170 |
pop edx |
mov _EDX, TMP1 |
171 |
add edi, eax ; $edi$ += (height-1) * x_stride |
%else |
172 |
|
push _EDX |
173 |
|
mul edx |
174 |
|
pop _EDX |
175 |
|
%endif |
176 |
|
add _EDI, _EAX ; $edi$ += (height-1) * x_stride |
177 |
|
|
178 |
neg edx ; x_stride = -x_stride |
neg _EDX ; x_stride = -x_stride |
179 |
|
|
180 |
.dont_flip |
.dont_flip: |
181 |
|
|
182 |
; --- begin loop --- |
; --- begin loop --- |
183 |
|
|
184 |
mov eax, [y_stride] ; $eax$ = y_stride |
mov eax, prm_y_stride ; $eax$ = y_stride |
185 |
mov ebx, [u_ptr] ; $ebx$ = u_ptr |
mov _EBX, prm_u_ptr ; $ebx$ = u_ptr |
186 |
mov ecx, [v_ptr] ; $ecx$ = v_ptr |
mov _ECX, prm_v_ptr ; $ecx$ = v_ptr |
187 |
|
|
188 |
FUNC %+ _INIT ARG1, ARG2 ; call FUNC_INIT |
FUNC %+ _INIT ARG1, ARG2 ; call FUNC_INIT |
189 |
|
|
190 |
.y_loop |
.y_loop: |
191 |
mov [tmp_height], ebp |
mov tmp_height, ebp |
192 |
mov ebp, [fixed_width] |
mov ebp, fixed_width |
193 |
|
|
194 |
.x_loop |
.x_loop: |
195 |
FUNC ARG1, ARG2 ; call FUNC |
FUNC ARG1, ARG2 ; call FUNC |
196 |
|
|
197 |
add edi, BYTES*PIXELS ; x_ptr += BYTES*PIXELS |
add _EDI, BYTES*PIXELS ; x_ptr += BYTES*PIXELS |
198 |
add esi, PIXELS ; y_ptr += PIXELS |
add _ESI, PIXELS ; y_ptr += PIXELS |
199 |
add ebx, PIXELS/2 ; u_ptr += PIXELS/2 |
add _EBX, PIXELS/2 ; u_ptr += PIXELS/2 |
200 |
add ecx, PIXELS/2 ; v_ptr += PIXELS/2 |
add _ECX, PIXELS/2 ; v_ptr += PIXELS/2 |
201 |
|
|
202 |
sub ebp, PIXELS ; $ebp$ -= PIXELS |
sub _EBP, PIXELS ; $ebp$ -= PIXELS |
203 |
jg .x_loop ; if ($ebp$ > 0) goto .x_loop |
jg .x_loop ; if ($ebp$ > 0) goto .x_loop |
204 |
|
|
205 |
mov ebp, [tmp_height] |
mov ebp, tmp_height |
206 |
add edi, [x_dif] ; x_ptr += x_dif + (VPIXELS-1)*x_stride |
add _EDI, x_dif ; x_ptr += x_dif + (VPIXELS-1)*x_stride |
207 |
add esi, [y_dif] ; y_ptr += y_dif + (VPIXELS-1)*y_stride |
%ifdef ARCH_IS_X86_64 |
208 |
|
mov TMP1d, y_dif |
209 |
|
add _ESI, TMP1 ; y_ptr += y_dif + (VPIXELS-1)*y_stride |
210 |
|
%else |
211 |
|
add _ESI, y_dif ; y_ptr += y_dif + (VPIXELS-1)*y_stride |
212 |
|
%endif |
213 |
|
|
214 |
%rep VPIXELS-1 |
%rep VPIXELS-1 |
215 |
add edi, edx |
add _EDI, _EDX |
216 |
add esi, eax |
add _ESI, _EAX |
217 |
%endrep |
%endrep |
218 |
|
|
219 |
add ebx, [uv_dif] ; u_ptr += uv_dif + ((VPIXELS/2)-1)*uv_stride |
%ifdef ARCH_IS_X86_64 |
220 |
add ecx, [uv_dif] ; v_ptr += uv_dif + ((VPIXELS/2)-1)*uv_stride |
mov TMP1d, uv_dif |
221 |
|
add _EBX, TMP1 ; u_ptr += uv_dif + ((VPIXELS/2)-1)*uv_stride |
222 |
|
add _ECX, TMP1 ; v_ptr += uv_dif + ((VPIXELS/2)-1)*uv_stride |
223 |
|
%else |
224 |
|
add _EBX, uv_dif ; u_ptr += uv_dif + ((VPIXELS/2)-1)*uv_stride |
225 |
|
add _ECX, uv_dif ; v_ptr += uv_dif + ((VPIXELS/2)-1)*uv_stride |
226 |
|
%endif |
227 |
|
|
228 |
%rep (VPIXELS/2)-1 |
%rep (VPIXELS/2)-1 |
229 |
add ebx, [uv_stride] |
%ifdef ARCH_IS_X86_64 |
230 |
add ecx, [uv_stride] |
mov TMP1d, prm_uv_stride |
231 |
|
add _EBX, TMP1 |
232 |
|
add _ECX, TMP1 |
233 |
|
%else |
234 |
|
add _EBX, prm_uv_stride |
235 |
|
add _ECX, prm_uv_stride |
236 |
|
%endif |
237 |
%endrep |
%endrep |
238 |
|
|
239 |
sub ebp, VPIXELS ; $ebp$ -= VPIXELS |
sub _EBP, VPIXELS ; $ebp$ -= VPIXELS |
240 |
jg .y_loop ; if ($ebp$ > 0) goto .y_loop |
jg .y_loop ; if ($ebp$ > 0) goto .y_loop |
241 |
|
|
242 |
; cleanup stack & undef everything |
; cleanup stack & undef everything |
243 |
|
|
244 |
add esp, localsize |
add _ESP, localsize |
245 |
pop ebp |
|
246 |
pop edi |
pop _EBP |
247 |
pop esi |
%ifndef ARCH_IS_X86_64 |
248 |
pop ebx |
pop _EDI |
249 |
|
pop _ESI |
250 |
%undef vflip |
%else |
251 |
%undef height |
%ifdef WINDOWS |
252 |
%undef width |
pop _EDI |
253 |
%undef uv_stride |
pop _ESI |
254 |
%undef y_stride |
%endif |
255 |
%undef v_ptr |
%endif |
256 |
%undef u_ptr |
pop _EBX |
257 |
%undef y_ptr |
|
258 |
%undef x_stride |
%undef prm_vflip |
259 |
%undef x_ptr |
%undef prm_height |
260 |
|
%undef prm_width |
261 |
|
%undef prm_uv_stride |
262 |
|
%undef prm_y_stride |
263 |
|
%undef prm_v_ptr |
264 |
|
%undef prm_u_ptr |
265 |
|
%undef prm_y_ptr |
266 |
|
%undef prm_x_stride |
267 |
|
%undef prm_x_ptr |
268 |
%undef _ip |
%undef _ip |
269 |
%undef x_dif |
%undef x_dif |
270 |
%undef y_dif |
%undef y_dif |
272 |
%undef fixed_width |
%undef fixed_width |
273 |
%undef tmp_height |
%undef tmp_height |
274 |
ret |
ret |
275 |
|
ENDFUNC |
276 |
%undef NAME |
%undef NAME |
277 |
%undef STACK |
%undef STACK |
278 |
%undef BYTES |
%undef BYTES |