Parent Directory | Revision Log
Revision 205 - (view) (download)
1 : | ia64p | 205 | .text |
// ---------------------------------------------------------------------
// sad16_ia64: software-pipelined sum of absolute byte differences between
// two blocks that are 16 bytes wide and 16 rows high.
//
// Inputs (alloc declares 4 input registers):
//   r32  pointer to the first block, called "Cur" in the comments below
//   r33  pointer to the second block, called "Ref" in the comments below
//   r34  value added to every load address after each row (row stride)
//   r35  copied to r17, which is not referenced again in this procedure
// Result:
//   r8   running total of the psad1 results at the time of br.ret
// Saved and restored: ar.lc (in r20) and pr (in r21).
// Other registers written: r9 (previous ar.pfs), r14-r19, r22-r27 and the
// rotating registers and predicates named by .rotr and .rotp.
//
// Method: for every row, three aligned 8-byte words are loaded from each
// block (offsets 0, 8 and 16 from the pointer rounded down to a multiple
// of 8). Adjacent words are combined with shr.u / shl / or into the two
// 8-byte values that start at the original, possibly unaligned, address
// (NOTE(review): this assumes little-endian loads - confirm). psad1 is
// applied to the combined values and the results are added into r8.
// NOTE(review): the third word (offset 16) is loaded even when the pointer
// is already 8-byte aligned, so bytes past the 16-byte row are read.
// ---------------------------------------------------------------------
2 : | .align 16 | ||
3 : | .global sad16_ia64# | ||
4 : | .proc sad16_ia64# | ||
5 : | sad16_ia64: | ||
6 : | |||
// Number of loop rotations assigned to each pipeline step:
// _LL load, _SL shift, _OL or, _PL psad1, _AL add.
7 : | _LL=3 | ||
8 : | _SL=1 | ||
9 : | _OL=1 | ||
10 : | _PL=1 | ||
11 : | _AL=1 | ||
12 : | |||
// 4 inputs, 44 locals, 0 outputs, 48 rotating registers. r9 = old ar.pfs.
13 : | alloc r9=ar.pfs,4,44,0,48 | ||
14 : | |||
// Clear the accumulator.
15 : | mov r8 = r0 | ||
16 : | |||
// Save ar.lc and the predicate registers. Both are restored before br.ret.
17 : | mov r20 = ar.lc | ||
18 : | mov r21 = pr | ||
19 : | |||
20 : | dep.z r22 = r32, 3, 3 // r22 = (r32 & 7) * 8: low 3 address bits multiplied by 8 | ||
21 : | dep.z r23 = r33, 3, 3 // r23 = (r33 & 7) * 8. r22 and r23 are the right-shift counts | ||
22 : | |||
23 : | and r14 = -8, r32 // Copy the parameters into the lower registers | ||
24 : | and r15 = -8, r33 // Cur and Ref are ANDed with 1111...1000 (rounded down to a multiple of 8) | ||
25 : | mov r16 = r34 | ||
26 : | mov r17 = r35 | ||
27 : | ;; | ||
28 : | add r18 = 8, r14 // Precompute addresses: second aligned word of Cur and Ref | ||
29 : | add r19 = 8, r15 | ||
30 : | |||
31 : | sub r24 = 64, r22 // Compute the left-shift counts (64 minus the right-shift count) | ||
32 : | sub r25 = 64, r23 | ||
33 : | |||
34 : | add r26 = 16, r14 // Precompute addresses: third aligned word of Cur and Ref | ||
35 : | add r27 = 16, r15 | ||
36 : | |||
37 : | // Initialize the loop counter | ||
38 : | mov ar.lc = 15 // Run the loop 16 times with the first stage active | ||
39 : | mov ar.ec = _LL + _SL + _OL + _PL + _AL + _AL // Epilogue count = number of pipeline stages: 3+1+1+1+1+1 = 8 (the original comment said nine) | ||
40 : | |||
41 : | // Reset the rotating predicate registers and set p16 to 1 | ||
42 : | mov pr.rot = 1 << 16 | ||
43 : | ;; | ||
44 : | |||
45 : | // Name the rotating registers (.rotr) and the rotating predicates (.rotp) | ||
// NOTE(review): several arrays below are declared with a single element
// (_shru2, _shl2, _shru3, _shl3, _shru4, _or1, _or2, _or3). The loop then
// reads element [0] of the array declared right after them. The "+ 1"
// remarks in the loop comments suggest this is intentional and relies on
// .rotr handing out consecutive registers in declaration order, so that
// the next name is the rotated copy of the previous one. Confirm against
// the assembler documentation before changing the order or the sizes.
46 : | .rotr _ald1[_LL+1], _ald2[_LL+1], _ald3[_LL+1], _ald4[_LL+1], _ald5[_LL+1], _ald6[_LL+1], _shru1[_SL+1], _shl1[_SL+1], _shru2[_SL], _shl2[_SL], _shru3[_SL], _shl3[_SL], _shru4[_SL], _shl4[_SL+1], _or1[_OL], _or2[_OL], _or3[_OL], _or4[_OL+1], _psadr1[_PL+1], _psadr2[_PL+1], _addr1[_AL+1] | ||
47 : | .rotp _aldp[_LL], _shp[_SL], _orp[_OL], _psadrp[_PL], _addrp1[_AL], _addrp2[_AL] | ||
48 : | |||
// Loop body: one pass per row. All six pipeline steps are issued in every
// pass and each step is enabled by its own rotating predicate.
49 : | .L_loop_16: | ||
50 : | {.mmi | ||
51 : | (_aldp[0]) ld8 _ald1[0] = [r14], r16 // Cur, first 8 bytes. The address is advanced by r16 | ||
52 : | (_aldp[0]) ld8 _ald2[0] = [r18], r16 // Cur, second 8 bytes | ||
53 : | (_psadrp[0]) psad1 _psadr1[0] = _or2[0], _or4[0] // psad1 of two combined words (see the note at .rotr) | ||
54 : | } | ||
55 : | {.mmi | ||
56 : | (_aldp[0]) ld8 _ald3[0] = [r26], r16 // Cur, third 8 bytes | ||
57 : | (_aldp[0]) ld8 _ald4[0] = [r15], r16 // Ref, first 8 bytes | ||
58 : | (_psadrp[0]) psad1 _psadr2[0] = _or3[0], _or4[_OL] // original note: "_or2 + 1" | ||
59 : | } | ||
60 : | {.mmi | ||
61 : | (_aldp[0]) ld8 _ald5[0] = [r19], r16 // Ref, second 8 bytes | ||
62 : | (_aldp[0]) ld8 _ald6[0] = [r27], r16 // Ref, third 8 bytes | ||
63 : | (_shp[0]) shr.u _shru1[0] = _ald1[_LL], r22 | ||
64 : | } | ||
65 : | {.mii | ||
66 : | (_orp[0]) or _or1[0] = _shl2[0], _shru3[0] // original note: "_shru2 + 1 and _shl2 + 1" | ||
67 : | (_shp[0]) shl _shl1[0] = _ald2[_LL], r24 | ||
68 : | (_shp[0]) shr.u _shru2[0] = _ald2[_LL], r22 | ||
69 : | } | ||
70 : | {.mii | ||
71 : | (_orp[0]) or _or2[0] = _shl3[0], _shru4[0] // original note: "_shru3 + 1 and _shl3 + 1" | ||
72 : | (_shp[0]) shl _shl2[0] = _ald3[_LL], r24 | ||
73 : | (_shp[0]) shr.u _shru3[0] = _ald4[_LL], r23 | ||
74 : | } | ||
75 : | {.mii | ||
76 : | (_orp[0]) or _or3[0] = _shl4[0], _shl4[_SL] // original note: "_shru4 + 1 and _shl4 + 1" | ||
77 : | (_shp[0]) shl _shl3[0] = _ald5[_LL], r25 | ||
78 : | (_shp[0]) shr.u _shru4[0] = _ald5[_LL], r23 | ||
79 : | } | ||
80 : | {.mmi | ||
81 : | (_orp[0]) or _or4[0] = _shru1[_SL], _shl1[_SL] | ||
82 : | (_shp[0]) shl _shl4[0]= _ald6[_LL], r25 | ||
83 : | } | ||
84 : | {.mmb | ||
85 : | (_addrp1[0]) add _addr1[0] = _psadr1[_PL], _psadr2[_PL] // Add the two psad1 results of one row | ||
86 : | (_addrp2[0]) add r8 = r8, _addr1[_AL] | ||
87 : | br.ctop.sptk.few .L_loop_16 | ||
88 : | ;; | ||
89 : | } | ||
90 : | // Restore the saved registers | ||
91 : | mov ar.lc = r20 | ||
92 : | mov pr = r21,-1 | ||
93 : | br.ret.sptk.many rp | ||
94 : | .endp sad16_ia64# | ||
95 : | |||
96 : | |||
97 : | .align 16 | ||
// ---------------------------------------------------------------------
// sad8_ia64: software-pipelined sum of absolute byte differences between
// two blocks that are 8 bytes wide and 8 rows high.
//
// Inputs (alloc declares 3 input registers):
//   r32  pointer to the first block ("Cur" in the comments below)
//   r33  pointer to the second block ("Ref" in the comments below)
//   r34  value added to every load address after each row (row stride)
// Result:
//   r8   running total of the psad1 results at the time of br.ret
// Saved and restored: ar.lc (in r20) and pr (in r21).
// Other registers written: r9 (previous ar.pfs), r14-r16, r18, r19,
// r22-r25 and the rotating registers and predicates named below.
//
// Method: for every row, two aligned 8-byte words are loaded from each
// block (offsets 0 and 8 from the pointer rounded down to a multiple of 8)
// and combined with shr.u / shl / or into the 8-byte value that starts at
// the original address (NOTE(review): assumes little-endian loads -
// confirm). psad1 of the two combined values is added into r8.
// ---------------------------------------------------------------------
98 : | .global sad8_ia64# | ||
99 : | .proc sad8_ia64# | ||
100 : | |||
101 : | sad8_ia64: | ||
102 : | |||
// Number of loop rotations assigned to each pipeline step:
// LL load, SL shift, OL or, PL psad1, AL add.
103 : | LL=3 | ||
104 : | SL=1 | ||
105 : | OL=1 | ||
106 : | PL=1 | ||
107 : | AL=1 | ||
108 : | |||
// 3 inputs, 29 locals, 0 outputs, 32 rotating registers. r9 = old ar.pfs.
109 : | alloc r9=ar.pfs,3,29,0,32 | ||
// Save ar.lc and the predicate registers. Both are restored before br.ret.
110 : | mov r20 = ar.lc | ||
111 : | mov r21 = pr | ||
112 : | |||
113 : | dep.z r22 = r32, 3, 3 // r22 = (r32 & 7) * 8: low 3 address bits multiplied by 8 | ||
114 : | dep.z r23 = r33, 3, 3 // r23 = (r33 & 7) * 8. r22 and r23 are the right-shift counts | ||
115 : | |||
116 : | mov r8 = r0 // Clear the accumulator | ||
117 : | and r14 = -8, r32 // r14 = r32 AND 0xFFFFFFFFFFFFFFF8 | ||
118 : | and r15 = -8, r33 // r15 = r33 AND 0xFFFFFFFFFFFFFFF8 | ||
119 : | mov r16 = r34 | ||
120 : | // mov r17 = r35 (disabled in the original source) | ||
121 : | ;; | ||
122 : | |||
// Addresses of the second aligned word of Cur and Ref.
123 : | add r18 = 8, r14 | ||
124 : | add r19 = 8, r15 | ||
125 : | |||
// Left-shift counts: 64 minus the right-shift count.
126 : | sub r24 = 64, r22 | ||
127 : | sub r25 = 64, r23 | ||
128 : | |||
129 : | // Initialize the loop counter | ||
130 : | mov ar.lc = 7 // ar.lc = 7 gives 8 passes with the first stage active (the original comment said 7) | ||
131 : | mov ar.ec = LL + SL + OL + PL + AL // Epilogue count = number of pipeline stages: 3+1+1+1+1 = 7 (the original comment said ten) | ||
132 : | |||
133 : | // Reset the rotating predicate registers and set p16 to 1 | ||
134 : | mov pr.rot = 1 << 16 | ||
135 : | ;; | ||
// Name the rotating registers and the rotating stage predicates.
136 : | .rotr ald1[LL+1], ald2[LL+1], ald3[LL+1], ald4[LL+1], shru1[SL+1], shl1[SL+1], shru2[SL+1], shl2[SL+1], or1[OL+1], or2[OL+1], psadr[PL+1], addr[AL+1] | ||
137 : | .rotp aldp[LL], shp[SL], orp[OL], psadrp[PL], addrp[AL] | ||
// Loop body: one pass per row. Each step is enabled by its own rotating
// predicate and reads the value its producer wrote LL, SL, OL or PL
// rotations earlier.
138 : | .L_loop_8: | ||
139 : | {.mmi | ||
140 : | (aldp[0]) ld8 ald1[0] = [r14], r16 // Load Cur. The address is advanced by r16 | ||
141 : | (aldp[0]) ld8 ald2[0] = [r18], r16 | ||
142 : | (shp[0]) shr.u shru1[0] = ald1[LL], r22 // Merge: part taken from the first Cur word | ||
143 : | } | ||
144 : | {.mii | ||
145 : | (orp[0]) or or1[0] = shru1[SL], shl1[SL] | ||
146 : | (shp[0]) shl shl1[0] = ald2[LL], r24 | ||
147 : | (shp[0]) shr.u shru2[0] = ald3[LL], r23 // Merge: part taken from the first Ref word | ||
148 : | } | ||
149 : | {.mmi | ||
150 : | (aldp[0]) ld8 ald3[0] = [r15], r16 // Load Ref. The address is advanced by r16 | ||
151 : | (aldp[0]) ld8 ald4[0] = [r19], r16 | ||
152 : | (shp[0]) shl shl2[0] = ald4[LL], r25 | ||
153 : | } | ||
154 : | {.mmi | ||
155 : | (orp[0]) or or2[0] = shru2[SL], shl2[SL] | ||
156 : | (addrp[0]) add r8 = r8, psadr[PL] | ||
157 : | (psadrp[0]) psad1 psadr[0] = or1[OL], or2[OL] | ||
158 : | } | ||
159 : | {.mbb | ||
160 : | br.ctop.sptk.few .L_loop_8 | ||
161 : | ;; | ||
162 : | } | ||
163 : | |||
// Restore the saved registers and return.
164 : | mov ar.lc = r20 | ||
165 : | mov pr = r21,-1 | ||
166 : | br.ret.sptk.many b0 | ||
167 : | .endp sad8_ia64# | ||
168 : | |||
169 : | |||
170 : | .common sad16bi#,8,8 | ||
// ---------------------------------------------------------------------
// sad16bi_ia64: for a block of 16 x 16 bytes, sums
//     abs( a[x] - ((b[x] + c[x] + 1) >> 1) )
// where a is read through r32 and b, c are read through r33 and r34.
// Scalar code, one byte at a time. It does not use rotating registers.
//
// Inputs:
//   r32  pointer to the block compared against the average
//   r33  pointer to the first averaged block
//   r34  pointer to the second averaged block
//   r35  row stride. Only the low 32 bits are used (zxt4)
// Result:
//   r8   accumulated sum at the time of br.ret
// Saved and restored: ar.lc (in r2).
// Other registers written: r14-r23, p6, p7. The input registers r32-r35
// are modified as well.
// The common symbol sad16bi declared above is not referenced by the code
// visible in this procedure.
// ---------------------------------------------------------------------
171 : | .align 16 | ||
172 : | .global sad16bi_ia64# | ||
173 : | .proc sad16bi_ia64# | ||
174 : | sad16bi_ia64: | ||
175 : | .prologue | ||
176 : | .save ar.lc, r2 | ||
177 : | mov r2 = ar.lc | ||
178 : | .body | ||
// r35 = stride zero-extended from 32 bits, r8 = sum, r23 = row counter,
// r22 = 255 (upper clamp limit).
179 : | zxt4 r35 = r35 | ||
180 : | mov r8 = r0 | ||
181 : | mov r23 = r0 | ||
182 : | addl r22 = 255, r0 | ||
// Row loop. ar.lc = 7 gives 8 passes of .L105, two bytes per pass.
// r19, r20 and r21 walk along the current row of r32, r33 and r34.
183 : | .L21: | ||
184 : | addl r14 = 7, r0 | ||
185 : | mov r19 = r32 | ||
186 : | mov r21 = r34 | ||
187 : | mov r20 = r33 | ||
188 : | ;; | ||
189 : | mov ar.lc = r14 | ||
190 : | ;; | ||
// First byte of the pair: r16 = (b + c + 1) >> 1.
191 : | .L105: | ||
192 : | mov r17 = r20 | ||
193 : | mov r18 = r21 | ||
194 : | ;; | ||
195 : | ld1 r14 = [r17], 1 | ||
196 : | ld1 r15 = [r18], 1 | ||
197 : | ;; | ||
198 : | add r14 = r14, r15 | ||
199 : | ;; | ||
200 : | adds r14 = 1, r14 | ||
201 : | ;; | ||
202 : | shr.u r16 = r14, 1 | ||
203 : | ;; | ||
// Clamp r16 to 0..255: below zero becomes 0, above r22 becomes 255.
// NOTE(review): r16 is the rounded average of two ld1 results, so both
// clamps look unreachable. Confirm before removing them.
204 : | cmp4.le p6, p7 = r0, r16 | ||
205 : | ;; | ||
206 : | (p7) mov r16 = r0 | ||
207 : | (p7) br.cond.dpnt .L96 | ||
208 : | ;; | ||
209 : | cmp4.ge p6, p7 = r22, r16 | ||
210 : | ;; | ||
211 : | (p7) addl r16 = 255, r0 | ||
// r8 += abs(a - r16). p6 selects the (r16 - a) form when a - r16 <= 0.
// r20 and r21 are advanced by the two bytes handled in this pass.
212 : | .L96: | ||
213 : | ld1 r14 = [r19] | ||
214 : | adds r20 = 2, r20 | ||
215 : | adds r21 = 2, r21 | ||
216 : | ;; | ||
217 : | sub r15 = r14, r16 | ||
218 : | ;; | ||
219 : | cmp4.ge p6, p7 = 0, r15 | ||
220 : | ;; | ||
221 : | (p6) sub r14 = r16, r14 | ||
222 : | (p7) add r8 = r8, r15 | ||
223 : | ;; | ||
224 : | (p6) add r8 = r8, r14 | ||
// Second byte of the pair: r17 and r18 were post-incremented above and
// now address the next byte of each averaged block. r17 is then reused
// as the address of the second byte of the r32 row.
225 : | ld1 r15 = [r18] | ||
226 : | ld1 r14 = [r17] | ||
227 : | ;; | ||
228 : | add r14 = r14, r15 | ||
229 : | adds r17 = 1, r19 | ||
230 : | ;; | ||
231 : | adds r14 = 1, r14 | ||
232 : | ;; | ||
233 : | shr.u r16 = r14, 1 | ||
234 : | ;; | ||
// Same 0..255 clamp as for the first byte.
235 : | cmp4.le p6, p7 = r0, r16 | ||
236 : | ;; | ||
237 : | (p7) mov r16 = r0 | ||
238 : | (p7) br.cond.dpnt .L102 | ||
239 : | ;; | ||
240 : | cmp4.ge p6, p7 = r22, r16 | ||
241 : | ;; | ||
242 : | (p7) addl r16 = 255, r0 | ||
// r8 += abs(a - r16) for the second byte. r19 is advanced by two bytes.
243 : | .L102: | ||
244 : | ld1 r14 = [r17] | ||
245 : | adds r19 = 2, r19 | ||
246 : | ;; | ||
247 : | sub r15 = r14, r16 | ||
248 : | ;; | ||
249 : | cmp4.ge p6, p7 = 0, r15 | ||
250 : | ;; | ||
251 : | (p7) add r8 = r8, r15 | ||
252 : | (p6) sub r14 = r16, r14 | ||
253 : | ;; | ||
254 : | (p6) add r8 = r8, r14 | ||
255 : | br.cloop.sptk.few .L105 | ||
// End of row: count it, advance all three pointers by the stride and
// repeat while the row counter is at most 15 (16 rows in total).
256 : | adds r23 = 1, r23 | ||
257 : | add r32 = r32, r35 | ||
258 : | add r33 = r33, r35 | ||
259 : | add r34 = r34, r35 | ||
260 : | ;; | ||
261 : | cmp4.geu p6, p7 = 15, r23 | ||
262 : | (p6) br.cond.dptk .L21 | ||
// Restore ar.lc and return.
263 : | mov ar.lc = r2 | ||
264 : | br.ret.sptk.many b0 | ||
265 : | .endp sad16bi_ia64# | ||
266 : | |||
267 : | |||
268 : | .common dev16#,8,8 | ||
// ---------------------------------------------------------------------
// dev16_ia64: for a block of 16 x 16 bytes, computes
//     mean = (sum of all 256 bytes) >> 8
// and then sums abs(byte - mean) over the same block.
// Scalar code in two passes. It does not use rotating registers.
//
// Inputs:
//   r32  pointer to the block
//   r33  row stride. Only the low 32 bits are used (zxt4)
// Result:
//   r8   accumulated sum of absolute deviations at the time of br.ret
// Saved and restored: ar.lc (in r2).
// Other registers written: r14-r25, p6, p7. r33 is overwritten with its
// zero-extended value.
// The common symbol dev16 declared above is not referenced by the code
// visible in this procedure.
// ---------------------------------------------------------------------
269 : | .align 16 | ||
270 : | .global dev16_ia64# | ||
271 : | .proc dev16_ia64# | ||
272 : | dev16_ia64: | ||
273 : | .prologue | ||
274 : | zxt4 r33 = r33 | ||
275 : | .save ar.lc, r2 | ||
276 : | mov r2 = ar.lc | ||
277 : | .body | ||
// r21 = byte sum, r8 = result, r23 = start of current row,
// r24 = row counter, r25 = copy of the stride.
278 : | mov r21 = r0 | ||
279 : | mov r8 = r0 | ||
280 : | mov r23 = r32 | ||
281 : | mov r24 = r0 | ||
282 : | ;; | ||
283 : | mov r25 = r33 | ||
// Pass 1, row loop: r22 counts the bytes handled in this row and r20
// walks along the row.
284 : | .L50: | ||
285 : | mov r22 = r0 | ||
286 : | mov r20 = r23 | ||
287 : | ;; | ||
// Pass 1, inner loop: add the 8 bytes at r20+0 .. r20+7 to r21, then
// advance by 8. Runs while r22 is at most 15, so twice per row.
288 : | .L54: | ||
289 : | mov r16 = r20 | ||
290 : | adds r14 = 2, r20 | ||
291 : | adds r15 = 3, r20 | ||
292 : | ;; | ||
293 : | ld1 r17 = [r16], 1 | ||
294 : | ld1 r18 = [r14] | ||
295 : | ld1 r19 = [r15] | ||
296 : | ;; | ||
297 : | ld1 r14 = [r16] | ||
298 : | add r21 = r17, r21 | ||
299 : | adds r15 = 4, r20 | ||
300 : | ;; | ||
301 : | add r21 = r14, r21 | ||
302 : | ld1 r16 = [r15] | ||
303 : | adds r22 = 8, r22 | ||
304 : | ;; | ||
305 : | add r21 = r18, r21 | ||
306 : | adds r14 = 5, r20 | ||
307 : | adds r15 = 6, r20 | ||
308 : | ;; | ||
309 : | add r21 = r19, r21 | ||
310 : | ld1 r17 = [r14] | ||
311 : | ld1 r18 = [r15] | ||
312 : | ;; | ||
313 : | add r21 = r16, r21 | ||
314 : | adds r14 = 7, r20 | ||
// p6 is set here and consumed by the branch at the end of the inner loop.
315 : | cmp4.geu p6, p7 = 15, r22 | ||
316 : | ;; | ||
317 : | add r21 = r17, r21 | ||
318 : | ld1 r15 = [r14] | ||
319 : | adds r20 = 8, r20 | ||
320 : | ;; | ||
321 : | add r21 = r18, r21 | ||
322 : | ;; | ||
323 : | add r21 = r15, r21 | ||
324 : | (p6) br.cond.dptk .L54 | ||
// End of row: next row, repeat while the row counter is at most 15.
325 : | adds r24 = 1, r24 | ||
326 : | add r23 = r23, r25 | ||
327 : | ;; | ||
328 : | cmp4.geu p6, p7 = 15, r24 | ||
329 : | (p6) br.cond.dptk .L50 | ||
// mean: bits 8..31 of the byte sum, i.e. the sum shifted right by 8.
// r23 and r24 are reset for the second pass and r21 now holds the mean.
330 : | extr.u r14 = r21, 8, 24 | ||
331 : | mov r23 = r32 | ||
332 : | mov r24 = r0 | ||
333 : | ;; | ||
334 : | mov r21 = r14 | ||
// Pass 2, row loop: ar.lc = 3 gives 4 passes of .L144, four bytes each.
335 : | .L60: | ||
336 : | addl r14 = 3, r0 | ||
337 : | mov r17 = r23 | ||
338 : | ;; | ||
339 : | mov ar.lc = r14 | ||
340 : | ;; | ||
// Pass 2, inner loop: for each of the 4 bytes at r17+0 .. r17+3 add
// abs(byte - mean) to r8. p6 selects the (mean - byte) form when
// byte - mean <= 0, otherwise p7 adds (byte - mean).
341 : | .L144: | ||
342 : | mov r16 = r17 | ||
343 : | ;; | ||
344 : | ld1 r14 = [r16], 1 | ||
345 : | ;; | ||
346 : | sub r15 = r14, r21 | ||
347 : | ;; | ||
348 : | cmp4.ge p6, p7 = 0, r15 | ||
349 : | ;; | ||
350 : | (p7) add r8 = r8, r15 | ||
351 : | (p6) sub r14 = r21, r14 | ||
352 : | ;; | ||
353 : | (p6) add r8 = r8, r14 | ||
// Second byte (r16 was post-incremented by the first load).
354 : | ld1 r14 = [r16] | ||
355 : | ;; | ||
356 : | sub r15 = r14, r21 | ||
357 : | adds r16 = 2, r17 | ||
358 : | ;; | ||
359 : | cmp4.ge p6, p7 = 0, r15 | ||
360 : | ;; | ||
361 : | (p7) add r8 = r8, r15 | ||
362 : | (p6) sub r14 = r21, r14 | ||
363 : | ;; | ||
364 : | (p6) add r8 = r8, r14 | ||
// Third byte.
365 : | ld1 r14 = [r16] | ||
366 : | ;; | ||
367 : | sub r15 = r14, r21 | ||
368 : | adds r16 = 3, r17 | ||
369 : | ;; | ||
370 : | cmp4.ge p6, p7 = 0, r15 | ||
371 : | adds r17 = 4, r17 | ||
372 : | ;; | ||
373 : | (p7) add r8 = r8, r15 | ||
374 : | (p6) sub r14 = r21, r14 | ||
375 : | ;; | ||
376 : | (p6) add r8 = r8, r14 | ||
// Fourth byte. r17 has already been advanced to the next group of four.
377 : | ld1 r14 = [r16] | ||
378 : | ;; | ||
379 : | sub r15 = r14, r21 | ||
380 : | ;; | ||
381 : | cmp4.ge p6, p7 = 0, r15 | ||
382 : | ;; | ||
383 : | (p7) add r8 = r8, r15 | ||
384 : | (p6) sub r14 = r21, r14 | ||
385 : | ;; | ||
386 : | (p6) add r8 = r8, r14 | ||
387 : | br.cloop.sptk.few .L144 | ||
// End of row: next row, repeat while the row counter is at most 15.
388 : | adds r24 = 1, r24 | ||
389 : | add r23 = r23, r33 | ||
390 : | ;; | ||
391 : | cmp4.geu p6, p7 = 15, r24 | ||
392 : | (p6) br.cond.dptk .L60 | ||
// Restore ar.lc and return.
393 : | mov ar.lc = r2 | ||
394 : | br.ret.sptk.many b0 | ||
395 : | .endp dev16_ia64# |
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |