[svn] / trunk / xvidcore / src / motion / ia64_asm / sad_ia64.s Repository:
ViewVC logotype

Annotation of /trunk/xvidcore/src/motion/ia64_asm/sad_ia64.s

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1855 - (view) (download)

1 : Isibaar 1855 // ****************************************************************************
2 :     // *
3 :     // * XVID MPEG-4 VIDEO CODEC
4 :     // * - IA64 sum of absolute differences -
5 :     // *
6 :     // * Copyright(C) 2002 Hannes Jütting, Christopher Özbek
7 :     // *
8 :     // * This program is free software; you can redistribute it and/or modify it
9 :     // * under the terms of the GNU General Public License as published by
10 :     // * the Free Software Foundation; either version 2 of the License, or
11 :     // * (at your option) any later version.
12 :     // *
13 :     // * This program is distributed in the hope that it will be useful,
14 :     // * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 :     // * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 :     // * GNU General Public License for more details.
17 :     // *
18 :     // * You should have received a copy of the GNU General Public License
19 :     // * along with this program; if not, write to the Free Software
20 :     // * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 :     // *
22 :     // * $Id: sad_ia64.s,v 1.8 2009-02-19 17:07:29 Isibaar Exp $
23 :     // *
24 :     // ***************************************************************************/
25 :     //
26 :     // ****************************************************************************
27 :     // *
28 :     // * sad_ia64.s, IA-64 sum of absolute differences
29 :     // *
30 :     // * This version was implemented during an IA-64 practical training at
31 :     // * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/)
32 :     // *
33 :     // ****************************************************************************
34 :    
35 : ia64p 250 // ------------------------------------------------------------------------------
36 :     // *
37 :     // * Optimized Assembler Versions of sad8 and sad16
38 :     // *
39 :     // ------------------------------------------------------------------------------
40 :     // *
41 :     // * Hannes Jütting and Christopher Özbek
42 :     // * {s_juetti,s_oezbek}@ira.uka.de
43 :     // *
44 :     // * Programmed for the IA64 laboratory held at University Karlsruhe 2002
45 :     // * http://www.info.uni-karlsruhe.de/~rubino/ia64p/
46 :     // *
47 :     // ------------------------------------------------------------------------------
48 :     // *
49 :     // * These are the optimized assembler versions of sad8 and sad16, which calculate
50 :     // * the sum of absolute differences between two 8x8/16x16 block matrices.
51 :     // *
52 :     // * Our approach uses:
53 :     // * - The Itanium instruction psad1, which computes the sum of absolute differences in hardware.
54 :     // * - Modulo-scheduled loops, the best way to unroll loops on the IA64
55 :     // * EPIC architecture
56 :     // * - Alignment resolving to avoid memory faults
57 :     // *
58 :     // ------------------------------------------------------------------------------
59 : edgomez 851
60 : ia64p 250
61 : edgomez 851
62 :    
63 : ia64p 205 .common sad16bi#,8,8
64 :     .align 16
65 :     .global sad16bi_ia64#
66 :     .proc sad16bi_ia64#
    // sad16bi_ia64
    //   Walks a 16x16 byte block and accumulates, for every byte position,
    //   |a - ((b + c + 1) >> 1)|, where a is read through r32 and b, c are
    //   read through r33 and r34.
    //   In:  r32, r33, r34 = byte pointers (presumably cur, ref1, ref2 -- confirm with caller)
    //        r35           = row step added to all three pointers after each row
    //                        (only its low 32 bits are used)
    //   Out: r8            = accumulated sum
    //   ar.lc is saved in r2 on entry and restored before returning.
    //   Scratch used: r14-r23, p6, p7.
67 :     sad16bi_ia64:
68 :     .prologue
69 :     .save ar.lc, r2
70 :     mov r2 = ar.lc // keep the caller's loop counter
71 :     .body
72 :     zxt4 r35 = r35 // zero-extend the 32-bit row step
73 :     mov r8 = r0 // r8 = running sum
74 :     mov r23 = r0 // r23 = rows completed
75 :     addl r22 = 255, r0 // r22 = 255, upper bound for the clamp below
76 :     .L21: // ---- start of one row ----
77 :     addl r14 = 7, r0 // 8 inner iterations, two bytes each = 16 bytes per row
78 :     mov r19 = r32 // r19 walks the row read through r32
79 :     mov r21 = r34 // r21 walks the row read through r34
80 :     mov r20 = r33 // r20 walks the row read through r33
81 :     ;;
82 :     mov ar.lc = r14
83 :     ;;
84 :     .L105: // ---- two bytes per iteration ----
85 :     mov r17 = r20
86 :     mov r18 = r21
87 :     ;;
88 :     ld1 r14 = [r17], 1 // first byte of the pair; r17/r18 now point at the second byte
89 :     ld1 r15 = [r18], 1
90 :     ;;
91 :     add r14 = r14, r15
92 :     ;;
93 :     adds r14 = 1, r14 // +1 so the shift below rounds up
94 :     ;;
95 :     shr.u r16 = r14, 1 // r16 = (b + c + 1) >> 1
96 :     ;;
97 :     cmp4.le p6, p7 = r0, r16 // p7 = average < 0 (signed 32-bit)
98 :     ;;
99 :     (p7) mov r16 = r0 // clamp low to 0 ...
100 :     (p7) br.cond.dpnt .L96 // ... and skip the upper-bound check
101 :     ;;
102 :     cmp4.ge p6, p7 = r22, r16 // p7 = average > 255
103 :     ;;
104 :     (p7) addl r16 = 255, r0 // clamp high to 255
105 :     .L96:
106 :     ld1 r14 = [r19] // a = first byte of the pair from the r32 row
107 :     adds r20 = 2, r20 // advance both averaged-row pointers for the next iteration
108 :     adds r21 = 2, r21
109 :     ;;
110 :     sub r15 = r14, r16 // r15 = a - average
111 :     ;;
112 :     cmp4.ge p6, p7 = 0, r15 // p6 = difference <= 0, p7 = difference > 0
113 :     ;;
114 :     (p6) sub r14 = r16, r14 // non-positive case: use average - a instead
115 :     (p7) add r8 = r8, r15
116 :     ;;
117 :     (p6) add r8 = r8, r14
118 :     ld1 r15 = [r18] // second byte of the pair (pointers were post-incremented above)
119 :     ld1 r14 = [r17]
120 :     ;;
121 :     add r14 = r14, r15
122 :     adds r17 = 1, r19 // r17 = address of the second byte in the r32 row
123 :     ;;
124 :     adds r14 = 1, r14
125 :     ;;
126 :     shr.u r16 = r14, 1 // rounded-up average of the second byte pair
127 :     ;;
128 :     cmp4.le p6, p7 = r0, r16 // same 0..255 clamp as for the first byte
129 :     ;;
130 :     (p7) mov r16 = r0
131 :     (p7) br.cond.dpnt .L102
132 :     ;;
133 :     cmp4.ge p6, p7 = r22, r16
134 :     ;;
135 :     (p7) addl r16 = 255, r0
136 :     .L102:
137 :     ld1 r14 = [r17]
138 :     adds r19 = 2, r19 // advance the r32-row pointer past both bytes
139 :     ;;
140 :     sub r15 = r14, r16
141 :     ;;
142 :     cmp4.ge p6, p7 = 0, r15 // pick the non-negative one of (a - avg) / (avg - a)
143 :     ;;
144 :     (p7) add r8 = r8, r15
145 :     (p6) sub r14 = r16, r14
146 :     ;;
147 :     (p6) add r8 = r8, r14
148 :     br.cloop.sptk.few .L105 // repeat while ar.lc != 0 (8 passes in total)
149 :     adds r23 = 1, r23 // one more row done
150 :     add r32 = r32, r35 // step all three pointers to the next row
151 :     add r33 = r33, r35
152 :     add r34 = r34, r35
153 :     ;;
154 :     cmp4.geu p6, p7 = 15, r23 // p6 while fewer than 16 rows are done
155 :     (p6) br.cond.dptk .L21
156 :     mov ar.lc = r2 // restore the caller's loop counter
157 :     br.ret.sptk.many b0
158 :     .endp sad16bi_ia64#
159 :    
160 :    
161 : ia64p 230
162 :    
163 :    
164 :    
165 :    
166 :    
167 :     .text
168 : ia64p 205 .align 16
169 :     .global dev16_ia64#
170 :     .proc dev16_ia64#
171 : ia64p 230 .auto
    // dev16_ia64
    //   In:  in0 = address of a block of 16 rows x 16 bytes
    //        in1 = stride (added to the row address after each row)
    //   Out: ret0 = sum over all 256 bytes of |byte - mean|, where
    //               mean = (sum of the 256 bytes) >> 8
    //   Method: every row is loaded as two 8-byte words into c[0..31]; a first
    //   pass of psad1 against r0 (zero) yields the byte sums, a second pass of
    //   psad1 against the broadcast mean yields the deviation.  Four partial
    //   accumulators per pass keep the add chains independent.
    //   Each 8-byte half row is fetched with ld8 -- presumably the block is
    //   expected to be 8-byte aligned; confirm with callers.
172 : ia64p 205 dev16_ia64:
173 : ia64p 230 // renamings for better readability
174 :     stride = r18
175 :     pfs = r19 //for saving previous function state
176 :     cura0 = r20 //address of first 8-byte block of cur
177 :     cura1 = r21 //address of second 8-byte block of cur
178 :     mean0 = r22 //registers for calculating the sum in parallel
179 :     mean1 = r23
180 :     mean2 = r24
181 :     mean3 = r25
182 :     dev0 = r26 //same for the deviation
183 :     dev1 = r27
184 :     dev2 = r28
185 :     dev3 = r29
186 :    
187 : ia64p 205 .body
188 : ia64p 230 alloc pfs = ar.pfs, 2, 38, 0, 40 // 2 inputs + 38 locals, all 40 rotating; pfs is not read again in this procedure
189 :    
190 :     mov cura0 = in0 // copy the arguments out first: the c[] loads below reuse the input registers
191 :     mov stride = in1
192 :     add cura1 = 8, cura0 // second 8-byte half of each row
193 :    
194 :     .rotr c[32], psad[8] // just using rotating registers to get an array ;-)
195 :    
196 :     .explicit
197 :     {.mmi
198 :     ld8 c[0] = [cura0], stride // load them ...
199 :     ld8 c[1] = [cura1], stride
200 :     ;;
201 :     }
202 :     {.mmi
203 :     ld8 c[2] = [cura0], stride
204 :     ld8 c[3] = [cura1], stride
205 :     ;;
206 :     }
207 :     {.mmi
208 :     ld8 c[4] = [cura0], stride
209 :     ld8 c[5] = [cura1], stride
210 : ia64p 205 ;;
211 : ia64p 230 }
212 :     {.mmi
213 :     ld8 c[6] = [cura0], stride
214 :     ld8 c[7] = [cura1], stride
215 : ia64p 205 ;;
216 : ia64p 230 }
217 :     {.mmi
218 :     ld8 c[8] = [cura0], stride
219 :     ld8 c[9] = [cura1], stride
220 : ia64p 205 ;;
221 : ia64p 230 }
222 :     {.mmi
223 :     ld8 c[10] = [cura0], stride
224 :     ld8 c[11] = [cura1], stride
225 : ia64p 205 ;;
226 : ia64p 230 }
227 :     {.mii
228 :     ld8 c[12] = [cura0], stride
229 :     psad1 mean0 = c[0], r0 // get the sum of them ...
230 :     psad1 mean1 = c[1], r0
231 :     }
232 :     {.mmi
233 :     ld8 c[13] = [cura1], stride
234 :     ;;
235 :     ld8 c[14] = [cura0], stride
236 :     psad1 mean2 = c[2], r0
237 :     }
238 :     {.mii
239 :     ld8 c[15] = [cura1], stride
240 :     psad1 mean3 = c[3], r0
241 :     ;;
242 :     psad1 psad[0] = c[4], r0
243 :     }
244 :     {.mmi
245 :     ld8 c[16] = [cura0], stride
246 :     ld8 c[17] = [cura1], stride
247 :     psad1 psad[1] = c[5], r0
248 : ia64p 205 ;;
249 : ia64p 230 }
250 :     {.mii
251 :     ld8 c[18] = [cura0], stride
252 :     psad1 psad[2] = c[6], r0
253 :     psad1 psad[3] = c[7], r0
254 :     }
255 :     {.mmi
256 :     ld8 c[19] = [cura1], stride
257 :     ;;
258 :     ld8 c[20] = [cura0], stride
259 :     psad1 psad[4] = c[8], r0
260 :     }
261 :     {.mii
262 :     ld8 c[21] = [cura1], stride
263 :     psad1 psad[5] = c[9], r0
264 : ia64p 205 ;;
265 : ia64p 230 add mean0 = mean0, psad[0]
266 :     }
267 :     {.mmi
268 :     ld8 c[22] = [cura0], stride
269 :     ld8 c[23] = [cura1], stride
270 :     add mean1 = mean1, psad[1]
271 :     ;;
272 :     }
273 :     {.mii
274 :     ld8 c[24] = [cura0], stride
275 :     psad1 psad[0] = c[10], r0
276 :     psad1 psad[1] = c[11], r0
277 :     }
278 :     {.mmi
279 :     ld8 c[25] = [cura1], stride
280 :     ;;
281 :     ld8 c[26] = [cura0], stride
282 :     add mean2 = mean2, psad[2]
283 :     }
284 :     {.mii
285 :     ld8 c[27] = [cura1], stride
286 :     add mean3 = mean3, psad[3]
287 :     ;;
288 :     psad1 psad[2] = c[12], r0
289 :     }
290 :     {.mmi
291 :     ld8 c[28] = [cura0], stride
292 :     ld8 c[29] = [cura1], stride
293 :     psad1 psad[3] = c[13], r0
294 :     ;;
295 :     }
296 :     {.mii
297 :     ld8 c[30] = [cura0] // last row: no post-increment needed
298 :     psad1 psad[6] = c[14], r0
299 :     psad1 psad[7] = c[15], r0
300 :     }
301 :     {.mmi
302 :     ld8 c[31] = [cura1]
303 :     ;;
304 :     add mean0 = mean0, psad[0]
305 :     add mean1 = mean1, psad[1]
306 :     }
307 :     {.mii
308 :     add mean2 = mean2, psad[4]
309 :     add mean3 = mean3, psad[5]
310 : ia64p 205 ;;
311 : ia64p 230 psad1 psad[0] = c[16], r0
312 :     }
313 :     {.mmi
314 :     add mean0 = mean0, psad[2]
315 :     add mean1 = mean1, psad[3]
316 :     psad1 psad[1] = c[17], r0
317 : ia64p 205 ;;
318 : ia64p 230 }
319 :     {.mii
320 :     add mean2 = mean2, psad[6]
321 :     psad1 psad[2] = c[18], r0
322 :     psad1 psad[3] = c[19], r0
323 :     }
324 :     {.mmi
325 :     add mean3 = mean3, psad[7]
326 :     ;;
327 :     add mean0 = mean0, psad[0]
328 :     psad1 psad[4] = c[20], r0
329 :     }
330 :     {.mii
331 :     add mean1 = mean1, psad[1]
332 :     psad1 psad[5] = c[21], r0
333 : ia64p 205 ;;
334 : ia64p 230 psad1 psad[6] = c[22], r0
335 :     }
336 :     {.mmi
337 :     add mean2 = mean2, psad[2]
338 :     add mean3 = mean3, psad[3]
339 :     psad1 psad[7] = c[23], r0
340 : ia64p 205 ;;
341 : ia64p 230 }
342 :     {.mii
343 :     add mean0 = mean0, psad[4]
344 :     psad1 psad[0] = c[24], r0
345 :     psad1 psad[1] = c[25], r0
346 :     }
347 :     {.mmi
348 :     add mean1 = mean1, psad[5]
349 : ia64p 205 ;;
350 : ia64p 230 add mean2 = mean2, psad[6]
351 :     psad1 psad[2] = c[26], r0
352 :     }
353 :     {.mii
354 :     add mean3 = mean3, psad[7]
355 :     psad1 psad[3] = c[27], r0
356 :     ;;
357 :     psad1 psad[4] = c[28], r0
358 :     }
359 :     {.mmi
360 :     add mean0 = mean0, psad[0]
361 :     add mean1 = mean1, psad[1]
362 :     psad1 psad[5] = c[29], r0
363 : ia64p 205 ;;
364 : ia64p 230 }
365 :     {.mii
366 :     add mean2 = mean2, psad[2]
367 :     psad1 psad[6] = c[30], r0
368 :     psad1 psad[7] = c[31], r0
369 :     }
370 :     {.mmi
371 :     add mean3 = mean3, psad[3]
372 : ia64p 205 ;;
373 : ia64p 230 add mean0 = mean0, psad[4]
374 :     add mean1 = mean1, psad[5]
375 :     }
376 :     {.mbb
377 :     add mean2 = mean2, mean3 // from here on the four partial sums are folded into mean0
378 :     nop.b 1
379 :     nop.b 1
380 : ia64p 205 ;;
381 : ia64p 230 }
382 :     {.mib
383 :     add mean0 = mean0, psad[6]
384 :     add mean1 = mean1, psad[7]
385 :     nop.b 1
386 : ia64p 205 ;;
387 : ia64p 230 }
388 :     {.mib
389 :     add mean0 = mean0, mean1
390 : ia64p 300 // add mean2 = 127, mean2 // this could make our division more exactly, but does not help much
391 : ia64p 205 ;;
392 : ia64p 230 }
393 :     {.mib
394 :     add mean0 = mean0, mean2 // mean0 = sum of all 256 bytes
395 : ia64p 205 ;;
396 : ia64p 230 }
397 :    
398 :     {.mib
399 :     shr.u mean0 = mean0, 8 // divide them ...
400 : ia64p 205 ;;
401 : ia64p 230 }
402 :     {.mib
403 :     mux1 mean0 = mean0, @brcst // replicate the low byte (the mean) into all 8 byte lanes
404 :     ;;
405 :     }
406 :     {.mii
407 :     nop.m 0
408 :     psad1 dev0 = c[0], mean0 // and do a sad again ...
409 :     psad1 dev1 = c[1], mean0
410 :     }
411 :     {.mii
412 :     nop.m 0
413 :     psad1 dev2 = c[2], mean0
414 :     psad1 dev3 = c[3], mean0
415 :     }
416 :     {.mii
417 :     nop.m 0
418 :     psad1 psad[0] = c[4], mean0
419 :     psad1 psad[1] = c[5], mean0
420 :     }
421 :     {.mii
422 :     nop.m 0
423 :     psad1 psad[2] = c[6], mean0
424 :     psad1 psad[3] = c[7], mean0
425 :     }
426 :     {.mii
427 :     nop.m 0
428 :     psad1 psad[4] = c[8], mean0
429 :     psad1 psad[5] = c[9], mean0
430 :     ;;
431 :     }
432 :     {.mii
433 :     add dev0 = dev0, psad[0] // psad[] slots are read here and rewritten later in the same group
434 :     psad1 psad[6] = c[10], mean0
435 :     psad1 psad[7] = c[11], mean0
436 :     }
437 :     {.mmi
438 :     add dev1 = dev1, psad[1]
439 :    
440 :     add dev2 = dev2, psad[2]
441 :     psad1 psad[0] = c[12], mean0
442 :     }
443 :     {.mii
444 :     add dev3 = dev3, psad[3]
445 :     psad1 psad[1] = c[13], mean0
446 :     ;;
447 :     psad1 psad[2] = c[14], mean0
448 :     }
449 :     {.mmi
450 :     add dev0 = dev0, psad[4]
451 :     add dev1 = dev1, psad[5]
452 :     psad1 psad[3] = c[15], mean0
453 :     }
454 :     {.mii
455 :     add dev2 = dev2, psad[6]
456 :     psad1 psad[4] = c[16], mean0
457 :     psad1 psad[5] = c[17], mean0
458 :     }
459 :     {.mmi
460 :     add dev3 = dev3, psad[7]
461 :     ;;
462 :     add dev0 = dev0, psad[0]
463 :     psad1 psad[6] = c[18], mean0
464 :     }
465 :     {.mii
466 :     add dev1 = dev1, psad[1]
467 :     psad1 psad[7] = c[19], mean0
468 :    
469 :     psad1 psad[0] = c[20], mean0
470 :     }
471 :     {.mmi
472 :     add dev2 = dev2, psad[2]
473 :     add dev3 = dev3, psad[3]
474 :     psad1 psad[1] = c[21], mean0
475 : ia64p 205 ;;
476 : ia64p 230 }
477 :     {.mii
478 :     add dev0 = dev0, psad[4]
479 :     psad1 psad[2] = c[22], mean0
480 :     psad1 psad[3] = c[23], mean0
481 :     }
482 :     {.mmi
483 :     add dev1 = dev1, psad[5]
484 :    
485 :     add dev2 = dev2, psad[6]
486 :     psad1 psad[4] = c[24], mean0
487 :     }
488 :     {.mii
489 :     add dev3 = dev3, psad[7]
490 :     psad1 psad[5] = c[25], mean0
491 :     ;;
492 :     psad1 psad[6] = c[26], mean0
493 :     }
494 :     {.mmi
495 :     add dev0 = dev0, psad[0]
496 :     add dev1 = dev1, psad[1]
497 :     psad1 psad[7] = c[27], mean0
498 :     }
499 :     {.mii
500 :     add dev2 = dev2, psad[2]
501 :     psad1 psad[0] = c[28], mean0
502 :     psad1 psad[1] = c[29], mean0
503 :     }
504 :     {.mmi
505 :     add dev3 = dev3, psad[3]
506 : ia64p 205 ;;
507 : ia64p 230 add dev0 = dev0, psad[4]
508 :     psad1 psad[2] = c[30], mean0
509 :     }
510 :     {.mii
511 :     add dev1 = dev1, psad[5]
512 :     psad1 psad[3] = c[31], mean0
513 :     ;;
514 :     add dev2 = dev2, psad[6]
515 :     }
516 :     {.mmi
517 :     add dev3 = dev3, psad[7]
518 :     add dev0 = dev0, psad[0]
519 :     add dev1 = dev1, psad[1]
520 : ia64p 205 ;;
521 : ia64p 230 }
522 :     {.mii
523 :     add dev2 = dev2, psad[2]
524 :     add dev3 = dev3, psad[3]
525 :     add ret0 = dev0, dev1 // start folding the four partial deviations into the return register
526 :     ;;
527 :     }
528 :     {.mib
529 :     add dev2 = dev2, dev3
530 :     nop.i 1
531 :     nop.b 1
532 :     ;;
533 :     }
534 :     {.mib
535 :     add ret0 = ret0, dev2 // ret0 = dev0 + dev1 + dev2 + dev3
536 :     nop.i 1
537 : ia64p 205 br.ret.sptk.many b0
538 : ia64p 230 }
539 : ia64p 205 .endp dev16_ia64#
540 : ia64p 319
541 :    
542 :     // ###########################################################
543 :     // ###########################################################
544 :     // New version by group 01 ###################################
545 :     // ###########################################################
546 :     // ###########################################################
547 :    
548 :    
549 :    
550 :     .text
551 :     .align 16
552 :     .global sad16_ia64#
553 :     .proc sad16_ia64#
    // sad16_ia64
    //   In:  r32 = cur    (16x16 byte block; fetched with plain ld8, so it is
    //                      presumably expected to be 8-byte aligned -- confirm)
    //        r33 = ref    (16x16 byte block, any byte alignment)
    //        r34 = stride (distance between rows, used for both blocks)
    //        r35 = fourth input; never read, its register is reused as scratch
    //   Out: r8  = sum of |cur[i] - ref[i]| over all 256 bytes
    //   The predicate registers are saved in r2 and restored before returning
    //   because the rotating predicates p16-p29 are used as alignment flags.
    //   The previous function state is kept in scratch register r3.  r1 must
    //   not be used for this: r1 is gp, and the Itanium runtime conventions
    //   require gp to be valid again at procedure return.
554 :     sad16_ia64:
555 :     alloc r3 = ar.pfs, 4, 76, 0, 0 // frame r32..r111; r3 (scratch) holds the saved ar.pfs, r1/gp is left untouched
556 :     mov r2 = pr // save predicates, p16.. are overwritten below
557 :     dep r14 = r0, r33, 0, 3 // r14 = (r33 div 8)*8 (aligned version of ref)
558 :     dep.z r31 = r33, 0, 3 // r31 = r33 mod 8 (misalignment of ref)
559 :     ;;
560 :     mov r64 = r34 //(1) calculate multiples of stride
561 :     shl r65 = r34, 1 //(2) for being able to load all the
562 :     shladd r66 = r34, 1, r34 //(3) data at once
563 :     shl r67 = r34, 2 //(4)
564 :     shladd r68 = r34, 2, r34 //(5)
565 :     shl r71 = r34, 3 //(8)
566 :     shladd r72 = r34, 3, r34 //(9)
567 :     ;;
568 :     shl r69 = r66, 1 //(6)
569 :     shladd r70 = r66, 1, r34 //(7)
570 :     shl r73 = r68, 1 //(10)
571 :     shladd r74 = r68, 1, r34 //(11)
572 :     shl r75 = r66, 2 //(12)
573 :     shladd r76 = r66, 2, r34 //(13)
574 :     shladd r77 = r66, 2, r65 //(14)
575 :     shladd r78 = r66, 2, r66 //(15)
576 :     ;;
577 :     cmp.eq p16, p17 = 0, r31 // prepare predicates according to the misalignment
578 :     cmp.eq p18, p19 = 2, r31 // of ref: p16/p24/p18/p26/p20/p28/p22 <=> misalignment 0/1/2/3/4/5/6
579 :     cmp.eq p20, p21 = 4, r31
580 :     cmp.eq p22, p23 = 6, r31
581 :     cmp.eq p24, p25 = 1, r31
582 :     cmp.eq p26, p27 = 3, r31
583 :     cmp.eq p28, p29 = 5, r31
584 :     mov r96 = r14 // and calculate all the addresses where we have
585 :     mov r33 = r32 // to load from: odd r33..r63 = cur rows, r96..r111 = aligned ref rows
586 :     add r97 = r14, r64
587 :     add r35 = r32, r64
588 :     add r98 = r14, r65
589 :     add r37 = r32, r65
590 :     add r99 = r14, r66
591 :     add r39 = r32, r66
592 :     add r100 = r14, r67
593 :     add r41 = r32, r67
594 :     add r101 = r14, r68
595 :     add r43 = r32, r68
596 :     add r102 = r14, r69
597 :     add r45 = r32, r69
598 :     add r103 = r14, r70
599 :     add r47 = r32, r70
600 :     add r104 = r14, r71
601 :     add r49 = r32, r71
602 :     add r105 = r14, r72
603 :     add r51 = r32, r72
604 :     add r106 = r14, r73
605 :     add r53 = r32, r73
606 :     add r107 = r14, r74
607 :     add r55 = r32, r74
608 :     add r108 = r14, r75
609 :     add r57 = r32, r75
610 :     add r109 = r14, r76
611 :     add r59 = r32, r76
612 :     add r110 = r14, r77
613 :     add r61 = r32, r77
614 :     add r111 = r14, r78
615 :     add r63 = r32, r78
616 :     ;;
617 :     ld8 r32 = [r33], 8 // Load all the data which is needed for the sad
618 :     ld8 r34 = [r35], 8 // into registers. The goal is to have the array
619 :     ld8 r36 = [r37], 8 // addressed by cur in the registers r32 - r63 and
620 :     ld8 r38 = [r39], 8 // the array addressed by ref in the registers
621 :     ld8 r40 = [r41], 8 // r64 - r95. The registers r96 - r111 are needed
622 :     ld8 r42 = [r43], 8 // to load the third 8-byte word of the aligned 24 bytes
623 :     ld8 r44 = [r45], 8 // that contain the 16 misaligned bytes of each ref row.
624 :     ld8 r46 = [r47], 8 // After loading we start a preprocessing which
625 :     ld8 r48 = [r49], 8 // guarantees that the data addressed by ref is in
626 :     ld8 r50 = [r51], 8 // the registers r64 - r95.
627 :     ld8 r52 = [r53], 8
628 :     ld8 r54 = [r55], 8
629 :     ld8 r56 = [r57], 8
630 :     ld8 r58 = [r59], 8
631 :     ld8 r60 = [r61], 8
632 :     ld8 r62 = [r63], 8
633 :     ld8 r64 = [r96], 8 // first aligned 8-byte word of every ref row
634 :     ld8 r66 = [r97], 8
635 :     ld8 r68 = [r98], 8
636 :     ld8 r70 = [r99], 8
637 :     ld8 r72 = [r100], 8
638 :     ld8 r74 = [r101], 8
639 :     ld8 r76 = [r102], 8
640 :     ld8 r78 = [r103], 8
641 :     ld8 r80 = [r104], 8
642 :     ld8 r82 = [r105], 8
643 :     ld8 r84 = [r106], 8
644 :     ld8 r86 = [r107], 8
645 :     ld8 r88 = [r108], 8
646 :     ld8 r90 = [r109], 8
647 :     ld8 r92 = [r110], 8
648 :     ld8 r94 = [r111], 8
649 :     ;;
650 :     ld8 r33 = [r33] // second 8 bytes of every cur row (the address register is overwritten by its data)
651 :     ld8 r35 = [r35]
652 :     ld8 r37 = [r37]
653 :     ld8 r39 = [r39]
654 :     ld8 r41 = [r41]
655 :     ld8 r43 = [r43]
656 :     ld8 r45 = [r45]
657 :     ld8 r47 = [r47]
658 :     ld8 r49 = [r49]
659 :     ld8 r51 = [r51]
660 :     ld8 r53 = [r53]
661 :     ld8 r55 = [r55]
662 :     ld8 r57 = [r57]
663 :     ld8 r59 = [r59]
664 :     ld8 r61 = [r61]
665 :     ld8 r63 = [r63]
666 :     ld8 r65 = [r96], 8 // second aligned 8-byte word of every ref row
667 :     ld8 r67 = [r97], 8
668 :     ld8 r69 = [r98], 8
669 :     ld8 r71 = [r99], 8
670 :     ld8 r73 = [r100], 8
671 :     ld8 r75 = [r101], 8
672 :     ld8 r77 = [r102], 8
673 :     ld8 r79 = [r103], 8
674 :     ld8 r81 = [r104], 8
675 :     ld8 r83 = [r105], 8
676 :     ld8 r85 = [r106], 8
677 :     ld8 r87 = [r107], 8
678 :     ld8 r89 = [r108], 8
679 :     ld8 r91 = [r109], 8
680 :     ld8 r93 = [r110], 8
681 :     ld8 r95 = [r111], 8
682 :     (p16) br.cond.dptk.many .Lber // If ref is aligned, everything is loaded and we can start the calculation
683 :     ;;
684 :     ld8 r96 = [r96] // If not, we have to load a third 8-byte word per ref row
685 :     ld8 r97 = [r97]
686 :     ld8 r98 = [r98]
687 :     ld8 r99 = [r99]
688 :     ld8 r100 = [r100]
689 :     ld8 r101 = [r101]
690 :     ld8 r102 = [r102]
691 :     ld8 r103 = [r103]
692 :     ld8 r104 = [r104]
693 :     ld8 r105 = [r105]
694 :     ld8 r106 = [r106]
695 :     ld8 r107 = [r107]
696 :     ld8 r108 = [r108]
697 :     ld8 r109 = [r109]
698 :     ld8 r110 = [r110]
699 :     ld8 r111 = [r111]
700 :     (p24) br.cond.dptk.many .Lmod1 // according to the misalignment, we have
701 :     (p18) br.cond.dpnt.many .Lmod2 // to jump to different preprocessing routines
702 :     (p26) br.cond.dpnt.many .Lmod3
703 :     (p20) br.cond.dpnt.many .Lmod4
704 :     (p28) br.cond.dpnt.many .Lmod5
705 :     (p22) br.cond.dpnt.many .Lmod6
706 :     ;;
707 :     .Lmod7: // misalignment 7: reached by fall-through only, no branch targets this label
708 :     shrp r64 = r65, r64, 56 // in these blocks, we do the preprocessing: each shrp pair
709 :     shrp r65 = r96, r65, 56 // shifts the 24 loaded bytes right by 8*misalignment bits
710 :     shrp r66 = r67, r66, 56
711 :     shrp r67 = r97, r67, 56
712 :     shrp r68 = r69, r68, 56
713 :     shrp r69 = r98, r69, 56
714 :     shrp r70 = r71, r70, 56
715 :     shrp r71 = r99, r71, 56
716 :     shrp r72 = r73, r72, 56
717 :     shrp r73 = r100, r73, 56
718 :     shrp r74 = r75, r74, 56
719 :     shrp r75 = r101, r75, 56
720 :     shrp r76 = r77, r76, 56
721 :     shrp r77 = r102, r77, 56
722 :     shrp r78 = r79, r78, 56
723 :     shrp r79 = r103, r79, 56
724 :     shrp r80 = r81, r80, 56
725 :     shrp r81 = r104, r81, 56
726 :     shrp r82 = r83, r82, 56
727 :     shrp r83 = r105, r83, 56
728 :     shrp r84 = r85, r84, 56
729 :     shrp r85 = r106, r85, 56
730 :     shrp r86 = r87, r86, 56
731 :     shrp r87 = r107, r87, 56
732 :     shrp r88 = r89, r88, 56
733 :     shrp r89 = r108, r89, 56
734 :     shrp r90 = r91, r90, 56
735 :     shrp r91 = r109, r91, 56
736 :     shrp r92 = r93, r92, 56
737 :     shrp r93 = r110, r93, 56
738 :     shrp r94 = r95, r94, 56
739 :     shrp r95 = r111, r95, 56
740 :     br.cond.sptk.many .Lber // and then we jump to the calculation
741 :     ;;
742 :     .Lmod6: // misalignment 6: shift by 48 bits
743 :     shrp r64 = r65, r64, 48
744 :     shrp r65 = r96, r65, 48
745 :     shrp r66 = r67, r66, 48
746 :     shrp r67 = r97, r67, 48
747 :     shrp r68 = r69, r68, 48
748 :     shrp r69 = r98, r69, 48
749 :     shrp r70 = r71, r70, 48
750 :     shrp r71 = r99, r71, 48
751 :     shrp r72 = r73, r72, 48
752 :     shrp r73 = r100, r73, 48
753 :     shrp r74 = r75, r74, 48
754 :     shrp r75 = r101, r75, 48
755 :     shrp r76 = r77, r76, 48
756 :     shrp r77 = r102, r77, 48
757 :     shrp r78 = r79, r78, 48
758 :     shrp r79 = r103, r79, 48
759 :     shrp r80 = r81, r80, 48
760 :     shrp r81 = r104, r81, 48
761 :     shrp r82 = r83, r82, 48
762 :     shrp r83 = r105, r83, 48
763 :     shrp r84 = r85, r84, 48
764 :     shrp r85 = r106, r85, 48
765 :     shrp r86 = r87, r86, 48
766 :     shrp r87 = r107, r87, 48
767 :     shrp r88 = r89, r88, 48
768 :     shrp r89 = r108, r89, 48
769 :     shrp r90 = r91, r90, 48
770 :     shrp r91 = r109, r91, 48
771 :     shrp r92 = r93, r92, 48
772 :     shrp r93 = r110, r93, 48
773 :     shrp r94 = r95, r94, 48
774 :     shrp r95 = r111, r95, 48
775 :     br.cond.sptk.many .Lber
776 :     ;;
777 :     .Lmod5: // misalignment 5: shift by 40 bits
778 :     shrp r64 = r65, r64, 40
779 :     shrp r65 = r96, r65, 40
780 :     shrp r66 = r67, r66, 40
781 :     shrp r67 = r97, r67, 40
782 :     shrp r68 = r69, r68, 40
783 :     shrp r69 = r98, r69, 40
784 :     shrp r70 = r71, r70, 40
785 :     shrp r71 = r99, r71, 40
786 :     shrp r72 = r73, r72, 40
787 :     shrp r73 = r100, r73, 40
788 :     shrp r74 = r75, r74, 40
789 :     shrp r75 = r101, r75, 40
790 :     shrp r76 = r77, r76, 40
791 :     shrp r77 = r102, r77, 40
792 :     shrp r78 = r79, r78, 40
793 :     shrp r79 = r103, r79, 40
794 :     shrp r80 = r81, r80, 40
795 :     shrp r81 = r104, r81, 40
796 :     shrp r82 = r83, r82, 40
797 :     shrp r83 = r105, r83, 40
798 :     shrp r84 = r85, r84, 40
799 :     shrp r85 = r106, r85, 40
800 :     shrp r86 = r87, r86, 40
801 :     shrp r87 = r107, r87, 40
802 :     shrp r88 = r89, r88, 40
803 :     shrp r89 = r108, r89, 40
804 :     shrp r90 = r91, r90, 40
805 :     shrp r91 = r109, r91, 40
806 :     shrp r92 = r93, r92, 40
807 :     shrp r93 = r110, r93, 40
808 :     shrp r94 = r95, r94, 40
809 :     shrp r95 = r111, r95, 40
810 :     br.cond.sptk.many .Lber
811 :     ;;
812 :     .Lmod4: // misalignment 4: shift by 32 bits
813 :     shrp r64 = r65, r64, 32
814 :     shrp r65 = r96, r65, 32
815 :     shrp r66 = r67, r66, 32
816 :     shrp r67 = r97, r67, 32
817 :     shrp r68 = r69, r68, 32
818 :     shrp r69 = r98, r69, 32
819 :     shrp r70 = r71, r70, 32
820 :     shrp r71 = r99, r71, 32
821 :     shrp r72 = r73, r72, 32
822 :     shrp r73 = r100, r73, 32
823 :     shrp r74 = r75, r74, 32
824 :     shrp r75 = r101, r75, 32
825 :     shrp r76 = r77, r76, 32
826 :     shrp r77 = r102, r77, 32
827 :     shrp r78 = r79, r78, 32
828 :     shrp r79 = r103, r79, 32
829 :     shrp r80 = r81, r80, 32
830 :     shrp r81 = r104, r81, 32
831 :     shrp r82 = r83, r82, 32
832 :     shrp r83 = r105, r83, 32
833 :     shrp r84 = r85, r84, 32
834 :     shrp r85 = r106, r85, 32
835 :     shrp r86 = r87, r86, 32
836 :     shrp r87 = r107, r87, 32
837 :     shrp r88 = r89, r88, 32
838 :     shrp r89 = r108, r89, 32
839 :     shrp r90 = r91, r90, 32
840 :     shrp r91 = r109, r91, 32
841 :     shrp r92 = r93, r92, 32
842 :     shrp r93 = r110, r93, 32
843 :     shrp r94 = r95, r94, 32
844 :     shrp r95 = r111, r95, 32
845 :     br.cond.sptk.many .Lber
846 :     ;;
847 :     .Lmod3: // misalignment 3: shift by 24 bits
848 :     shrp r64 = r65, r64, 24
849 :     shrp r65 = r96, r65, 24
850 :     shrp r66 = r67, r66, 24
851 :     shrp r67 = r97, r67, 24
852 :     shrp r68 = r69, r68, 24
853 :     shrp r69 = r98, r69, 24
854 :     shrp r70 = r71, r70, 24
855 :     shrp r71 = r99, r71, 24
856 :     shrp r72 = r73, r72, 24
857 :     shrp r73 = r100, r73, 24
858 :     shrp r74 = r75, r74, 24
859 :     shrp r75 = r101, r75, 24
860 :     shrp r76 = r77, r76, 24
861 :     shrp r77 = r102, r77, 24
862 :     shrp r78 = r79, r78, 24
863 :     shrp r79 = r103, r79, 24
864 :     shrp r80 = r81, r80, 24
865 :     shrp r81 = r104, r81, 24
866 :     shrp r82 = r83, r82, 24
867 :     shrp r83 = r105, r83, 24
868 :     shrp r84 = r85, r84, 24
869 :     shrp r85 = r106, r85, 24
870 :     shrp r86 = r87, r86, 24
871 :     shrp r87 = r107, r87, 24
872 :     shrp r88 = r89, r88, 24
873 :     shrp r89 = r108, r89, 24
874 :     shrp r90 = r91, r90, 24
875 :     shrp r91 = r109, r91, 24
876 :     shrp r92 = r93, r92, 24
877 :     shrp r93 = r110, r93, 24
878 :     shrp r94 = r95, r94, 24
879 :     shrp r95 = r111, r95, 24
880 :     br.cond.sptk.many .Lber
881 :     ;;
882 :     .Lmod2: // misalignment 2: shift by 16 bits
883 :     shrp r64 = r65, r64, 16
884 :     shrp r65 = r96, r65, 16
885 :     shrp r66 = r67, r66, 16
886 :     shrp r67 = r97, r67, 16
887 :     shrp r68 = r69, r68, 16
888 :     shrp r69 = r98, r69, 16
889 :     shrp r70 = r71, r70, 16
890 :     shrp r71 = r99, r71, 16
891 :     shrp r72 = r73, r72, 16
892 :     shrp r73 = r100, r73, 16
893 :     shrp r74 = r75, r74, 16
894 :     shrp r75 = r101, r75, 16
895 :     shrp r76 = r77, r76, 16
896 :     shrp r77 = r102, r77, 16
897 :     shrp r78 = r79, r78, 16
898 :     shrp r79 = r103, r79, 16
899 :     shrp r80 = r81, r80, 16
900 :     shrp r81 = r104, r81, 16
901 :     shrp r82 = r83, r82, 16
902 :     shrp r83 = r105, r83, 16
903 :     shrp r84 = r85, r84, 16
904 :     shrp r85 = r106, r85, 16
905 :     shrp r86 = r87, r86, 16
906 :     shrp r87 = r107, r87, 16
907 :     shrp r88 = r89, r88, 16
908 :     shrp r89 = r108, r89, 16
909 :     shrp r90 = r91, r90, 16
910 :     shrp r91 = r109, r91, 16
911 :     shrp r92 = r93, r92, 16
912 :     shrp r93 = r110, r93, 16
913 :     shrp r94 = r95, r94, 16
914 :     shrp r95 = r111, r95, 16
915 :     br.cond.sptk.many .Lber
916 :     ;;
917 :     .Lmod1: // misalignment 1: shift by 8 bits, then fall through into the calculation
918 :     shrp r64 = r65, r64, 8
919 :     shrp r65 = r96, r65, 8
920 :     shrp r66 = r67, r66, 8
921 :     shrp r67 = r97, r67, 8
922 :     shrp r68 = r69, r68, 8
923 :     shrp r69 = r98, r69, 8
924 :     shrp r70 = r71, r70, 8
925 :     shrp r71 = r99, r71, 8
926 :     shrp r72 = r73, r72, 8
927 :     shrp r73 = r100, r73, 8
928 :     shrp r74 = r75, r74, 8
929 :     shrp r75 = r101, r75, 8
930 :     shrp r76 = r77, r76, 8
931 :     shrp r77 = r102, r77, 8
932 :     shrp r78 = r79, r78, 8
933 :     shrp r79 = r103, r79, 8
934 :     shrp r80 = r81, r80, 8
935 :     shrp r81 = r104, r81, 8
936 :     shrp r82 = r83, r82, 8
937 :     shrp r83 = r105, r83, 8
938 :     shrp r84 = r85, r84, 8
939 :     shrp r85 = r106, r85, 8
940 :     shrp r86 = r87, r86, 8
941 :     shrp r87 = r107, r87, 8
942 :     shrp r88 = r89, r88, 8
943 :     shrp r89 = r108, r89, 8
944 :     shrp r90 = r91, r90, 8
945 :     shrp r91 = r109, r91, 8
946 :     shrp r92 = r93, r92, 8
947 :     shrp r93 = r110, r93, 8
948 :     shrp r94 = r95, r94, 8
949 :     shrp r95 = r111, r95, 8
950 :     .Lber:
951 :     ;;
952 :     psad1 r32 = r32, r64 // Here we do the calculation.
953 :     psad1 r33 = r33, r65 // The machine is providing a fast method
954 :     psad1 r34 = r34, r66 // for calculating sad, so we use it
955 :     psad1 r35 = r35, r67
956 :     psad1 r36 = r36, r68
957 :     psad1 r37 = r37, r69
958 :     psad1 r38 = r38, r70
959 :     psad1 r39 = r39, r71
960 :     psad1 r40 = r40, r72
961 :     psad1 r41 = r41, r73
962 :     psad1 r42 = r42, r74
963 :     psad1 r43 = r43, r75
964 :     psad1 r44 = r44, r76
965 :     psad1 r45 = r45, r77
966 :     psad1 r46 = r46, r78
967 :     psad1 r47 = r47, r79
968 :     psad1 r48 = r48, r80
969 :     psad1 r49 = r49, r81
970 :     psad1 r50 = r50, r82
971 :     psad1 r51 = r51, r83
972 :     psad1 r52 = r52, r84
973 :     psad1 r53 = r53, r85
974 :     psad1 r54 = r54, r86
975 :     psad1 r55 = r55, r87
976 :     psad1 r56 = r56, r88
977 :     psad1 r57 = r57, r89
978 :     psad1 r58 = r58, r90
979 :     psad1 r59 = r59, r91
980 :     psad1 r60 = r60, r92
981 :     psad1 r61 = r61, r93
982 :     psad1 r62 = r62, r94
983 :     psad1 r63 = r63, r95
984 :     ;;
985 :     add r32 = r32, r63 // at last, we have to sum up
986 :     add r33 = r33, r62 // in 5 stages (32 -> 16 -> 8 -> 4 -> 2 -> 1 partial sums)
987 :     add r34 = r34, r61
988 :     add r35 = r35, r60
989 :     add r36 = r36, r59
990 :     add r37 = r37, r58
991 :     add r38 = r38, r57
992 :     add r39 = r39, r56
993 :     add r40 = r40, r55
994 :     add r41 = r41, r54
995 :     add r42 = r42, r53
996 :     add r43 = r43, r52
997 :     add r44 = r44, r51
998 :     add r45 = r45, r50
999 :     add r46 = r46, r49
1000 :     add r47 = r47, r48
1001 :     ;;
1002 :     add r32 = r32, r47
1003 :     add r33 = r33, r46
1004 :     add r34 = r34, r45
1005 :     add r35 = r35, r44
1006 :     add r36 = r36, r43
1007 :     add r37 = r37, r42
1008 :     add r38 = r38, r41
1009 :     add r39 = r39, r40
1010 :     ;;
1011 :     add r32 = r32, r39
1012 :     add r33 = r33, r38
1013 :     add r34 = r34, r37
1014 :     add r35 = r35, r36
1015 :     ;;
1016 :     add r32 = r32, r35
1017 :     add r33 = r33, r34
1018 :     ;;
1019 :     add r8 = r32, r33 // and store the result in r8
1020 :     mov pr = r2, -1 // restore all predicate registers
1021 :     mov ar.pfs = r3 // restore the previous function state saved by alloc
1022 :     br.ret.sptk.many b0
1023 :     .endp sad16_ia64#
1024 :    
1025 :    
1026 :    
1027 :    
1028 :     .align 16 // sad8_ia64: sum of absolute differences between two blocks of 8 rows x 8 bytes
1029 :     .global sad8_ia64# // in:  r32 = cur, r33 = ref (any byte alignment), r34 = stride in bytes
1030 :     .proc sad8_ia64# // out: r8 = sum of the eight per-row psad1 results
1031 :     sad8_ia64: // scratch: r1 (saved ar.pfs), r2 (saved pr), r14, r31 and the locals r35 - r55
1032 :     alloc r1 = ar.pfs, 3, 21, 0, 0 // frame: 3 inputs + 21 locals = r32 - r55; NOTE(review): r1 is gp by convention - confirm callers tolerate the clobber
1033 :     mov r2 = pr // save the predicates, they are restored before returning
1034 :     dep r14 = r0, r33, 0, 3 // calculate aligned version of ref (low 3 bits cleared)
1035 :     dep.z r31 = r33, 0, 3 // calculate misalignment of ref (low 3 bits = 0..7)
1036 :     ;;
1037 :     mov r40 = r34 //(1) calculate multiples of stride; NOTE(review): all 64 bits of r34 are used - confirm callers pass a clean 64-bit value
1038 :     shl r41 = r34, 1 //(2)
1039 :     shladd r42 = r34, 1, r34 //(3)
1040 :     shl r43 = r34, 2 //(4)
1041 :     shladd r44 = r34, 2, r34 //(5)
1042 :     ;;
1043 :     cmp.eq p16, p17 = 0, r31 // set predicates according to the misalignment of ref; p16: ref is aligned
1044 :     cmp.eq p18, p19 = 2, r31 // p18: misalignment == 2
1045 :     shl r45 = r42, 1 //(6) = 2 * (3 * stride)
1046 :     cmp.eq p20, p21 = 4, r31 // p20: misalignment == 4
1047 :     cmp.eq p22, p23 = 6, r31 // p22: misalignment == 6
1048 :     shladd r46 = r42, 1, r34 //(7) = 2 * (3 * stride) + stride
1049 :     cmp.eq p24, p25 = 1, r31 // p24: misalignment == 1
1050 :     cmp.eq p26, p27 = 3, r31 // p26: misalignment == 3
1051 :     cmp.eq p28, p29 = 5, r31 // p28: misalignment == 5 (7 needs no predicate, it is the fall-through case)
1052 :     ;;
1053 :     mov r48 = r14 // calculate memory addresses of the data: r48 - r55 = aligned ref + k * stride
1054 :     add r33 = r32, r40 // r32 - r39 = cur + k * stride (k = 0..7); r33 and r34 are reused,
1055 :     add r49 = r14, r40 // the ref pointer and the stride are not needed any more
1056 :     add r34 = r32, r41
1057 :     add r50 = r14, r41
1058 :     add r35 = r32, r42
1059 :     add r51 = r14, r42
1060 :     add r36 = r32, r43
1061 :     add r52 = r14, r43
1062 :     add r37 = r32, r44
1063 :     add r53 = r14, r44
1064 :     add r38 = r32, r45
1065 :     add r54 = r14, r45
1066 :     add r39 = r32, r46
1067 :     add r55 = r14, r46
1068 :     ;;
1069 :     ld8 r32 = [r32] // load everything; plain ld8 on cur: assumes the cur rows are 8-byte aligned - TODO confirm
1070 :     ld8 r33 = [r33] // cur is located in r32 - r39
1071 :     ld8 r34 = [r34] // ref in r40 - r47
1072 :     ld8 r35 = [r35]
1073 :     ld8 r36 = [r36]
1074 :     ld8 r37 = [r37]
1075 :     ld8 r38 = [r38]
1076 :     ld8 r39 = [r39]
1077 :     ld8 r40 = [r48] ,8 // aligned 8 bytes of each ref row; the address register is post-incremented by 8
1078 :     ld8 r41 = [r49] ,8
1079 :     ld8 r42 = [r50] ,8
1080 :     ld8 r43 = [r51] ,8
1081 :     ld8 r44 = [r52] ,8
1082 :     ld8 r45 = [r53] ,8
1083 :     ld8 r46 = [r54] ,8
1084 :     ld8 r47 = [r55] ,8
1085 :     (p16) br.cond.dptk.many .Lber2 // if ref is aligned, we can start the calculation
1086 :     ;;
1087 :     ld8 r48 = [r48] // if not, we have to load the following 8 bytes of every ref row as well
1088 :     ld8 r49 = [r49] // because of the alignment of ld8
1089 :     ld8 r50 = [r50]
1090 :     ld8 r51 = [r51]
1091 :     ld8 r52 = [r52]
1092 :     ld8 r53 = [r53]
1093 :     ld8 r54 = [r54]
1094 :     ld8 r55 = [r55]
1095 :     (p24) br.cond.dptk.many .Lmode1 // dispatch on the misalignment: 1
1096 :     (p18) br.cond.dpnt.many .Lmode2 // 2
1097 :     (p26) br.cond.dpnt.many .Lmode3 // 3
1098 :     (p20) br.cond.dpnt.many .Lmode4 // 4
1099 :     (p28) br.cond.dpnt.many .Lmode5 // 5
1100 :     (p22) br.cond.dpnt.many .Lmode6 // 6; misalignment 7 falls through to .Lmode7
1101 :     ;;
1102 :     .Lmode7: // this label is not needed as a jump target, it is only here to make the code easier to follow
1103 :     shrp r40 = r48, r40, 56 // here we do some preprocessing on the data: shift the pair r48:r40 right by 56 bits (7 bytes), keep the low 64 bits
1104 :     shrp r41 = r49, r41, 56 // this is because of the alignment problem of ref (little-endian loads assumed)
1105 :     shrp r42 = r50, r42, 56
1106 :     shrp r43 = r51, r43, 56
1107 :     shrp r44 = r52, r44, 56
1108 :     shrp r45 = r53, r45, 56
1109 :     shrp r46 = r54, r46, 56
1110 :     shrp r47 = r55, r47, 56
1111 :     br.cond.sptk.many .Lber2
1112 :     ;;
1113 :     .Lmode6: // misalignment 6: shift each pair right by 48 bits
1114 :     shrp r40 = r48, r40, 48
1115 :     shrp r41 = r49, r41, 48
1116 :     shrp r42 = r50, r42, 48
1117 :     shrp r43 = r51, r43, 48
1118 :     shrp r44 = r52, r44, 48
1119 :     shrp r45 = r53, r45, 48
1120 :     shrp r46 = r54, r46, 48
1121 :     shrp r47 = r55, r47, 48
1122 :     br.cond.sptk.many .Lber2
1123 :     ;;
1124 :     .Lmode5: // misalignment 5: shift each pair right by 40 bits
1125 :     shrp r40 = r48, r40, 40
1126 :     shrp r41 = r49, r41, 40
1127 :     shrp r42 = r50, r42, 40
1128 :     shrp r43 = r51, r43, 40
1129 :     shrp r44 = r52, r44, 40
1130 :     shrp r45 = r53, r45, 40
1131 :     shrp r46 = r54, r46, 40
1132 :     shrp r47 = r55, r47, 40
1133 :     br.cond.sptk.many .Lber2
1134 :     ;;
1135 :     .Lmode4: // misalignment 4: shift each pair right by 32 bits
1136 :     shrp r40 = r48, r40, 32
1137 :     shrp r41 = r49, r41, 32
1138 :     shrp r42 = r50, r42, 32
1139 :     shrp r43 = r51, r43, 32
1140 :     shrp r44 = r52, r44, 32
1141 :     shrp r45 = r53, r45, 32
1142 :     shrp r46 = r54, r46, 32
1143 :     shrp r47 = r55, r47, 32
1144 :     br.cond.sptk.many .Lber2
1145 :     ;;
1146 :     .Lmode3: // misalignment 3: shift each pair right by 24 bits
1147 :     shrp r40 = r48, r40, 24
1148 :     shrp r41 = r49, r41, 24
1149 :     shrp r42 = r50, r42, 24
1150 :     shrp r43 = r51, r43, 24
1151 :     shrp r44 = r52, r44, 24
1152 :     shrp r45 = r53, r45, 24
1153 :     shrp r46 = r54, r46, 24
1154 :     shrp r47 = r55, r47, 24
1155 :     br.cond.sptk.many .Lber2
1156 :     ;;
1157 :     .Lmode2: // misalignment 2: shift each pair right by 16 bits
1158 :     shrp r40 = r48, r40, 16
1159 :     shrp r41 = r49, r41, 16
1160 :     shrp r42 = r50, r42, 16
1161 :     shrp r43 = r51, r43, 16
1162 :     shrp r44 = r52, r44, 16
1163 :     shrp r45 = r53, r45, 16
1164 :     shrp r46 = r54, r46, 16
1165 :     shrp r47 = r55, r47, 16
1166 :     br.cond.sptk.many .Lber2
1167 :     ;;
1168 :     .Lmode1: // misalignment 1: shift each pair right by 8 bits, then fall through to .Lber2
1169 :     shrp r40 = r48, r40, 8
1170 :     shrp r41 = r49, r41, 8
1171 :     shrp r42 = r50, r42, 8
1172 :     shrp r43 = r51, r43, 8
1173 :     shrp r44 = r52, r44, 8
1174 :     shrp r45 = r53, r45, 8
1175 :     shrp r46 = r54, r46, 8
1176 :     shrp r47 = r55, r47, 8
1177 :     .Lber2: // common tail: r32 - r39 hold the cur rows, r40 - r47 the matching ref rows
1178 :     ;;
1179 :     psad1 r32 = r32, r40 // we start calculating sad
1180 :     psad1 r33 = r33, r41 // using the psad1 instruction of IA64 (one row of 8 bytes per instruction)
1181 :     psad1 r34 = r34, r42
1182 :     psad1 r35 = r35, r43
1183 :     psad1 r36 = r36, r44
1184 :     psad1 r37 = r37, r45
1185 :     psad1 r38 = r38, r46
1186 :     psad1 r39 = r39, r47
1187 :     ;;
1188 :     add r32 = r32, r33 // then we sum up everything: 8 row sums -> 4
1189 :     add r33 = r34, r35
1190 :     add r34 = r36, r37
1191 :     add r35 = r38, r39
1192 :     ;;
1193 :     add r32 = r32, r33 // 4 -> 2
1194 :     add r33 = r34, r35
1195 :     ;;
1196 :     add r8 = r32, r33 // and store the result in r8
1197 :     mov pr = r2, -1 // restore all predicates saved at entry
1198 :     mov ar.pfs = r1 // restore ar.pfs saved by alloc
1199 :     br.ret.sptk.many b0
1200 :     .endp sad8_ia64#

No admin address has been configured
ViewVC Help
Powered by ViewVC 1.0.4