[svn] / trunk / xvidcore / src / motion / ia64_asm / sad_ia64.s Repository:
ViewVC logotype

Annotation of /trunk/xvidcore/src/motion/ia64_asm/sad_ia64.s

Parent Directory Parent Directory | Revision Log Revision Log


Revision 430 - (view) (download)

1 : chl 430 // /****************************************************************************
2 :     // *
3 :     // * XVID MPEG-4 VIDEO CODEC
4 :     // * - ia64 sum of absolute differences -
5 :     // *
6 :     // * Copyright(C) 2002 Hannes Jütting <s_juetti@ira.uka.de>
7 :     // * Copyright(C) 2002 Christopher Özbek <s_oezbek@ira.uka.de>
8 :     // *
9 :     // * This program is an implementation of a part of one or more MPEG-4
10 :     // * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
11 :     // * to use this software module in hardware or software products are
12 :     // * advised that its use may infringe existing patents or copyrights, and
13 :     // * any such use would be at such party's own risk. The original
14 :     // * developer of this software module and his/her company, and subsequent
15 :     // * editors and their companies, will have no liability for use of this
16 :     // * software or modifications or derivatives thereof.
17 :     // *
18 :     // * This program is free software; you can redistribute it and/or modify
19 :     // * it under the terms of the GNU General Public License as published by
20 :     // * the Free Software Foundation; either version 2 of the License, or
21 :     // * (at your option) any later version.
22 :     // *
23 :     // * This program is distributed in the hope that it will be useful,
24 :     // * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 :     // * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 :     // * GNU General Public License for more details.
27 :     // *
28 :     // * You should have received a copy of the GNU General Public License
29 :     // * along with this program; if not, write to the Free Software
30 :     // * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31 :     // *
32 :     // ****************************************************************************/
33 :    
34 : ia64p 250 // ------------------------------------------------------------------------------
35 :     // *
36 :     // * Optimized Assembler Versions of sad8 and sad16
37 :     // *
38 :     // ------------------------------------------------------------------------------
39 :     // *
40 :     // * Hannes Jütting and Christopher Özbek
41 :     // * {s_juetti,s_oezbek}@ira.uka.de
42 :     // *
43 :     // * Programmed for the IA64 laboratory held at University Karlsruhe 2002
44 :     // * http://www.info.uni-karlsruhe.de/~rubino/ia64p/
45 :     // *
46 :     // ------------------------------------------------------------------------------
47 :     // *
48 :     // * These are the optimized assembler versions of sad8 and sad16, which calculate
49 :     // * the sum of absolute differences between two 8x8/16x16 block matrices.
50 :     // *
51 :     // * Our approach uses:
52 :     // * - The Itanium instruction psad1, which computes sums of absolute byte differences in hardware.
53 :     // * - Modulo-scheduled loops as the best way to do loop unrolling on the IA64
54 :     // * EPIC architecture
55 :     // * - Alignment resolving to avoid memory faults
56 :     // *
57 :     // ------------------------------------------------------------------------------
58 :    
// ------------------------------------------------------------------------------
// sad16bi_ia64
//
// Walks a 16x16 byte block and accumulates, for every byte position, the
// absolute difference between the byte at [r32] and the rounded average
// ((a + b + 1) >> 1) of the corresponding bytes at [r33] and [r34].
//
// In:   r32, r33, r34 = base addresses of the three byte blocks
//       r35           = distance between rows in bytes (low 32 bits are used)
// Out:  r8            = accumulated sum
// Uses: r2 (holds the caller's ar.lc), r14-r23, p6, p7
//
// Loop structure: 16 rows (counter r23), 8 counted iterations per row
// (ar.lc = 7), two bytes handled per iteration.
// ------------------------------------------------------------------------------
59 : ia64p 205 .common sad16bi#,8,8
60 :     .align 16
61 :     .global sad16bi_ia64#
62 :     .proc sad16bi_ia64#
63 :     sad16bi_ia64:
64 :     .prologue
65 :     .save ar.lc, r2
66 :     mov r2 = ar.lc // keep the caller's loop counter, restored before returning
67 :     .body
68 :     zxt4 r35 = r35 // zero-extend the 32-bit stride to 64 bits
69 :     mov r8 = r0 // r8 = running sum
70 :     mov r23 = r0 // r23 = row counter
71 :     addl r22 = 255, r0 // r22 = upper clamp value for the average
72 :     .L21: // ---- start of one row ----
73 :     addl r14 = 7, r0
74 :     mov r19 = r32 // r19 walks the row of the first block
75 :     mov r21 = r34 // r21 walks the row of the third block
76 :     mov r20 = r33 // r20 walks the row of the second block
77 :     ;;
78 :     mov ar.lc = r14 // 7 -> the counted loop below runs 8 times
79 :     ;;
80 :     .L105: // ---- two bytes per iteration ----
81 :     mov r17 = r20
82 :     mov r18 = r21
83 :     ;;
84 :     ld1 r14 = [r17], 1 // first byte of the pair; pointers advance to the second byte
85 :     ld1 r15 = [r18], 1
86 :     ;;
87 :     add r14 = r14, r15
88 :     ;;
89 :     adds r14 = 1, r14
90 :     ;;
91 :     shr.u r16 = r14, 1 // r16 = (a + b + 1) >> 1
92 :     ;;
93 :     cmp4.le p6, p7 = r0, r16 // p7 when r16 < 0 (signed 32-bit compare)
94 :     ;;
95 :     (p7) mov r16 = r0 // clamp to 0
96 :     (p7) br.cond.dpnt .L96
97 :     ;;
98 :     cmp4.ge p6, p7 = r22, r16 // p7 when r16 > 255
99 :     ;;
100 :     (p7) addl r16 = 255, r0 // clamp to 255
101 :     .L96:
102 :     ld1 r14 = [r19] // byte from the first block
103 :     adds r20 = 2, r20 // advance both averaged-row pointers past this pair
104 :     adds r21 = 2, r21
105 :     ;;
106 :     sub r15 = r14, r16
107 :     ;;
108 :     cmp4.ge p6, p7 = 0, r15 // p6: difference <= 0, p7: difference > 0
109 :     ;;
110 :     (p6) sub r14 = r16, r14 // negate the non-positive difference
111 :     (p7) add r8 = r8, r15
112 :     ;;
113 :     (p6) add r8 = r8, r14 // either way r8 grows by the absolute difference
114 :     ld1 r15 = [r18] // second byte of the pair
115 :     ld1 r14 = [r17]
116 :     ;;
117 :     add r14 = r14, r15
118 :     adds r17 = 1, r19 // r17 = address of the second byte in the first block
119 :     ;;
120 :     adds r14 = 1, r14
121 :     ;;
122 :     shr.u r16 = r14, 1 // rounded average, as above
123 :     ;;
124 :     cmp4.le p6, p7 = r0, r16
125 :     ;;
126 :     (p7) mov r16 = r0
127 :     (p7) br.cond.dpnt .L102
128 :     ;;
129 :     cmp4.ge p6, p7 = r22, r16
130 :     ;;
131 :     (p7) addl r16 = 255, r0
132 :     .L102:
133 :     ld1 r14 = [r17]
134 :     adds r19 = 2, r19 // first-block pointer moves on to the next pair
135 :     ;;
136 :     sub r15 = r14, r16
137 :     ;;
138 :     cmp4.ge p6, p7 = 0, r15
139 :     ;;
140 :     (p7) add r8 = r8, r15
141 :     (p6) sub r14 = r16, r14
142 :     ;;
143 :     (p6) add r8 = r8, r14
144 :     br.cloop.sptk.few .L105 // next pair while ar.lc != 0
145 :     adds r23 = 1, r23 // row finished
146 :     add r32 = r32, r35 // step all three base pointers down one row
147 :     add r33 = r33, r35
148 :     add r34 = r34, r35
149 :     ;;
150 :     cmp4.geu p6, p7 = 15, r23 // continue while the row counter is <= 15 (16 rows)
151 :     (p6) br.cond.dptk .L21
152 :     mov ar.lc = r2 // restore the caller's loop counter
153 :     br.ret.sptk.many b0
154 :     .endp sad16bi_ia64#
155 :    
156 :    
157 : ia64p 230
158 :    
159 :    
160 :    
161 :    
162 :    
// ------------------------------------------------------------------------------
// dev16_ia64
//
// Loads 16 rows of 16 bytes starting at in0 (consecutive rows are in1 bytes
// apart), computes mean = (sum of all 256 bytes) >> 8, and returns the sum of
// |byte - mean| over the whole block.
//
// In:   in0  = address of the block (read with 8-byte loads)
//       in1  = distance between rows in bytes
// Out:  ret0 = sum of absolute deviations from the mean
//
// The code is hand-bundled with explicit stop bits; the order of the psad1/add
// pairs matters because the eight psad[] temporaries are reused continuously.
// NOTE(review): in1 is used as a full 64-bit value; if callers pass a 32-bit
// stride, confirm that its upper bits are already extended.
// ------------------------------------------------------------------------------
163 :     .text
164 : ia64p 205 .align 16
165 :     .global dev16_ia64#
166 :     .proc dev16_ia64#
167 : ia64p 230 .auto
168 : ia64p 205 dev16_ia64:
169 : ia64p 230 // renamings for better readability
170 :     stride = r18
171 :     pfs = r19 //for saving previous function state
172 :     cura0 = r20 //address of first 8-byte block of cur
173 :     cura1 = r21 //address of second 8-byte block of cur
174 :     mean0 = r22 //registers for calculating the sum in parallel
175 :     mean1 = r23
176 :     mean2 = r24
177 :     mean3 = r25
178 :     dev0 = r26 //same for the deviation
179 :     dev1 = r27
180 :     dev2 = r28
181 :     dev3 = r29
182 :    
183 : ia64p 205 .body
184 : ia64p 230 alloc pfs = ar.pfs, 2, 38, 0, 40
185 :    
186 :     mov cura0 = in0
187 :     mov stride = in1
188 :     add cura1 = 8, cura0
189 :    
190 :     .rotr c[32], psad[8] // just using rotating registers to get an array ;-)
191 :    
192 :     .explicit
// ---- Phase 1: load the block into c[0..31] (two 8-byte words per row) and,
// ---- overlapped with the loads, sum all bytes with psad1 against r0.
193 :     {.mmi
194 :     ld8 c[0] = [cura0], stride // load them ...
195 :     ld8 c[1] = [cura1], stride
196 :     ;;
197 :     }
198 :     {.mmi
199 :     ld8 c[2] = [cura0], stride
200 :     ld8 c[3] = [cura1], stride
201 :     ;;
202 :     }
203 :     {.mmi
204 :     ld8 c[4] = [cura0], stride
205 :     ld8 c[5] = [cura1], stride
206 : ia64p 205 ;;
207 : ia64p 230 }
208 :     {.mmi
209 :     ld8 c[6] = [cura0], stride
210 :     ld8 c[7] = [cura1], stride
211 : ia64p 205 ;;
212 : ia64p 230 }
213 :     {.mmi
214 :     ld8 c[8] = [cura0], stride
215 :     ld8 c[9] = [cura1], stride
216 : ia64p 205 ;;
217 : ia64p 230 }
218 :     {.mmi
219 :     ld8 c[10] = [cura0], stride
220 :     ld8 c[11] = [cura1], stride
221 : ia64p 205 ;;
222 : ia64p 230 }
223 :     {.mii
224 :     ld8 c[12] = [cura0], stride
225 :     psad1 mean0 = c[0], r0 // get the sum of them ...
226 :     psad1 mean1 = c[1], r0
227 :     }
228 :     {.mmi
229 :     ld8 c[13] = [cura1], stride
230 :     ;;
231 :     ld8 c[14] = [cura0], stride
232 :     psad1 mean2 = c[2], r0
233 :     }
234 :     {.mii
235 :     ld8 c[15] = [cura1], stride
236 :     psad1 mean3 = c[3], r0
237 :     ;;
238 :     psad1 psad[0] = c[4], r0
239 :     }
240 :     {.mmi
241 :     ld8 c[16] = [cura0], stride
242 :     ld8 c[17] = [cura1], stride
243 :     psad1 psad[1] = c[5], r0
244 : ia64p 205 ;;
245 : ia64p 230 }
246 :     {.mii
247 :     ld8 c[18] = [cura0], stride
248 :     psad1 psad[2] = c[6], r0
249 :     psad1 psad[3] = c[7], r0
250 :     }
251 :     {.mmi
252 :     ld8 c[19] = [cura1], stride
253 :     ;;
254 :     ld8 c[20] = [cura0], stride
255 :     psad1 psad[4] = c[8], r0
256 :     }
257 :     {.mii
258 :     ld8 c[21] = [cura1], stride
259 :     psad1 psad[5] = c[9], r0
260 : ia64p 205 ;;
261 : ia64p 230 add mean0 = mean0, psad[0]
262 :     }
263 :     {.mmi
264 :     ld8 c[22] = [cura0], stride
265 :     ld8 c[23] = [cura1], stride
266 :     add mean1 = mean1, psad[1]
267 :     ;;
268 :     }
269 :     {.mii
270 :     ld8 c[24] = [cura0], stride
271 :     psad1 psad[0] = c[10], r0
272 :     psad1 psad[1] = c[11], r0
273 :     }
274 :     {.mmi
275 :     ld8 c[25] = [cura1], stride
276 :     ;;
277 :     ld8 c[26] = [cura0], stride
278 :     add mean2 = mean2, psad[2]
279 :     }
280 :     {.mii
281 :     ld8 c[27] = [cura1], stride
282 :     add mean3 = mean3, psad[3]
283 :     ;;
284 :     psad1 psad[2] = c[12], r0
285 :     }
286 :     {.mmi
287 :     ld8 c[28] = [cura0], stride
288 :     ld8 c[29] = [cura1], stride
289 :     psad1 psad[3] = c[13], r0
290 :     ;;
291 :     }
292 :     {.mii
293 :     ld8 c[30] = [cura0] // last row: no post-increment needed
294 :     psad1 psad[6] = c[14], r0
295 :     psad1 psad[7] = c[15], r0
296 :     }
297 :     {.mmi
298 :     ld8 c[31] = [cura1]
299 :     ;;
300 :     add mean0 = mean0, psad[0]
301 :     add mean1 = mean1, psad[1]
302 :     }
303 :     {.mii
304 :     add mean2 = mean2, psad[4]
305 :     add mean3 = mean3, psad[5]
306 : ia64p 205 ;;
307 : ia64p 230 psad1 psad[0] = c[16], r0
308 :     }
309 :     {.mmi
310 :     add mean0 = mean0, psad[2]
311 :     add mean1 = mean1, psad[3]
312 :     psad1 psad[1] = c[17], r0
313 : ia64p 205 ;;
314 : ia64p 230 }
315 :     {.mii
316 :     add mean2 = mean2, psad[6]
317 :     psad1 psad[2] = c[18], r0
318 :     psad1 psad[3] = c[19], r0
319 :     }
320 :     {.mmi
321 :     add mean3 = mean3, psad[7]
322 :     ;;
323 :     add mean0 = mean0, psad[0]
324 :     psad1 psad[4] = c[20], r0
325 :     }
326 :     {.mii
327 :     add mean1 = mean1, psad[1]
328 :     psad1 psad[5] = c[21], r0
329 : ia64p 205 ;;
330 : ia64p 230 psad1 psad[6] = c[22], r0
331 :     }
332 :     {.mmi
333 :     add mean2 = mean2, psad[2]
334 :     add mean3 = mean3, psad[3]
335 :     psad1 psad[7] = c[23], r0
336 : ia64p 205 ;;
337 : ia64p 230 }
338 :     {.mii
339 :     add mean0 = mean0, psad[4]
340 :     psad1 psad[0] = c[24], r0
341 :     psad1 psad[1] = c[25], r0
342 :     }
343 :     {.mmi
344 :     add mean1 = mean1, psad[5]
345 : ia64p 205 ;;
346 : ia64p 230 add mean2 = mean2, psad[6]
347 :     psad1 psad[2] = c[26], r0
348 :     }
349 :     {.mii
350 :     add mean3 = mean3, psad[7]
351 :     psad1 psad[3] = c[27], r0
352 :     ;;
353 :     psad1 psad[4] = c[28], r0
354 :     }
355 :     {.mmi
356 :     add mean0 = mean0, psad[0]
357 :     add mean1 = mean1, psad[1]
358 :     psad1 psad[5] = c[29], r0
359 : ia64p 205 ;;
360 : ia64p 230 }
361 :     {.mii
362 :     add mean2 = mean2, psad[2]
363 :     psad1 psad[6] = c[30], r0
364 :     psad1 psad[7] = c[31], r0
365 :     }
366 :     {.mmi
367 :     add mean3 = mean3, psad[3]
368 : ia64p 205 ;;
369 : ia64p 230 add mean0 = mean0, psad[4]
370 :     add mean1 = mean1, psad[5]
371 :     }
// ---- Fold the four partial sums into mean0.
372 :     {.mbb
373 :     add mean2 = mean2, mean3
374 :     nop.b 1
375 :     nop.b 1
376 : ia64p 205 ;;
377 : ia64p 230 }
378 :     {.mib
379 :     add mean0 = mean0, psad[6]
380 :     add mean1 = mean1, psad[7]
381 :     nop.b 1
382 : ia64p 205 ;;
383 : ia64p 230 }
384 :     {.mib
385 :     add mean0 = mean0, mean1
386 : ia64p 300 // add mean2 = 127, mean2 // this could make our division more exact, but does not help much
387 : ia64p 205 ;;
388 : ia64p 230 }
389 :     {.mib
390 :     add mean0 = mean0, mean2
391 : ia64p 205 ;;
392 : ia64p 230 }
393 :    
394 :     {.mib
395 :     shr.u mean0 = mean0, 8 // divide them ... (256 bytes in the block, result is truncated)
396 : ia64p 205 ;;
397 : ia64p 230 }
398 :     {.mib
399 :     mux1 mean0 = mean0, @brcst // copy the mean byte into all eight byte lanes
400 :     ;;
401 :     }
// ---- Phase 2: psad1 of every loaded word against the broadcast mean,
// ---- accumulated into dev0..dev3.
402 :     {.mii
403 :     nop.m 0
404 :     psad1 dev0 = c[0], mean0 // and do a sad again ...
405 :     psad1 dev1 = c[1], mean0
406 :     }
407 :     {.mii
408 :     nop.m 0
409 :     psad1 dev2 = c[2], mean0
410 :     psad1 dev3 = c[3], mean0
411 :     }
412 :     {.mii
413 :     nop.m 0
414 :     psad1 psad[0] = c[4], mean0
415 :     psad1 psad[1] = c[5], mean0
416 :     }
417 :     {.mii
418 :     nop.m 0
419 :     psad1 psad[2] = c[6], mean0
420 :     psad1 psad[3] = c[7], mean0
421 :     }
422 :     {.mii
423 :     nop.m 0
424 :     psad1 psad[4] = c[8], mean0
425 :     psad1 psad[5] = c[9], mean0
426 :     ;;
427 :     }
428 :     {.mii
429 :     add dev0 = dev0, psad[0]
430 :     psad1 psad[6] = c[10], mean0
431 :     psad1 psad[7] = c[11], mean0
432 :     }
433 :     {.mmi
434 :     add dev1 = dev1, psad[1]
435 :    
436 :     add dev2 = dev2, psad[2]
437 :     psad1 psad[0] = c[12], mean0
438 :     }
439 :     {.mii
440 :     add dev3 = dev3, psad[3]
441 :     psad1 psad[1] = c[13], mean0
442 :     ;;
443 :     psad1 psad[2] = c[14], mean0
444 :     }
445 :     {.mmi
446 :     add dev0 = dev0, psad[4]
447 :     add dev1 = dev1, psad[5]
448 :     psad1 psad[3] = c[15], mean0
449 :     }
450 :     {.mii
451 :     add dev2 = dev2, psad[6]
452 :     psad1 psad[4] = c[16], mean0
453 :     psad1 psad[5] = c[17], mean0
454 :     }
455 :     {.mmi
456 :     add dev3 = dev3, psad[7]
457 :     ;;
458 :     add dev0 = dev0, psad[0]
459 :     psad1 psad[6] = c[18], mean0
460 :     }
461 :     {.mii
462 :     add dev1 = dev1, psad[1]
463 :     psad1 psad[7] = c[19], mean0
464 :    
465 :     psad1 psad[0] = c[20], mean0
466 :     }
467 :     {.mmi
468 :     add dev2 = dev2, psad[2]
469 :     add dev3 = dev3, psad[3]
470 :     psad1 psad[1] = c[21], mean0
471 : ia64p 205 ;;
472 : ia64p 230 }
473 :     {.mii
474 :     add dev0 = dev0, psad[4]
475 :     psad1 psad[2] = c[22], mean0
476 :     psad1 psad[3] = c[23], mean0
477 :     }
478 :     {.mmi
479 :     add dev1 = dev1, psad[5]
480 :    
481 :     add dev2 = dev2, psad[6]
482 :     psad1 psad[4] = c[24], mean0
483 :     }
484 :     {.mii
485 :     add dev3 = dev3, psad[7]
486 :     psad1 psad[5] = c[25], mean0
487 :     ;;
488 :     psad1 psad[6] = c[26], mean0
489 :     }
490 :     {.mmi
491 :     add dev0 = dev0, psad[0]
492 :     add dev1 = dev1, psad[1]
493 :     psad1 psad[7] = c[27], mean0
494 :     }
495 :     {.mii
496 :     add dev2 = dev2, psad[2]
497 :     psad1 psad[0] = c[28], mean0
498 :     psad1 psad[1] = c[29], mean0
499 :     }
500 :     {.mmi
501 :     add dev3 = dev3, psad[3]
502 : ia64p 205 ;;
503 : ia64p 230 add dev0 = dev0, psad[4]
504 :     psad1 psad[2] = c[30], mean0
505 :     }
506 :     {.mii
507 :     add dev1 = dev1, psad[5]
508 :     psad1 psad[3] = c[31], mean0
509 :     ;;
510 :     add dev2 = dev2, psad[6]
511 :     }
512 :     {.mmi
513 :     add dev3 = dev3, psad[7]
514 :     add dev0 = dev0, psad[0]
515 :     add dev1 = dev1, psad[1]
516 : ia64p 205 ;;
517 : ia64p 230 }
// ---- Fold dev0..dev3 into the return register.
518 :     {.mii
519 :     add dev2 = dev2, psad[2]
520 :     add dev3 = dev3, psad[3]
521 :     add ret0 = dev0, dev1
522 :     ;;
523 :     }
524 :     {.mib
525 :     add dev2 = dev2, dev3
526 :     nop.i 1
527 :     nop.b 1
528 :     ;;
529 :     }
530 :     {.mib
531 :     add ret0 = ret0, dev2
532 :     nop.i 1
533 : ia64p 205 br.ret.sptk.many b0
534 : ia64p 230 }
535 : ia64p 205 .endp dev16_ia64#
536 : ia64p 319
537 :    
538 :     // ###########################################################
539 :     // ###########################################################
540 :     // New version by group 01 ###################################
541 :     // ###########################################################
542 :     // ###########################################################
543 :    
544 :    
545 :    
// ------------------------------------------------------------------------------
// sad16_ia64
//
// Sum of absolute differences between two 16x16 byte blocks.
//
// In:   r32 = address of the first block ("cur"), read directly with 8-byte loads
//       r33 = address of the second block ("ref"), any byte alignment
//       r34 = distance between rows in bytes
//       r35 = fourth argument; never read (the register is reused as a pointer)
// Out:  r8  = sum of absolute differences
// Uses: r2 (saved pr), r3 (saved ar.pfs), r14, r31, r32-r111, p16-p29
//
// ar.pfs is kept in scratch register r3, not in r1: r1 is the global pointer
// (gp) and has to be intact when this procedure returns.
// NOTE(review): r34 is used as a full 64-bit value; if callers pass a 32-bit
// stride, confirm that its upper bits are already extended (sad16bi_ia64
// zero-extends its stride with zxt4).
// ------------------------------------------------------------------------------
546 :     .text
547 :     .align 16
548 :     .global sad16_ia64#
549 :     .proc sad16_ia64#
550 :     sad16_ia64:
551 :     alloc r3 = ar.pfs, 4, 76, 0, 0 // frame r32-r111; ar.pfs saved in scratch r3 (never clobber gp/r1)
552 :     mov r2 = pr // p16-p29 are written below, so keep the caller's predicates
553 :     dep r14 = r0, r33, 0, 3 // r14 = (r33 div 8)*8 (aligned version of ref)
554 :     dep.z r31 = r33, 0, 3 // r31 = r33 mod 8 (misalignment of ref)
555 :     ;;
556 :     mov r64 = r34 //(1) calculate multiples of stride
557 :     shl r65 = r34, 1 //(2) for being able to load all the
558 :     shladd r66 = r34, 1, r34 //(3) data at once
559 :     shl r67 = r34, 2 //(4)
560 :     shladd r68 = r34, 2, r34 //(5)
561 :     shl r71 = r34, 3 //(8)
562 :     shladd r72 = r34, 3, r34 //(9)
563 :     ;;
564 :     shl r69 = r66, 1 //(6)
565 :     shladd r70 = r66, 1, r34 //(7)
566 :     shl r73 = r68, 1 //(10)
567 :     shladd r74 = r68, 1, r34 //(11)
568 :     shl r75 = r66, 2 //(12)
569 :     shladd r76 = r66, 2, r34 //(13)
570 :     shladd r77 = r66, 2, r65 //(14)
571 :     shladd r78 = r66, 2, r66 //(15)
572 :     ;;
573 :     cmp.eq p16, p17 = 0, r31 // prepare predicates according to the misalignment
574 :     cmp.eq p18, p19 = 2, r31 // of ref
575 :     cmp.eq p20, p21 = 4, r31
576 :     cmp.eq p22, p23 = 6, r31
577 :     cmp.eq p24, p25 = 1, r31
578 :     cmp.eq p26, p27 = 3, r31
579 :     cmp.eq p28, p29 = 5, r31
580 :     mov r96 = r14 // and calculate all the addresses where we have
581 :     mov r33 = r32 // to load from
582 :     add r97 = r14, r64
583 :     add r35 = r32, r64
584 :     add r98 = r14, r65
585 :     add r37 = r32, r65
586 :     add r99 = r14, r66
587 :     add r39 = r32, r66
588 :     add r100 = r14, r67
589 :     add r41 = r32, r67
590 :     add r101 = r14, r68
591 :     add r43 = r32, r68
592 :     add r102 = r14, r69
593 :     add r45 = r32, r69
594 :     add r103 = r14, r70
595 :     add r47 = r32, r70
596 :     add r104 = r14, r71
597 :     add r49 = r32, r71
598 :     add r105 = r14, r72
599 :     add r51 = r32, r72
600 :     add r106 = r14, r73
601 :     add r53 = r32, r73
602 :     add r107 = r14, r74
603 :     add r55 = r32, r74
604 :     add r108 = r14, r75
605 :     add r57 = r32, r75
606 :     add r109 = r14, r76
607 :     add r59 = r32, r76
608 :     add r110 = r14, r77
609 :     add r61 = r32, r77
610 :     add r111 = r14, r78
611 :     add r63 = r32, r78
612 :     ;;
613 :     ld8 r32 = [r33], 8 // Load all the data which is needed for the sad
614 :     ld8 r34 = [r35], 8 // into the registers. The goal is to have the array
615 :     ld8 r36 = [r37], 8 // addressed by cur in the registers r32 - r63 and
616 :     ld8 r38 = [r39], 8 // the array addressed by ref in the registers
617 :     ld8 r40 = [r41], 8 // r64 - r95. The registers r96 - r111 are needed
618 :     ld8 r42 = [r43], 8 // to load the aligned 24 bytes per row in which the
619 :     ld8 r44 = [r45], 8 // needed misaligned 16 bytes must be.
620 :     ld8 r46 = [r47], 8 // After loading we start a preprocessing which
621 :     ld8 r48 = [r49], 8 // guarantees that the data addressed by ref is in
622 :     ld8 r50 = [r51], 8 // the registers r64 - r95.
623 :     ld8 r52 = [r53], 8
624 :     ld8 r54 = [r55], 8
625 :     ld8 r56 = [r57], 8
626 :     ld8 r58 = [r59], 8
627 :     ld8 r60 = [r61], 8
628 :     ld8 r62 = [r63], 8
629 :     ld8 r64 = [r96], 8
630 :     ld8 r66 = [r97], 8
631 :     ld8 r68 = [r98], 8
632 :     ld8 r70 = [r99], 8
633 :     ld8 r72 = [r100], 8
634 :     ld8 r74 = [r101], 8
635 :     ld8 r76 = [r102], 8
636 :     ld8 r78 = [r103], 8
637 :     ld8 r80 = [r104], 8
638 :     ld8 r82 = [r105], 8
639 :     ld8 r84 = [r106], 8
640 :     ld8 r86 = [r107], 8
641 :     ld8 r88 = [r108], 8
642 :     ld8 r90 = [r109], 8
643 :     ld8 r92 = [r110], 8
644 :     ld8 r94 = [r111], 8
645 :     ;;
646 :     ld8 r33 = [r33] // second 8 bytes of every cur row (pointer register becomes data)
647 :     ld8 r35 = [r35]
648 :     ld8 r37 = [r37]
649 :     ld8 r39 = [r39]
650 :     ld8 r41 = [r41]
651 :     ld8 r43 = [r43]
652 :     ld8 r45 = [r45]
653 :     ld8 r47 = [r47]
654 :     ld8 r49 = [r49]
655 :     ld8 r51 = [r51]
656 :     ld8 r53 = [r53]
657 :     ld8 r55 = [r55]
658 :     ld8 r57 = [r57]
659 :     ld8 r59 = [r59]
660 :     ld8 r61 = [r61]
661 :     ld8 r63 = [r63]
662 :     ld8 r65 = [r96], 8 // second aligned 8 bytes of every ref row
663 :     ld8 r67 = [r97], 8
664 :     ld8 r69 = [r98], 8
665 :     ld8 r71 = [r99], 8
666 :     ld8 r73 = [r100], 8
667 :     ld8 r75 = [r101], 8
668 :     ld8 r77 = [r102], 8
669 :     ld8 r79 = [r103], 8
670 :     ld8 r81 = [r104], 8
671 :     ld8 r83 = [r105], 8
672 :     ld8 r85 = [r106], 8
673 :     ld8 r87 = [r107], 8
674 :     ld8 r89 = [r108], 8
675 :     ld8 r91 = [r109], 8
676 :     ld8 r93 = [r110], 8
677 :     ld8 r95 = [r111], 8
678 :     (p16) br.cond.dptk.many .Lber // If ref is aligned, everything is loaded and we can start the calculation
679 :     ;;
680 :     ld8 r96 = [r96] // If not, we have to load a bit more (third aligned 8 bytes per row)
681 :     ld8 r97 = [r97]
682 :     ld8 r98 = [r98]
683 :     ld8 r99 = [r99]
684 :     ld8 r100 = [r100]
685 :     ld8 r101 = [r101]
686 :     ld8 r102 = [r102]
687 :     ld8 r103 = [r103]
688 :     ld8 r104 = [r104]
689 :     ld8 r105 = [r105]
690 :     ld8 r106 = [r106]
691 :     ld8 r107 = [r107]
692 :     ld8 r108 = [r108]
693 :     ld8 r109 = [r109]
694 :     ld8 r110 = [r110]
695 :     ld8 r111 = [r111]
696 :     (p24) br.cond.dptk.many .Lmod1 // according to the misalignment, we have
697 :     (p18) br.cond.dpnt.many .Lmod2 // to jump to different preprocessing routines
698 :     (p26) br.cond.dpnt.many .Lmod3
699 :     (p20) br.cond.dpnt.many .Lmod4
700 :     (p28) br.cond.dpnt.many .Lmod5
701 :     (p22) br.cond.dpnt.many .Lmod6
702 :     ;;
703 :     .Lmod7: // this jump point is not needed (misalignment 7 is the fall-through case)
704 :     shrp r64 = r65, r64, 56 // in these blocks, we do the preprocessing:
705 :     shrp r65 = r96, r65, 56 // each shrp extracts 8 bytes, shifted by the misalignment, from two adjacent words
706 :     shrp r66 = r67, r66, 56
707 :     shrp r67 = r97, r67, 56
708 :     shrp r68 = r69, r68, 56
709 :     shrp r69 = r98, r69, 56
710 :     shrp r70 = r71, r70, 56
711 :     shrp r71 = r99, r71, 56
712 :     shrp r72 = r73, r72, 56
713 :     shrp r73 = r100, r73, 56
714 :     shrp r74 = r75, r74, 56
715 :     shrp r75 = r101, r75, 56
716 :     shrp r76 = r77, r76, 56
717 :     shrp r77 = r102, r77, 56
718 :     shrp r78 = r79, r78, 56
719 :     shrp r79 = r103, r79, 56
720 :     shrp r80 = r81, r80, 56
721 :     shrp r81 = r104, r81, 56
722 :     shrp r82 = r83, r82, 56
723 :     shrp r83 = r105, r83, 56
724 :     shrp r84 = r85, r84, 56
725 :     shrp r85 = r106, r85, 56
726 :     shrp r86 = r87, r86, 56
727 :     shrp r87 = r107, r87, 56
728 :     shrp r88 = r89, r88, 56
729 :     shrp r89 = r108, r89, 56
730 :     shrp r90 = r91, r90, 56
731 :     shrp r91 = r109, r91, 56
732 :     shrp r92 = r93, r92, 56
733 :     shrp r93 = r110, r93, 56
734 :     shrp r94 = r95, r94, 56
735 :     shrp r95 = r111, r95, 56
736 :     br.cond.sptk.many .Lber // and then we jump to the calculation
737 :     ;;
738 :     .Lmod6:
739 :     shrp r64 = r65, r64, 48
740 :     shrp r65 = r96, r65, 48
741 :     shrp r66 = r67, r66, 48
742 :     shrp r67 = r97, r67, 48
743 :     shrp r68 = r69, r68, 48
744 :     shrp r69 = r98, r69, 48
745 :     shrp r70 = r71, r70, 48
746 :     shrp r71 = r99, r71, 48
747 :     shrp r72 = r73, r72, 48
748 :     shrp r73 = r100, r73, 48
749 :     shrp r74 = r75, r74, 48
750 :     shrp r75 = r101, r75, 48
751 :     shrp r76 = r77, r76, 48
752 :     shrp r77 = r102, r77, 48
753 :     shrp r78 = r79, r78, 48
754 :     shrp r79 = r103, r79, 48
755 :     shrp r80 = r81, r80, 48
756 :     shrp r81 = r104, r81, 48
757 :     shrp r82 = r83, r82, 48
758 :     shrp r83 = r105, r83, 48
759 :     shrp r84 = r85, r84, 48
760 :     shrp r85 = r106, r85, 48
761 :     shrp r86 = r87, r86, 48
762 :     shrp r87 = r107, r87, 48
763 :     shrp r88 = r89, r88, 48
764 :     shrp r89 = r108, r89, 48
765 :     shrp r90 = r91, r90, 48
766 :     shrp r91 = r109, r91, 48
767 :     shrp r92 = r93, r92, 48
768 :     shrp r93 = r110, r93, 48
769 :     shrp r94 = r95, r94, 48
770 :     shrp r95 = r111, r95, 48
771 :     br.cond.sptk.many .Lber
772 :     ;;
773 :     .Lmod5:
774 :     shrp r64 = r65, r64, 40
775 :     shrp r65 = r96, r65, 40
776 :     shrp r66 = r67, r66, 40
777 :     shrp r67 = r97, r67, 40
778 :     shrp r68 = r69, r68, 40
779 :     shrp r69 = r98, r69, 40
780 :     shrp r70 = r71, r70, 40
781 :     shrp r71 = r99, r71, 40
782 :     shrp r72 = r73, r72, 40
783 :     shrp r73 = r100, r73, 40
784 :     shrp r74 = r75, r74, 40
785 :     shrp r75 = r101, r75, 40
786 :     shrp r76 = r77, r76, 40
787 :     shrp r77 = r102, r77, 40
788 :     shrp r78 = r79, r78, 40
789 :     shrp r79 = r103, r79, 40
790 :     shrp r80 = r81, r80, 40
791 :     shrp r81 = r104, r81, 40
792 :     shrp r82 = r83, r82, 40
793 :     shrp r83 = r105, r83, 40
794 :     shrp r84 = r85, r84, 40
795 :     shrp r85 = r106, r85, 40
796 :     shrp r86 = r87, r86, 40
797 :     shrp r87 = r107, r87, 40
798 :     shrp r88 = r89, r88, 40
799 :     shrp r89 = r108, r89, 40
800 :     shrp r90 = r91, r90, 40
801 :     shrp r91 = r109, r91, 40
802 :     shrp r92 = r93, r92, 40
803 :     shrp r93 = r110, r93, 40
804 :     shrp r94 = r95, r94, 40
805 :     shrp r95 = r111, r95, 40
806 :     br.cond.sptk.many .Lber
807 :     ;;
808 :     .Lmod4:
809 :     shrp r64 = r65, r64, 32
810 :     shrp r65 = r96, r65, 32
811 :     shrp r66 = r67, r66, 32
812 :     shrp r67 = r97, r67, 32
813 :     shrp r68 = r69, r68, 32
814 :     shrp r69 = r98, r69, 32
815 :     shrp r70 = r71, r70, 32
816 :     shrp r71 = r99, r71, 32
817 :     shrp r72 = r73, r72, 32
818 :     shrp r73 = r100, r73, 32
819 :     shrp r74 = r75, r74, 32
820 :     shrp r75 = r101, r75, 32
821 :     shrp r76 = r77, r76, 32
822 :     shrp r77 = r102, r77, 32
823 :     shrp r78 = r79, r78, 32
824 :     shrp r79 = r103, r79, 32
825 :     shrp r80 = r81, r80, 32
826 :     shrp r81 = r104, r81, 32
827 :     shrp r82 = r83, r82, 32
828 :     shrp r83 = r105, r83, 32
829 :     shrp r84 = r85, r84, 32
830 :     shrp r85 = r106, r85, 32
831 :     shrp r86 = r87, r86, 32
832 :     shrp r87 = r107, r87, 32
833 :     shrp r88 = r89, r88, 32
834 :     shrp r89 = r108, r89, 32
835 :     shrp r90 = r91, r90, 32
836 :     shrp r91 = r109, r91, 32
837 :     shrp r92 = r93, r92, 32
838 :     shrp r93 = r110, r93, 32
839 :     shrp r94 = r95, r94, 32
840 :     shrp r95 = r111, r95, 32
841 :     br.cond.sptk.many .Lber
842 :     ;;
843 :     .Lmod3:
844 :     shrp r64 = r65, r64, 24
845 :     shrp r65 = r96, r65, 24
846 :     shrp r66 = r67, r66, 24
847 :     shrp r67 = r97, r67, 24
848 :     shrp r68 = r69, r68, 24
849 :     shrp r69 = r98, r69, 24
850 :     shrp r70 = r71, r70, 24
851 :     shrp r71 = r99, r71, 24
852 :     shrp r72 = r73, r72, 24
853 :     shrp r73 = r100, r73, 24
854 :     shrp r74 = r75, r74, 24
855 :     shrp r75 = r101, r75, 24
856 :     shrp r76 = r77, r76, 24
857 :     shrp r77 = r102, r77, 24
858 :     shrp r78 = r79, r78, 24
859 :     shrp r79 = r103, r79, 24
860 :     shrp r80 = r81, r80, 24
861 :     shrp r81 = r104, r81, 24
862 :     shrp r82 = r83, r82, 24
863 :     shrp r83 = r105, r83, 24
864 :     shrp r84 = r85, r84, 24
865 :     shrp r85 = r106, r85, 24
866 :     shrp r86 = r87, r86, 24
867 :     shrp r87 = r107, r87, 24
868 :     shrp r88 = r89, r88, 24
869 :     shrp r89 = r108, r89, 24
870 :     shrp r90 = r91, r90, 24
871 :     shrp r91 = r109, r91, 24
872 :     shrp r92 = r93, r92, 24
873 :     shrp r93 = r110, r93, 24
874 :     shrp r94 = r95, r94, 24
875 :     shrp r95 = r111, r95, 24
876 :     br.cond.sptk.many .Lber
877 :     ;;
878 :     .Lmod2:
879 :     shrp r64 = r65, r64, 16
880 :     shrp r65 = r96, r65, 16
881 :     shrp r66 = r67, r66, 16
882 :     shrp r67 = r97, r67, 16
883 :     shrp r68 = r69, r68, 16
884 :     shrp r69 = r98, r69, 16
885 :     shrp r70 = r71, r70, 16
886 :     shrp r71 = r99, r71, 16
887 :     shrp r72 = r73, r72, 16
888 :     shrp r73 = r100, r73, 16
889 :     shrp r74 = r75, r74, 16
890 :     shrp r75 = r101, r75, 16
891 :     shrp r76 = r77, r76, 16
892 :     shrp r77 = r102, r77, 16
893 :     shrp r78 = r79, r78, 16
894 :     shrp r79 = r103, r79, 16
895 :     shrp r80 = r81, r80, 16
896 :     shrp r81 = r104, r81, 16
897 :     shrp r82 = r83, r82, 16
898 :     shrp r83 = r105, r83, 16
899 :     shrp r84 = r85, r84, 16
900 :     shrp r85 = r106, r85, 16
901 :     shrp r86 = r87, r86, 16
902 :     shrp r87 = r107, r87, 16
903 :     shrp r88 = r89, r88, 16
904 :     shrp r89 = r108, r89, 16
905 :     shrp r90 = r91, r90, 16
906 :     shrp r91 = r109, r91, 16
907 :     shrp r92 = r93, r92, 16
908 :     shrp r93 = r110, r93, 16
909 :     shrp r94 = r95, r94, 16
910 :     shrp r95 = r111, r95, 16
911 :     br.cond.sptk.many .Lber
912 :     ;;
913 :     .Lmod1:
914 :     shrp r64 = r65, r64, 8
915 :     shrp r65 = r96, r65, 8
916 :     shrp r66 = r67, r66, 8
917 :     shrp r67 = r97, r67, 8
918 :     shrp r68 = r69, r68, 8
919 :     shrp r69 = r98, r69, 8
920 :     shrp r70 = r71, r70, 8
921 :     shrp r71 = r99, r71, 8
922 :     shrp r72 = r73, r72, 8
923 :     shrp r73 = r100, r73, 8
924 :     shrp r74 = r75, r74, 8
925 :     shrp r75 = r101, r75, 8
926 :     shrp r76 = r77, r76, 8
927 :     shrp r77 = r102, r77, 8
928 :     shrp r78 = r79, r78, 8
929 :     shrp r79 = r103, r79, 8
930 :     shrp r80 = r81, r80, 8
931 :     shrp r81 = r104, r81, 8
932 :     shrp r82 = r83, r82, 8
933 :     shrp r83 = r105, r83, 8
934 :     shrp r84 = r85, r84, 8
935 :     shrp r85 = r106, r85, 8
936 :     shrp r86 = r87, r86, 8
937 :     shrp r87 = r107, r87, 8
938 :     shrp r88 = r89, r88, 8
939 :     shrp r89 = r108, r89, 8
940 :     shrp r90 = r91, r90, 8
941 :     shrp r91 = r109, r91, 8
942 :     shrp r92 = r93, r92, 8
943 :     shrp r93 = r110, r93, 8
944 :     shrp r94 = r95, r94, 8
945 :     shrp r95 = r111, r95, 8
946 :     .Lber:
947 :     ;;
948 :     psad1 r32 = r32, r64 // Here we do the calculation.
949 :     psad1 r33 = r33, r65 // The machine is providing a fast method
950 :     psad1 r34 = r34, r66 // for calculating sad, so we use it
951 :     psad1 r35 = r35, r67
952 :     psad1 r36 = r36, r68
953 :     psad1 r37 = r37, r69
954 :     psad1 r38 = r38, r70
955 :     psad1 r39 = r39, r71
956 :     psad1 r40 = r40, r72
957 :     psad1 r41 = r41, r73
958 :     psad1 r42 = r42, r74
959 :     psad1 r43 = r43, r75
960 :     psad1 r44 = r44, r76
961 :     psad1 r45 = r45, r77
962 :     psad1 r46 = r46, r78
963 :     psad1 r47 = r47, r79
964 :     psad1 r48 = r48, r80
965 :     psad1 r49 = r49, r81
966 :     psad1 r50 = r50, r82
967 :     psad1 r51 = r51, r83
968 :     psad1 r52 = r52, r84
969 :     psad1 r53 = r53, r85
970 :     psad1 r54 = r54, r86
971 :     psad1 r55 = r55, r87
972 :     psad1 r56 = r56, r88
973 :     psad1 r57 = r57, r89
974 :     psad1 r58 = r58, r90
975 :     psad1 r59 = r59, r91
976 :     psad1 r60 = r60, r92
977 :     psad1 r61 = r61, r93
978 :     psad1 r62 = r62, r94
979 :     psad1 r63 = r63, r95
980 :     ;;
981 :     add r32 = r32, r63 // at last, we have to sum up
982 :     add r33 = r33, r62 // in 5 stages (32 -> 16 -> 8 -> 4 -> 2 -> 1 partial sums)
983 :     add r34 = r34, r61
984 :     add r35 = r35, r60
985 :     add r36 = r36, r59
986 :     add r37 = r37, r58
987 :     add r38 = r38, r57
988 :     add r39 = r39, r56
989 :     add r40 = r40, r55
990 :     add r41 = r41, r54
991 :     add r42 = r42, r53
992 :     add r43 = r43, r52
993 :     add r44 = r44, r51
994 :     add r45 = r45, r50
995 :     add r46 = r46, r49
996 :     add r47 = r47, r48
997 :     ;;
998 :     add r32 = r32, r47
999 :     add r33 = r33, r46
1000 :     add r34 = r34, r45
1001 :     add r35 = r35, r44
1002 :     add r36 = r36, r43
1003 :     add r37 = r37, r42
1004 :     add r38 = r38, r41
1005 :     add r39 = r39, r40
1006 :     ;;
1007 :     add r32 = r32, r39
1008 :     add r33 = r33, r38
1009 :     add r34 = r34, r37
1010 :     add r35 = r35, r36
1011 :     ;;
1012 :     add r32 = r32, r35
1013 :     add r33 = r33, r34
1014 :     ;;
1015 :     add r8 = r32, r33 // and store the result in r8
1016 :     mov pr = r2, -1 // restore the caller's predicate registers
1017 :     mov ar.pfs = r3 // write back the ar.pfs value saved at entry
1018 :     br.ret.sptk.many b0
1019 :     .endp sad16_ia64#
1020 :    
1021 :    
1022 :    
1023 :    
// ---------------------------------------------------------------------------
// sad8_ia64 - sum of absolute differences over an 8x8 block of bytes
//
// In:   r32 = cur     rows are fetched with plain ld8 and get no alignment
//                     fix-up, so cur + k*stride is expected to be 8-byte aligned
//       r33 = ref     any byte alignment; the misalignment of row 0 is measured
//                     once and reused for all rows, so stride is expected to be
//                     a multiple of 8
//       r34 = stride  byte distance between consecutive rows of cur and of ref
//                     (used as a full 64-bit value -- NOTE(review): confirm the
//                     caller passes it extended to 64 bits)
// Out:  r8  = psad1(cur row, ref row) summed over the 8 rows
//
// Scratch: r2 (saved pr), r14, r15 (saved ar.pfs), r31, r32-r55, p16-p29.
// pr is saved and restored because p16-p29 are overwritten here.
//
// ar.pfs is parked in r15 and not in r1: r1 is the global pointer (gp), which
// has to be valid for this procedure again when it returns, and nothing here
// would restore it.
// ---------------------------------------------------------------------------
	.align 16
	.global sad8_ia64#
	.proc sad8_ia64#
sad8_ia64:
	alloc r15 = ar.pfs, 3, 21, 0, 0		// 3 inputs + 21 locals = r32-r55
	mov r2 = pr				// keep the caller's predicates
	dep r14 = r0, r33, 0, 3			// r14 = ref with its low 3 bits cleared
	dep.z r31 = r33, 0, 3			// r31 = ref & 7 (misalignment in bytes)
	;;
	mov r40 = r34				// r40 = 1 * stride
	shl r41 = r34, 1			// r41 = 2 * stride
	shladd r42 = r34, 1, r34		// r42 = 3 * stride
	shl r43 = r34, 2			// r43 = 4 * stride
	shladd r44 = r34, 2, r34		// r44 = 5 * stride
	;;
	cmp.eq p16, p17 = 0, r31		// p16: ref is aligned
	cmp.eq p18, p19 = 2, r31		// p18: misaligned by 2
	shl r45 = r42, 1			// r45 = 6 * stride
	cmp.eq p20, p21 = 4, r31		// p20: misaligned by 4
	cmp.eq p22, p23 = 6, r31		// p22: misaligned by 6
	shladd r46 = r42, 1, r34		// r46 = 7 * stride
	cmp.eq p24, p25 = 1, r31		// p24: misaligned by 1
	cmp.eq p26, p27 = 3, r31		// p26: misaligned by 3
	cmp.eq p28, p29 = 5, r31		// p28: misaligned by 5
	;;					// (misaligned by 7 is the fall-through case)

	// Row addresses: r32-r39 = cur + k*stride, r48-r55 = r14 + k*stride, k = 0..7.
	// r33/r34 (ref, stride) are overwritten here; both are no longer needed.
	mov r48 = r14
	add r33 = r32, r40
	add r49 = r14, r40
	add r34 = r32, r41
	add r50 = r14, r41
	add r35 = r32, r42
	add r51 = r14, r42
	add r36 = r32, r43
	add r52 = r14, r43
	add r37 = r32, r44
	add r53 = r14, r44
	add r38 = r32, r45
	add r54 = r14, r45
	add r39 = r32, r46
	add r55 = r14, r46
	;;

	// Load everything: the cur rows end up in r32-r39, the ref rows in r40-r47.
	ld8 r32 = [r32]
	ld8 r33 = [r33]
	ld8 r34 = [r34]
	ld8 r35 = [r35]
	ld8 r36 = [r36]
	ld8 r37 = [r37]
	ld8 r38 = [r38]
	ld8 r39 = [r39]
	ld8 r40 = [r48], 8			// aligned word holding the start of each ref
	ld8 r41 = [r49], 8			// row; the pointer is advanced by 8 so that
	ld8 r42 = [r50], 8			// it addresses the following word
	ld8 r43 = [r51], 8
	ld8 r44 = [r52], 8
	ld8 r45 = [r53], 8
	ld8 r46 = [r54], 8
	ld8 r47 = [r55], 8
(p16)	br.cond.dptk.many .Lber2		// ref aligned: r40-r47 are already the rows
	;;

	// ref is misaligned: each row straddles two aligned words, so fetch the
	// second word of every row as well (into r48-r55).
	ld8 r48 = [r48]
	ld8 r49 = [r49]
	ld8 r50 = [r50]
	ld8 r51 = [r51]
	ld8 r52 = [r52]
	ld8 r53 = [r53]
	ld8 r54 = [r54]
	ld8 r55 = [r55]
(p24)	br.cond.dptk.many .Lmode1
(p18)	br.cond.dpnt.many .Lmode2
(p26)	br.cond.dpnt.many .Lmode3
(p20)	br.cond.dpnt.many .Lmode4
(p28)	br.cond.dpnt.many .Lmode5
(p22)	br.cond.dpnt.many .Lmode6
	;;

	// .LmodeN: ref is misaligned by N bytes. shrp shifts the pair
	// (second word : first word) right by 8*N bits and keeps the low 64 bits,
	// which yields the 8 bytes that start at the row's real address
	// (presumably relies on little-endian loads -- TODO confirm).
.Lmode7:					// label is not a jump target; it only names the case
	shrp r40 = r48, r40, 56
	shrp r41 = r49, r41, 56
	shrp r42 = r50, r42, 56
	shrp r43 = r51, r43, 56
	shrp r44 = r52, r44, 56
	shrp r45 = r53, r45, 56
	shrp r46 = r54, r46, 56
	shrp r47 = r55, r47, 56
	br.cond.sptk.many .Lber2
	;;
.Lmode6:
	shrp r40 = r48, r40, 48
	shrp r41 = r49, r41, 48
	shrp r42 = r50, r42, 48
	shrp r43 = r51, r43, 48
	shrp r44 = r52, r44, 48
	shrp r45 = r53, r45, 48
	shrp r46 = r54, r46, 48
	shrp r47 = r55, r47, 48
	br.cond.sptk.many .Lber2
	;;
.Lmode5:
	shrp r40 = r48, r40, 40
	shrp r41 = r49, r41, 40
	shrp r42 = r50, r42, 40
	shrp r43 = r51, r43, 40
	shrp r44 = r52, r44, 40
	shrp r45 = r53, r45, 40
	shrp r46 = r54, r46, 40
	shrp r47 = r55, r47, 40
	br.cond.sptk.many .Lber2
	;;
.Lmode4:
	shrp r40 = r48, r40, 32
	shrp r41 = r49, r41, 32
	shrp r42 = r50, r42, 32
	shrp r43 = r51, r43, 32
	shrp r44 = r52, r44, 32
	shrp r45 = r53, r45, 32
	shrp r46 = r54, r46, 32
	shrp r47 = r55, r47, 32
	br.cond.sptk.many .Lber2
	;;
.Lmode3:
	shrp r40 = r48, r40, 24
	shrp r41 = r49, r41, 24
	shrp r42 = r50, r42, 24
	shrp r43 = r51, r43, 24
	shrp r44 = r52, r44, 24
	shrp r45 = r53, r45, 24
	shrp r46 = r54, r46, 24
	shrp r47 = r55, r47, 24
	br.cond.sptk.many .Lber2
	;;
.Lmode2:
	shrp r40 = r48, r40, 16
	shrp r41 = r49, r41, 16
	shrp r42 = r50, r42, 16
	shrp r43 = r51, r43, 16
	shrp r44 = r52, r44, 16
	shrp r45 = r53, r45, 16
	shrp r46 = r54, r46, 16
	shrp r47 = r55, r47, 16
	br.cond.sptk.many .Lber2
	;;
.Lmode1:					// last case: falls through into .Lber2
	shrp r40 = r48, r40, 8
	shrp r41 = r49, r41, 8
	shrp r42 = r50, r42, 8
	shrp r43 = r51, r43, 8
	shrp r44 = r52, r44, 8
	shrp r45 = r53, r45, 8
	shrp r46 = r54, r46, 8
	shrp r47 = r55, r47, 8
.Lber2:
	;;
	psad1 r32 = r32, r40			// per-row SAD of 8 bytes, one psad1 per row
	psad1 r33 = r33, r41
	psad1 r34 = r34, r42
	psad1 r35 = r35, r43
	psad1 r36 = r36, r44
	psad1 r37 = r37, r45
	psad1 r38 = r38, r46
	psad1 r39 = r39, r47
	;;
	// Reduce the 8 row sums pairwise: 8 -> 4 -> 2 -> 1. Some registers are
	// read and then overwritten inside one instruction group; the order of
	// the adds within each group is therefore significant.
	add r32 = r32, r33
	add r33 = r34, r35
	add r34 = r36, r37
	add r35 = r38, r39
	;;
	add r32 = r32, r33
	add r33 = r34, r35
	;;
	add r8 = r32, r33			// return value
	mov pr = r2, -1				// restore the caller's predicates
	mov ar.pfs = r15			// put back the value alloc handed out
	br.ret.sptk.many b0
	.endp sad8_ia64#

No admin address has been configured
ViewVC Help
Powered by ViewVC 1.0.4