[svn] / trunk / xvidcore / src / motion / ia64_asm / sad_ia64.s Repository:
ViewVC logotype

Annotation of /trunk/xvidcore/src/motion/ia64_asm/sad_ia64.s

Parent Directory Parent Directory | Revision Log Revision Log


Revision 319 - (view) (download)

1 : ia64p 250 // ------------------------------------------------------------------------------
2 :     // *
3 :     // * Optimized Assembler Versions of sad8 and sad16
4 :     // *
5 :     // ------------------------------------------------------------------------------
6 :     // *
7 :     // * Hannes Jütting and Christopher Özbek
8 :     // * {s_juetti,s_oezbek}@ira.uka.de
9 :     // *
10 :     // * Programmed for the IA64 laboratory held at University Karlsruhe 2002
11 :     // * http://www.info.uni-karlsruhe.de/~rubino/ia64p/
12 :     // *
13 :     // ------------------------------------------------------------------------------
14 :     // *
15 :     // * These are the optimized assembler versions of sad8 and sad16, which calculate
16 :     // * the sum of absolute differences between two 8x8/16x16 block matrices.
17 :     // *
18 :     // * Our approach uses:
19 :     // * - The Itanium command psad1, which solves the problem in hardware.
20 :     // * - Modulo-scheduled loops as the best way to do loop unrolling on the IA64
21 :     // * EPIC architecture
22 :     // * - Alignment resolving to avoid memory faults
23 :     // *
24 :     // ------------------------------------------------------------------------------
25 :    
26 :    
27 : ia64p 205
28 : ia64p 250
// sad16bi_ia64 -- SAD of a 16x16 byte block against the rounded average of two other blocks.
// As used below: r32 = block that is compared against the average,
//                r33 / r34 = the two blocks that are averaged byte by byte,
//                r35 = row stride in bytes (only the low 32 bits are used, see zxt4).
// Result: r8 = sum over 16 rows x 16 bytes of | [r32] - (([r33] + [r34] + 1) >> 1) |.
// ar.lc is saved in r2 on entry and restored before the return.
29 : ia64p 205 .common sad16bi#,8,8
30 :     .align 16
31 :     .global sad16bi_ia64#
32 :     .proc sad16bi_ia64#
33 :     sad16bi_ia64:
34 :     .prologue
35 :     .save ar.lc, r2
36 :     mov r2 = ar.lc // keep the caller's loop counter
37 :     .body
38 :     zxt4 r35 = r35 // zero-extend the 32-bit stride
39 :     mov r8 = r0 // r8 = running SAD
40 :     mov r23 = r0 // r23 = row counter
41 :     addl r22 = 255, r0 // r22 = upper clamp bound
42 :     .L21: // ---- start of one row ----
43 :     addl r14 = 7, r0
44 :     mov r19 = r32 // r19 walks the compared block
45 :     mov r21 = r34 // r21 / r20 walk the two averaged blocks
46 :     mov r20 = r33
47 :     ;;
48 :     mov ar.lc = r14 // lc = 7 -> 8 inner iterations, two bytes each
49 :     ;;
50 :     .L105: // ---- inner loop: bytes i and i+1 of the row ----
51 :     mov r17 = r20
52 :     mov r18 = r21
53 :     ;;
54 :     ld1 r14 = [r17], 1 // byte i of both averaged blocks; pointers move on to i+1
55 :     ld1 r15 = [r18], 1
56 :     ;;
57 :     add r14 = r14, r15
58 :     ;;
59 :     adds r14 = 1, r14 // +1 for rounding
60 :     ;;
61 :     shr.u r16 = r14, 1 // r16 = (a + b + 1) >> 1
62 :     ;;
63 :     cmp4.le p6, p7 = r0, r16 // p7 = average < 0 (signed 32-bit compare)
64 :     ;;
65 :     (p7) mov r16 = r0 // clamp to 0
66 :     (p7) br.cond.dpnt .L96
67 :     ;;
68 :     cmp4.ge p6, p7 = r22, r16 // p7 = average > 255
69 :     ;;
70 :     (p7) addl r16 = 255, r0 // clamp to 255
71 :     .L96: // NOTE(review): ld1 zero-extends, so the average should already be in 0..255 and neither clamp is expected to fire
72 :     ld1 r14 = [r19] // byte i of the compared block
73 :     adds r20 = 2, r20 // advance both averaged-block pointers by the two bytes of this iteration
74 :     adds r21 = 2, r21
75 :     ;;
76 :     sub r15 = r14, r16 // r15 = compared - average
77 :     ;;
78 :     cmp4.ge p6, p7 = 0, r15 // p6: difference <= 0, p7: difference > 0
79 :     ;;
80 :     (p6) sub r14 = r16, r14 // negated difference
81 :     (p7) add r8 = r8, r15
82 :     ;;
83 :     (p6) add r8 = r8, r14 // accumulate |difference|
84 :     ld1 r15 = [r18] // byte i+1 of both averaged blocks
85 :     ld1 r14 = [r17]
86 :     ;;
87 :     add r14 = r14, r15
88 :     adds r17 = 1, r19 // r17 -> byte i+1 of the compared block
89 :     ;;
90 :     adds r14 = 1, r14
91 :     ;;
92 :     shr.u r16 = r14, 1 // rounded average of byte i+1
93 :     ;;
94 :     cmp4.le p6, p7 = r0, r16 // same clamp sequence as above
95 :     ;;
96 :     (p7) mov r16 = r0
97 :     (p7) br.cond.dpnt .L102
98 :     ;;
99 :     cmp4.ge p6, p7 = r22, r16
100 :     ;;
101 :     (p7) addl r16 = 255, r0
102 :     .L102:
103 :     ld1 r14 = [r17] // byte i+1 of the compared block
104 :     adds r19 = 2, r19
105 :     ;;
106 :     sub r15 = r14, r16
107 :     ;;
108 :     cmp4.ge p6, p7 = 0, r15
109 :     ;;
110 :     (p7) add r8 = r8, r15
111 :     (p6) sub r14 = r16, r14
112 :     ;;
113 :     (p6) add r8 = r8, r14
114 :     br.cloop.sptk.few .L105 // next byte pair while ar.lc != 0
115 :     adds r23 = 1, r23 // row finished
116 :     add r32 = r32, r35 // step all three block pointers to the next row
117 :     add r33 = r33, r35
118 :     add r34 = r34, r35
119 :     ;;
120 :     cmp4.geu p6, p7 = 15, r23 // p6 while the row counter is still <= 15, i.e. 16 rows in total
121 :     (p6) br.cond.dptk .L21
122 :     mov ar.lc = r2 // restore the caller's loop counter
123 :     br.ret.sptk.many b0
124 :     .endp sad16bi_ia64#
125 :    
126 :    
127 : ia64p 230
128 :    
129 :    
130 :    
131 :    
132 :    
// dev16_ia64 -- deviation of a 16x16 byte block from its own mean.
// in0 = address of the block, in1 = row stride in bytes.
// Phase 1: load the 16 rows as 32 8-byte words into c[0..31] and sum all bytes
//          (psad1 against r0 yields the byte sum of one word); mean = sum >> 8.
// Phase 2: replicate the mean byte into all 8 byte lanes and sum psad1(c[i], mean).
// Result: ret0 = sum of |byte - mean| over the 256 bytes.
// Each row is read with two ld8 (offset 0 and offset 8 of the row address).
133 :     .text
134 : ia64p 205 .align 16
135 :     .global dev16_ia64#
136 :     .proc dev16_ia64#
137 : ia64p 230 .auto
138 : ia64p 205 dev16_ia64:
139 : ia64p 230 // register names for better readability
140 :     stride = r18
141 :     pfs = r19 // receives the previous function state from alloc
142 :     cura0 = r20 //address of first 8-byte block of cur
143 :     cura1 = r21 //address of second 8-byte block of cur
144 :     mean0 = r22 //registers for calculating the sum in parallel
145 :     mean1 = r23
146 :     mean2 = r24
147 :     mean3 = r25
148 :     dev0 = r26 //same for the deviation
149 :     dev1 = r27
150 :     dev2 = r28
151 :     dev3 = r29
152 :    
153 : ia64p 205 .body
154 : ia64p 230 alloc pfs = ar.pfs, 2, 38, 0, 40 // 2 inputs + 38 locals = 40 stacked registers, all declared rotating
155 :    
156 :     mov cura0 = in0 // copy the inputs out first: c[0] and c[1] below occupy the same registers as in0 and in1
157 :     mov stride = in1
158 :     add cura1 = 8, cura0
159 :    
160 :     .rotr c[32], psad[8] // just using rotating registers to get an array ;-)
161 :    
162 :     .explicit
163 :     {.mmi
164 :     ld8 c[0] = [cura0], stride // load one row (two 8-byte words) and step both pointers to the next row
165 :     ld8 c[1] = [cura1], stride
166 :     ;;
167 :     }
168 :     {.mmi
169 :     ld8 c[2] = [cura0], stride
170 :     ld8 c[3] = [cura1], stride
171 :     ;;
172 :     }
173 :     {.mmi
174 :     ld8 c[4] = [cura0], stride
175 :     ld8 c[5] = [cura1], stride
176 : ia64p 205 ;;
177 : ia64p 230 }
178 :     {.mmi
179 :     ld8 c[6] = [cura0], stride
180 :     ld8 c[7] = [cura1], stride
181 : ia64p 205 ;;
182 : ia64p 230 }
183 :     {.mmi
184 :     ld8 c[8] = [cura0], stride
185 :     ld8 c[9] = [cura1], stride
186 : ia64p 205 ;;
187 : ia64p 230 }
188 :     {.mmi
189 :     ld8 c[10] = [cura0], stride
190 :     ld8 c[11] = [cura1], stride
191 : ia64p 205 ;;
192 : ia64p 230 }
193 :     {.mii
194 :     ld8 c[12] = [cura0], stride
195 :     psad1 mean0 = c[0], r0 // byte sum of a word = SAD against zero; remaining loads are interleaved with the summing
196 :     psad1 mean1 = c[1], r0
197 :     }
198 :     {.mmi
199 :     ld8 c[13] = [cura1], stride
200 :     ;;
201 :     ld8 c[14] = [cura0], stride
202 :     psad1 mean2 = c[2], r0
203 :     }
204 :     {.mii
205 :     ld8 c[15] = [cura1], stride
206 :     psad1 mean3 = c[3], r0
207 :     ;;
208 :     psad1 psad[0] = c[4], r0
209 :     }
210 :     {.mmi
211 :     ld8 c[16] = [cura0], stride
212 :     ld8 c[17] = [cura1], stride
213 :     psad1 psad[1] = c[5], r0
214 : ia64p 205 ;;
215 : ia64p 230 }
216 :     {.mii
217 :     ld8 c[18] = [cura0], stride
218 :     psad1 psad[2] = c[6], r0
219 :     psad1 psad[3] = c[7], r0
220 :     }
221 :     {.mmi
222 :     ld8 c[19] = [cura1], stride
223 :     ;;
224 :     ld8 c[20] = [cura0], stride
225 :     psad1 psad[4] = c[8], r0
226 :     }
227 :     {.mii
228 :     ld8 c[21] = [cura1], stride
229 :     psad1 psad[5] = c[9], r0
230 : ia64p 205 ;;
231 : ia64p 230 add mean0 = mean0, psad[0]
232 :     }
233 :     {.mmi
234 :     ld8 c[22] = [cura0], stride
235 :     ld8 c[23] = [cura1], stride
236 :     add mean1 = mean1, psad[1]
237 :     ;;
238 :     }
239 :     {.mii
240 :     ld8 c[24] = [cura0], stride
241 :     psad1 psad[0] = c[10], r0
242 :     psad1 psad[1] = c[11], r0
243 :     }
244 :     {.mmi
245 :     ld8 c[25] = [cura1], stride
246 :     ;;
247 :     ld8 c[26] = [cura0], stride
248 :     add mean2 = mean2, psad[2]
249 :     }
250 :     {.mii
251 :     ld8 c[27] = [cura1], stride
252 :     add mean3 = mean3, psad[3]
253 :     ;;
254 :     psad1 psad[2] = c[12], r0
255 :     }
256 :     {.mmi
257 :     ld8 c[28] = [cura0], stride
258 :     ld8 c[29] = [cura1], stride
259 :     psad1 psad[3] = c[13], r0
260 :     ;;
261 :     }
262 :     {.mii
263 :     ld8 c[30] = [cura0] // last row: no pointer update needed
264 :     psad1 psad[6] = c[14], r0
265 :     psad1 psad[7] = c[15], r0
266 :     }
267 :     {.mmi
268 :     ld8 c[31] = [cura1]
269 :     ;;
270 :     add mean0 = mean0, psad[0]
271 :     add mean1 = mean1, psad[1]
272 :     }
273 :     {.mii
274 :     add mean2 = mean2, psad[4]
275 :     add mean3 = mean3, psad[5]
276 : ia64p 205 ;;
277 : ia64p 230 psad1 psad[0] = c[16], r0
278 :     }
279 :     {.mmi
280 :     add mean0 = mean0, psad[2]
281 :     add mean1 = mean1, psad[3]
282 :     psad1 psad[1] = c[17], r0
283 : ia64p 205 ;;
284 : ia64p 230 }
285 :     {.mii
286 :     add mean2 = mean2, psad[6]
287 :     psad1 psad[2] = c[18], r0
288 :     psad1 psad[3] = c[19], r0
289 :     }
290 :     {.mmi
291 :     add mean3 = mean3, psad[7]
292 :     ;;
293 :     add mean0 = mean0, psad[0]
294 :     psad1 psad[4] = c[20], r0
295 :     }
296 :     {.mii
297 :     add mean1 = mean1, psad[1]
298 :     psad1 psad[5] = c[21], r0
299 : ia64p 205 ;;
300 : ia64p 230 psad1 psad[6] = c[22], r0
301 :     }
302 :     {.mmi
303 :     add mean2 = mean2, psad[2]
304 :     add mean3 = mean3, psad[3]
305 :     psad1 psad[7] = c[23], r0
306 : ia64p 205 ;;
307 : ia64p 230 }
308 :     {.mii
309 :     add mean0 = mean0, psad[4]
310 :     psad1 psad[0] = c[24], r0
311 :     psad1 psad[1] = c[25], r0
312 :     }
313 :     {.mmi
314 :     add mean1 = mean1, psad[5]
315 : ia64p 205 ;;
316 : ia64p 230 add mean2 = mean2, psad[6]
317 :     psad1 psad[2] = c[26], r0
318 :     }
319 :     {.mii
320 :     add mean3 = mean3, psad[7]
321 :     psad1 psad[3] = c[27], r0
322 :     ;;
323 :     psad1 psad[4] = c[28], r0
324 :     }
325 :     {.mmi
326 :     add mean0 = mean0, psad[0]
327 :     add mean1 = mean1, psad[1]
328 :     psad1 psad[5] = c[29], r0
329 : ia64p 205 ;;
330 : ia64p 230 }
331 :     {.mii
332 :     add mean2 = mean2, psad[2]
333 :     psad1 psad[6] = c[30], r0
334 :     psad1 psad[7] = c[31], r0
335 :     }
336 :     {.mmi
337 :     add mean3 = mean3, psad[3]
338 : ia64p 205 ;;
339 : ia64p 230 add mean0 = mean0, psad[4]
340 :     add mean1 = mean1, psad[5]
341 :     }
342 :     {.mbb
343 :     add mean2 = mean2, mean3 // fold the four partial sums into mean0
344 :     nop.b 1
345 :     nop.b 1
346 : ia64p 205 ;;
347 : ia64p 230 }
348 :     {.mib
349 :     add mean0 = mean0, psad[6]
350 :     add mean1 = mean1, psad[7]
351 :     nop.b 1
352 : ia64p 205 ;;
353 : ia64p 230 }
354 :     {.mib
355 :     add mean0 = mean0, mean1
356 : ia64p 300 // add mean2 = 127, mean2 // this would make the division more exact (rounding), but does not help much
357 : ia64p 205 ;;
358 : ia64p 230 }
359 :     {.mib
360 :     add mean0 = mean0, mean2 // mean0 = sum of all 256 bytes
361 : ia64p 205 ;;
362 : ia64p 230 }
363 :    
364 :     {.mib
365 :     shr.u mean0 = mean0, 8 // divide by 256 (truncating) to get the mean
366 : ia64p 205 ;;
367 : ia64p 230 }
368 :     {.mib
369 :     mux1 mean0 = mean0, @brcst // replicate the mean byte into all 8 byte lanes
370 :     ;;
371 :     }
372 :     {.mii
373 :     nop.m 0
374 :     psad1 dev0 = c[0], mean0 // and do a sad again, this time against the mean
375 :     psad1 dev1 = c[1], mean0
376 :     }
377 :     {.mii
378 :     nop.m 0
379 :     psad1 dev2 = c[2], mean0
380 :     psad1 dev3 = c[3], mean0
381 :     }
382 :     {.mii
383 :     nop.m 0
384 :     psad1 psad[0] = c[4], mean0
385 :     psad1 psad[1] = c[5], mean0
386 :     }
387 :     {.mii
388 :     nop.m 0
389 :     psad1 psad[2] = c[6], mean0
390 :     psad1 psad[3] = c[7], mean0
391 :     }
392 :     {.mii
393 :     nop.m 0
394 :     psad1 psad[4] = c[8], mean0
395 :     psad1 psad[5] = c[9], mean0
396 :     ;;
397 :     }
398 :     {.mii
399 :     add dev0 = dev0, psad[0]
400 :     psad1 psad[6] = c[10], mean0
401 :     psad1 psad[7] = c[11], mean0
402 :     }
403 :     {.mmi
404 :     add dev1 = dev1, psad[1]
405 :    
406 :     add dev2 = dev2, psad[2]
407 :     psad1 psad[0] = c[12], mean0
408 :     }
409 :     {.mii
410 :     add dev3 = dev3, psad[3]
411 :     psad1 psad[1] = c[13], mean0
412 :     ;;
413 :     psad1 psad[2] = c[14], mean0
414 :     }
415 :     {.mmi
416 :     add dev0 = dev0, psad[4]
417 :     add dev1 = dev1, psad[5]
418 :     psad1 psad[3] = c[15], mean0
419 :     }
420 :     {.mii
421 :     add dev2 = dev2, psad[6]
422 :     psad1 psad[4] = c[16], mean0
423 :     psad1 psad[5] = c[17], mean0
424 :     }
425 :     {.mmi
426 :     add dev3 = dev3, psad[7]
427 :     ;;
428 :     add dev0 = dev0, psad[0]
429 :     psad1 psad[6] = c[18], mean0
430 :     }
431 :     {.mii
432 :     add dev1 = dev1, psad[1]
433 :     psad1 psad[7] = c[19], mean0
434 :    
435 :     psad1 psad[0] = c[20], mean0
436 :     }
437 :     {.mmi
438 :     add dev2 = dev2, psad[2]
439 :     add dev3 = dev3, psad[3]
440 :     psad1 psad[1] = c[21], mean0
441 : ia64p 205 ;;
442 : ia64p 230 }
443 :     {.mii
444 :     add dev0 = dev0, psad[4]
445 :     psad1 psad[2] = c[22], mean0
446 :     psad1 psad[3] = c[23], mean0
447 :     }
448 :     {.mmi
449 :     add dev1 = dev1, psad[5]
450 :    
451 :     add dev2 = dev2, psad[6]
452 :     psad1 psad[4] = c[24], mean0
453 :     }
454 :     {.mii
455 :     add dev3 = dev3, psad[7]
456 :     psad1 psad[5] = c[25], mean0
457 :     ;;
458 :     psad1 psad[6] = c[26], mean0
459 :     }
460 :     {.mmi
461 :     add dev0 = dev0, psad[0]
462 :     add dev1 = dev1, psad[1]
463 :     psad1 psad[7] = c[27], mean0
464 :     }
465 :     {.mii
466 :     add dev2 = dev2, psad[2]
467 :     psad1 psad[0] = c[28], mean0
468 :     psad1 psad[1] = c[29], mean0
469 :     }
470 :     {.mmi
471 :     add dev3 = dev3, psad[3]
472 : ia64p 205 ;;
473 : ia64p 230 add dev0 = dev0, psad[4]
474 :     psad1 psad[2] = c[30], mean0
475 :     }
476 :     {.mii
477 :     add dev1 = dev1, psad[5]
478 :     psad1 psad[3] = c[31], mean0
479 :     ;;
480 :     add dev2 = dev2, psad[6]
481 :     }
482 :     {.mmi
483 :     add dev3 = dev3, psad[7]
484 :     add dev0 = dev0, psad[0]
485 :     add dev1 = dev1, psad[1]
486 : ia64p 205 ;;
487 : ia64p 230 }
488 :     {.mii
489 :     add dev2 = dev2, psad[2]
490 :     add dev3 = dev3, psad[3]
491 :     add ret0 = dev0, dev1 // fold the four partial deviations into the return register
492 :     ;;
493 :     }
494 :     {.mib
495 :     add dev2 = dev2, dev3
496 :     nop.i 1
497 :     nop.b 1
498 :     ;;
499 :     }
500 :     {.mib
501 :     add ret0 = ret0, dev2
502 :     nop.i 1
503 : ia64p 205 br.ret.sptk.many b0
504 : ia64p 230 }
505 : ia64p 205 .endp dev16_ia64#
506 : ia64p 319
507 :    
508 :     // ###########################################################
509 :     // ###########################################################
510 :     // New version by group 01 ###################################
511 :     // ###########################################################
512 :     // ###########################################################
513 :    
514 :    
515 :    
// sad16_ia64 -- sum of absolute differences between two 16x16 byte blocks.
// in:  r32 = cur    (read with ld8 directly, so it has to be 8-byte aligned)
//      r33 = ref    (any byte alignment; handled by the .Lmod1 - .Lmod7 paths)
//      r34 = stride in bytes between rows (used for both blocks)
//      r35 = fourth input slot, never read (it is overwritten as an address register)
// out: r8  = SAD
// pr is saved in r2 and restored because p16 - p29 are used.
// The ar.pfs value is kept in r15, NOT in r1: r1 is the global pointer (gp) and
// has to be valid for this procedure again when it returns.
// NOTE(review): sad16bi_ia64 in this file zero-extends its stride with zxt4; here
// r34 is used as a full 64-bit value -- confirm that callers pass it that way.
516 :     .text
517 :     .align 16
518 :     .global sad16_ia64#
519 :     .proc sad16_ia64#
520 :     sad16_ia64:
521 :     alloc r15 = ar.pfs, 4, 76, 0, 0 // 4 inputs + 76 locals: r32 - r111; r15 is a free scratch register (do not use r1 = gp here)
522 :     mov r2 = pr
523 :     dep r14 = r0, r33, 0, 3 // r14 = (r33 div 8)*8 (aligned version of ref)
524 :     dep.z r31 = r33, 0, 3 // r31 = r33 mod 8 (misalignment of ref)
525 :     ;;
526 :     mov r64 = r34 //(1) calculate multiples of stride
527 :     shl r65 = r34, 1 //(2) for being able to load all the
528 :     shladd r66 = r34, 1, r34 //(3) data at once
529 :     shl r67 = r34, 2 //(4)
530 :     shladd r68 = r34, 2, r34 //(5)
531 :     shl r71 = r34, 3 //(8)
532 :     shladd r72 = r34, 3, r34 //(9)
533 :     ;;
534 :     shl r69 = r66, 1 //(6)
535 :     shladd r70 = r66, 1, r34 //(7)
536 :     shl r73 = r68, 1 //(10)
537 :     shladd r74 = r68, 1, r34 //(11)
538 :     shl r75 = r66, 2 //(12)
539 :     shladd r76 = r66, 2, r34 //(13)
540 :     shladd r77 = r66, 2, r65 //(14)
541 :     shladd r78 = r66, 2, r66 //(15)
542 :     ;;
543 :     cmp.eq p16, p17 = 0, r31 // prepare predicates according to the misalignment
544 :     cmp.eq p18, p19 = 2, r31 // of ref
545 :     cmp.eq p20, p21 = 4, r31
546 :     cmp.eq p22, p23 = 6, r31
547 :     cmp.eq p24, p25 = 1, r31
548 :     cmp.eq p26, p27 = 3, r31
549 :     cmp.eq p28, p29 = 5, r31
550 :     mov r96 = r14 // and calculate all the addresses where we have
551 :     mov r33 = r32 // to load from (odd r33 - r63: cur rows, r96 - r111: aligned ref rows)
552 :     add r97 = r14, r64
553 :     add r35 = r32, r64
554 :     add r98 = r14, r65
555 :     add r37 = r32, r65
556 :     add r99 = r14, r66
557 :     add r39 = r32, r66
558 :     add r100 = r14, r67
559 :     add r41 = r32, r67
560 :     add r101 = r14, r68
561 :     add r43 = r32, r68
562 :     add r102 = r14, r69
563 :     add r45 = r32, r69
564 :     add r103 = r14, r70
565 :     add r47 = r32, r70
566 :     add r104 = r14, r71
567 :     add r49 = r32, r71
568 :     add r105 = r14, r72
569 :     add r51 = r32, r72
570 :     add r106 = r14, r73
571 :     add r53 = r32, r73
572 :     add r107 = r14, r74
573 :     add r55 = r32, r74
574 :     add r108 = r14, r75
575 :     add r57 = r32, r75
576 :     add r109 = r14, r76
577 :     add r59 = r32, r76
578 :     add r110 = r14, r77
579 :     add r61 = r32, r77
580 :     add r111 = r14, r78
581 :     add r63 = r32, r78
582 :     ;;
583 :     ld8 r32 = [r33], 8 // Load all the data which is needed for the sad
584 :     ld8 r34 = [r35], 8 // into registers. The goal is to have the array
585 :     ld8 r36 = [r37], 8 // addressed by cur in the registers r32 - r63 and
586 :     ld8 r38 = [r39], 8 // the array addressed by ref in the registers
587 :     ld8 r40 = [r41], 8 // r64 - r95. The registers r96 - r111 are needed
588 :     ld8 r42 = [r43], 8 // to load the aligned 24 bytes per row in which the
589 :     ld8 r44 = [r45], 8 // needed misaligned 16 bytes must lie.
590 :     ld8 r46 = [r47], 8 // After loading we start a preprocessing which
591 :     ld8 r48 = [r49], 8 // guarantees that the data addressed by ref is in
592 :     ld8 r50 = [r51], 8 // the registers r64 - r95.
593 :     ld8 r52 = [r53], 8
594 :     ld8 r54 = [r55], 8
595 :     ld8 r56 = [r57], 8
596 :     ld8 r58 = [r59], 8
597 :     ld8 r60 = [r61], 8
598 :     ld8 r62 = [r63], 8
599 :     ld8 r64 = [r96], 8
600 :     ld8 r66 = [r97], 8
601 :     ld8 r68 = [r98], 8
602 :     ld8 r70 = [r99], 8
603 :     ld8 r72 = [r100], 8
604 :     ld8 r74 = [r101], 8
605 :     ld8 r76 = [r102], 8
606 :     ld8 r78 = [r103], 8
607 :     ld8 r80 = [r104], 8
608 :     ld8 r82 = [r105], 8
609 :     ld8 r84 = [r106], 8
610 :     ld8 r86 = [r107], 8
611 :     ld8 r88 = [r108], 8
612 :     ld8 r90 = [r109], 8
613 :     ld8 r92 = [r110], 8
614 :     ld8 r94 = [r111], 8
615 :     ;;
616 :     ld8 r33 = [r33] // second 8 bytes of every cur row (the address register is reused as the data register)
617 :     ld8 r35 = [r35]
618 :     ld8 r37 = [r37]
619 :     ld8 r39 = [r39]
620 :     ld8 r41 = [r41]
621 :     ld8 r43 = [r43]
622 :     ld8 r45 = [r45]
623 :     ld8 r47 = [r47]
624 :     ld8 r49 = [r49]
625 :     ld8 r51 = [r51]
626 :     ld8 r53 = [r53]
627 :     ld8 r55 = [r55]
628 :     ld8 r57 = [r57]
629 :     ld8 r59 = [r59]
630 :     ld8 r61 = [r61]
631 :     ld8 r63 = [r63]
632 :     ld8 r65 = [r96], 8 // second aligned 8 bytes of every ref row
633 :     ld8 r67 = [r97], 8
634 :     ld8 r69 = [r98], 8
635 :     ld8 r71 = [r99], 8
636 :     ld8 r73 = [r100], 8
637 :     ld8 r75 = [r101], 8
638 :     ld8 r77 = [r102], 8
639 :     ld8 r79 = [r103], 8
640 :     ld8 r81 = [r104], 8
641 :     ld8 r83 = [r105], 8
642 :     ld8 r85 = [r106], 8
643 :     ld8 r87 = [r107], 8
644 :     ld8 r89 = [r108], 8
645 :     ld8 r91 = [r109], 8
646 :     ld8 r93 = [r110], 8
647 :     ld8 r95 = [r111], 8
648 :     (p16) br.cond.dptk.many .Lber // If ref is aligned, everything is loaded and we can start the calculation
649 :     ;;
650 :     ld8 r96 = [r96] // If not, we have to load a third aligned 8 bytes per ref row
651 :     ld8 r97 = [r97]
652 :     ld8 r98 = [r98]
653 :     ld8 r99 = [r99]
654 :     ld8 r100 = [r100]
655 :     ld8 r101 = [r101]
656 :     ld8 r102 = [r102]
657 :     ld8 r103 = [r103]
658 :     ld8 r104 = [r104]
659 :     ld8 r105 = [r105]
660 :     ld8 r106 = [r106]
661 :     ld8 r107 = [r107]
662 :     ld8 r108 = [r108]
663 :     ld8 r109 = [r109]
664 :     ld8 r110 = [r110]
665 :     ld8 r111 = [r111]
666 :     (p24) br.cond.dptk.many .Lmod1 // according to the misalignment, we have
667 :     (p18) br.cond.dpnt.many .Lmod2 // to jump to different preprocessing routines
668 :     (p26) br.cond.dpnt.many .Lmod3
669 :     (p20) br.cond.dpnt.many .Lmod4
670 :     (p28) br.cond.dpnt.many .Lmod5
671 :     (p22) br.cond.dpnt.many .Lmod6
672 :     ;;
673 :     .Lmod7: // fall-through case (misalignment 7); the label itself is never jumped to
674 :     shrp r64 = r65, r64, 56 // in these blocks we do the preprocessing: shift each register pair right by 8 * misalignment bits
675 :     shrp r65 = r96, r65, 56
676 :     shrp r66 = r67, r66, 56
677 :     shrp r67 = r97, r67, 56
678 :     shrp r68 = r69, r68, 56
679 :     shrp r69 = r98, r69, 56
680 :     shrp r70 = r71, r70, 56
681 :     shrp r71 = r99, r71, 56
682 :     shrp r72 = r73, r72, 56
683 :     shrp r73 = r100, r73, 56
684 :     shrp r74 = r75, r74, 56
685 :     shrp r75 = r101, r75, 56
686 :     shrp r76 = r77, r76, 56
687 :     shrp r77 = r102, r77, 56
688 :     shrp r78 = r79, r78, 56
689 :     shrp r79 = r103, r79, 56
690 :     shrp r80 = r81, r80, 56
691 :     shrp r81 = r104, r81, 56
692 :     shrp r82 = r83, r82, 56
693 :     shrp r83 = r105, r83, 56
694 :     shrp r84 = r85, r84, 56
695 :     shrp r85 = r106, r85, 56
696 :     shrp r86 = r87, r86, 56
697 :     shrp r87 = r107, r87, 56
698 :     shrp r88 = r89, r88, 56
699 :     shrp r89 = r108, r89, 56
700 :     shrp r90 = r91, r90, 56
701 :     shrp r91 = r109, r91, 56
702 :     shrp r92 = r93, r92, 56
703 :     shrp r93 = r110, r93, 56
704 :     shrp r94 = r95, r94, 56
705 :     shrp r95 = r111, r95, 56
706 :     br.cond.sptk.many .Lber // and then we jump to the calculation
707 :     ;;
708 :     .Lmod6:
709 :     shrp r64 = r65, r64, 48
710 :     shrp r65 = r96, r65, 48
711 :     shrp r66 = r67, r66, 48
712 :     shrp r67 = r97, r67, 48
713 :     shrp r68 = r69, r68, 48
714 :     shrp r69 = r98, r69, 48
715 :     shrp r70 = r71, r70, 48
716 :     shrp r71 = r99, r71, 48
717 :     shrp r72 = r73, r72, 48
718 :     shrp r73 = r100, r73, 48
719 :     shrp r74 = r75, r74, 48
720 :     shrp r75 = r101, r75, 48
721 :     shrp r76 = r77, r76, 48
722 :     shrp r77 = r102, r77, 48
723 :     shrp r78 = r79, r78, 48
724 :     shrp r79 = r103, r79, 48
725 :     shrp r80 = r81, r80, 48
726 :     shrp r81 = r104, r81, 48
727 :     shrp r82 = r83, r82, 48
728 :     shrp r83 = r105, r83, 48
729 :     shrp r84 = r85, r84, 48
730 :     shrp r85 = r106, r85, 48
731 :     shrp r86 = r87, r86, 48
732 :     shrp r87 = r107, r87, 48
733 :     shrp r88 = r89, r88, 48
734 :     shrp r89 = r108, r89, 48
735 :     shrp r90 = r91, r90, 48
736 :     shrp r91 = r109, r91, 48
737 :     shrp r92 = r93, r92, 48
738 :     shrp r93 = r110, r93, 48
739 :     shrp r94 = r95, r94, 48
740 :     shrp r95 = r111, r95, 48
741 :     br.cond.sptk.many .Lber
742 :     ;;
743 :     .Lmod5:
744 :     shrp r64 = r65, r64, 40
745 :     shrp r65 = r96, r65, 40
746 :     shrp r66 = r67, r66, 40
747 :     shrp r67 = r97, r67, 40
748 :     shrp r68 = r69, r68, 40
749 :     shrp r69 = r98, r69, 40
750 :     shrp r70 = r71, r70, 40
751 :     shrp r71 = r99, r71, 40
752 :     shrp r72 = r73, r72, 40
753 :     shrp r73 = r100, r73, 40
754 :     shrp r74 = r75, r74, 40
755 :     shrp r75 = r101, r75, 40
756 :     shrp r76 = r77, r76, 40
757 :     shrp r77 = r102, r77, 40
758 :     shrp r78 = r79, r78, 40
759 :     shrp r79 = r103, r79, 40
760 :     shrp r80 = r81, r80, 40
761 :     shrp r81 = r104, r81, 40
762 :     shrp r82 = r83, r82, 40
763 :     shrp r83 = r105, r83, 40
764 :     shrp r84 = r85, r84, 40
765 :     shrp r85 = r106, r85, 40
766 :     shrp r86 = r87, r86, 40
767 :     shrp r87 = r107, r87, 40
768 :     shrp r88 = r89, r88, 40
769 :     shrp r89 = r108, r89, 40
770 :     shrp r90 = r91, r90, 40
771 :     shrp r91 = r109, r91, 40
772 :     shrp r92 = r93, r92, 40
773 :     shrp r93 = r110, r93, 40
774 :     shrp r94 = r95, r94, 40
775 :     shrp r95 = r111, r95, 40
776 :     br.cond.sptk.many .Lber
777 :     ;;
778 :     .Lmod4:
779 :     shrp r64 = r65, r64, 32
780 :     shrp r65 = r96, r65, 32
781 :     shrp r66 = r67, r66, 32
782 :     shrp r67 = r97, r67, 32
783 :     shrp r68 = r69, r68, 32
784 :     shrp r69 = r98, r69, 32
785 :     shrp r70 = r71, r70, 32
786 :     shrp r71 = r99, r71, 32
787 :     shrp r72 = r73, r72, 32
788 :     shrp r73 = r100, r73, 32
789 :     shrp r74 = r75, r74, 32
790 :     shrp r75 = r101, r75, 32
791 :     shrp r76 = r77, r76, 32
792 :     shrp r77 = r102, r77, 32
793 :     shrp r78 = r79, r78, 32
794 :     shrp r79 = r103, r79, 32
795 :     shrp r80 = r81, r80, 32
796 :     shrp r81 = r104, r81, 32
797 :     shrp r82 = r83, r82, 32
798 :     shrp r83 = r105, r83, 32
799 :     shrp r84 = r85, r84, 32
800 :     shrp r85 = r106, r85, 32
801 :     shrp r86 = r87, r86, 32
802 :     shrp r87 = r107, r87, 32
803 :     shrp r88 = r89, r88, 32
804 :     shrp r89 = r108, r89, 32
805 :     shrp r90 = r91, r90, 32
806 :     shrp r91 = r109, r91, 32
807 :     shrp r92 = r93, r92, 32
808 :     shrp r93 = r110, r93, 32
809 :     shrp r94 = r95, r94, 32
810 :     shrp r95 = r111, r95, 32
811 :     br.cond.sptk.many .Lber
812 :     ;;
813 :     .Lmod3:
814 :     shrp r64 = r65, r64, 24
815 :     shrp r65 = r96, r65, 24
816 :     shrp r66 = r67, r66, 24
817 :     shrp r67 = r97, r67, 24
818 :     shrp r68 = r69, r68, 24
819 :     shrp r69 = r98, r69, 24
820 :     shrp r70 = r71, r70, 24
821 :     shrp r71 = r99, r71, 24
822 :     shrp r72 = r73, r72, 24
823 :     shrp r73 = r100, r73, 24
824 :     shrp r74 = r75, r74, 24
825 :     shrp r75 = r101, r75, 24
826 :     shrp r76 = r77, r76, 24
827 :     shrp r77 = r102, r77, 24
828 :     shrp r78 = r79, r78, 24
829 :     shrp r79 = r103, r79, 24
830 :     shrp r80 = r81, r80, 24
831 :     shrp r81 = r104, r81, 24
832 :     shrp r82 = r83, r82, 24
833 :     shrp r83 = r105, r83, 24
834 :     shrp r84 = r85, r84, 24
835 :     shrp r85 = r106, r85, 24
836 :     shrp r86 = r87, r86, 24
837 :     shrp r87 = r107, r87, 24
838 :     shrp r88 = r89, r88, 24
839 :     shrp r89 = r108, r89, 24
840 :     shrp r90 = r91, r90, 24
841 :     shrp r91 = r109, r91, 24
842 :     shrp r92 = r93, r92, 24
843 :     shrp r93 = r110, r93, 24
844 :     shrp r94 = r95, r94, 24
845 :     shrp r95 = r111, r95, 24
846 :     br.cond.sptk.many .Lber
847 :     ;;
848 :     .Lmod2:
849 :     shrp r64 = r65, r64, 16
850 :     shrp r65 = r96, r65, 16
851 :     shrp r66 = r67, r66, 16
852 :     shrp r67 = r97, r67, 16
853 :     shrp r68 = r69, r68, 16
854 :     shrp r69 = r98, r69, 16
855 :     shrp r70 = r71, r70, 16
856 :     shrp r71 = r99, r71, 16
857 :     shrp r72 = r73, r72, 16
858 :     shrp r73 = r100, r73, 16
859 :     shrp r74 = r75, r74, 16
860 :     shrp r75 = r101, r75, 16
861 :     shrp r76 = r77, r76, 16
862 :     shrp r77 = r102, r77, 16
863 :     shrp r78 = r79, r78, 16
864 :     shrp r79 = r103, r79, 16
865 :     shrp r80 = r81, r80, 16
866 :     shrp r81 = r104, r81, 16
867 :     shrp r82 = r83, r82, 16
868 :     shrp r83 = r105, r83, 16
869 :     shrp r84 = r85, r84, 16
870 :     shrp r85 = r106, r85, 16
871 :     shrp r86 = r87, r86, 16
872 :     shrp r87 = r107, r87, 16
873 :     shrp r88 = r89, r88, 16
874 :     shrp r89 = r108, r89, 16
875 :     shrp r90 = r91, r90, 16
876 :     shrp r91 = r109, r91, 16
877 :     shrp r92 = r93, r92, 16
878 :     shrp r93 = r110, r93, 16
879 :     shrp r94 = r95, r94, 16
880 :     shrp r95 = r111, r95, 16
881 :     br.cond.sptk.many .Lber
882 :     ;;
883 :     .Lmod1:
884 :     shrp r64 = r65, r64, 8
885 :     shrp r65 = r96, r65, 8
886 :     shrp r66 = r67, r66, 8
887 :     shrp r67 = r97, r67, 8
888 :     shrp r68 = r69, r68, 8
889 :     shrp r69 = r98, r69, 8
890 :     shrp r70 = r71, r70, 8
891 :     shrp r71 = r99, r71, 8
892 :     shrp r72 = r73, r72, 8
893 :     shrp r73 = r100, r73, 8
894 :     shrp r74 = r75, r74, 8
895 :     shrp r75 = r101, r75, 8
896 :     shrp r76 = r77, r76, 8
897 :     shrp r77 = r102, r77, 8
898 :     shrp r78 = r79, r78, 8
899 :     shrp r79 = r103, r79, 8
900 :     shrp r80 = r81, r80, 8
901 :     shrp r81 = r104, r81, 8
902 :     shrp r82 = r83, r82, 8
903 :     shrp r83 = r105, r83, 8
904 :     shrp r84 = r85, r84, 8
905 :     shrp r85 = r106, r85, 8
906 :     shrp r86 = r87, r86, 8
907 :     shrp r87 = r107, r87, 8
908 :     shrp r88 = r89, r88, 8
909 :     shrp r89 = r108, r89, 8
910 :     shrp r90 = r91, r90, 8
911 :     shrp r91 = r109, r91, 8
912 :     shrp r92 = r93, r92, 8
913 :     shrp r93 = r110, r93, 8
914 :     shrp r94 = r95, r94, 8
915 :     shrp r95 = r111, r95, 8
916 :     .Lber:
917 :     ;;
918 :     psad1 r32 = r32, r64 // Here we do the calculation.
919 :     psad1 r33 = r33, r65 // The machine is providing a fast method
920 :     psad1 r34 = r34, r66 // for calculating sad, so we use it
921 :     psad1 r35 = r35, r67
922 :     psad1 r36 = r36, r68
923 :     psad1 r37 = r37, r69
924 :     psad1 r38 = r38, r70
925 :     psad1 r39 = r39, r71
926 :     psad1 r40 = r40, r72
927 :     psad1 r41 = r41, r73
928 :     psad1 r42 = r42, r74
929 :     psad1 r43 = r43, r75
930 :     psad1 r44 = r44, r76
931 :     psad1 r45 = r45, r77
932 :     psad1 r46 = r46, r78
933 :     psad1 r47 = r47, r79
934 :     psad1 r48 = r48, r80
935 :     psad1 r49 = r49, r81
936 :     psad1 r50 = r50, r82
937 :     psad1 r51 = r51, r83
938 :     psad1 r52 = r52, r84
939 :     psad1 r53 = r53, r85
940 :     psad1 r54 = r54, r86
941 :     psad1 r55 = r55, r87
942 :     psad1 r56 = r56, r88
943 :     psad1 r57 = r57, r89
944 :     psad1 r58 = r58, r90
945 :     psad1 r59 = r59, r91
946 :     psad1 r60 = r60, r92
947 :     psad1 r61 = r61, r93
948 :     psad1 r62 = r62, r94
949 :     psad1 r63 = r63, r95
950 :     ;;
951 :     add r32 = r32, r63 // at last, we have to sum up
952 :     add r33 = r33, r62 // in 5 stages
953 :     add r34 = r34, r61
954 :     add r35 = r35, r60
955 :     add r36 = r36, r59
956 :     add r37 = r37, r58
957 :     add r38 = r38, r57
958 :     add r39 = r39, r56
959 :     add r40 = r40, r55
960 :     add r41 = r41, r54
961 :     add r42 = r42, r53
962 :     add r43 = r43, r52
963 :     add r44 = r44, r51
964 :     add r45 = r45, r50
965 :     add r46 = r46, r49
966 :     add r47 = r47, r48
967 :     ;;
968 :     add r32 = r32, r47
969 :     add r33 = r33, r46
970 :     add r34 = r34, r45
971 :     add r35 = r35, r44
972 :     add r36 = r36, r43
973 :     add r37 = r37, r42
974 :     add r38 = r38, r41
975 :     add r39 = r39, r40
976 :     ;;
977 :     add r32 = r32, r39
978 :     add r33 = r33, r38
979 :     add r34 = r34, r37
980 :     add r35 = r35, r36
981 :     ;;
982 :     add r32 = r32, r35
983 :     add r33 = r33, r34
984 :     ;;
985 :     add r8 = r32, r33 // and store the result in r8
986 :     mov pr = r2, -1 // restore all predicates saved on entry
987 :     mov ar.pfs = r15 // put back the value alloc read on entry
988 :     br.ret.sptk.many b0
989 :     .endp sad16_ia64#
990 :    
991 :    
992 :    
993 :    
// ------------------------------------------------------------------------------
// sad8_ia64 - sum of absolute differences between two 8x8 blocks of bytes
//
// In:    r32 = cur     address of the first row of the current block; rows are
//                      fetched with plain ld8 and no alignment fix-up is done
//                      for cur
//        r33 = ref     address of the first row of the reference block; may
//                      have any byte alignment (resolved below)
//        r34 = stride  distance in bytes between consecutive rows of both
//                      blocks; only the low 32 bits are used
// Out:   r8  = sum of the eight per-row psad1 results
// Temps: r1 (copy of ar.pfs), r2 (copy of pr), r14, r31, r35 - r55,
//        p16 - p29 (all predicates are restored from r2 before returning)
//
// The misalignment of ref is determined once, from ref itself, and applied to
// all eight rows, so every row of ref is treated as having the same
// misalignment as the first one.
//
// NOTE(review): r1 is overwritten and not restored; it is conventionally the
// gp register on IA-64 - confirm that no caller relies on it surviving the call.
// NOTE(review): the zero extension of stride assumes the C prototype declares it
// as an unsigned 32-bit integer (sad16bi_ia64 in this file zero-extends its
// 32-bit argument in the same way) - confirm against the header.
// ------------------------------------------------------------------------------
	.align 16
	.global sad8_ia64#
	.proc sad8_ia64#
sad8_ia64:
	alloc r1 = ar.pfs, 3, 21, 0, 0		// 3 inputs + 21 locals: r32 - r55
	mov r2 = pr				// keep all predicates, p16 - p29 are overwritten below
	dep r14 = r0, r33, 0, 3			// r14 = ref with its low 3 bits cleared (8-byte aligned ref)
	dep.z r31 = r33, 0, 3			// r31 = low 3 bits of ref (misalignment in bytes, 0 - 7)
	zxt4 r34 = r34				// clear the upper 32 bits of stride before it is used in address arithmetic
	;;
	mov r40 = r34				// r40 = 1 * stride
	shl r41 = r34, 1			// r41 = 2 * stride
	shladd r42 = r34, 1, r34		// r42 = 3 * stride
	shl r43 = r34, 2			// r43 = 4 * stride
	shladd r44 = r34, 2, r34		// r44 = 5 * stride
	;;
	cmp.eq p16, p17 = 0, r31		// one predicate per misalignment value:
	cmp.eq p18, p19 = 2, r31		//   p16 = 0, p24 = 1, p18 = 2, p26 = 3,
	shl r45 = r42, 1			// r45 = 6 * stride
	cmp.eq p20, p21 = 4, r31		//   p20 = 4, p28 = 5, p22 = 6
	cmp.eq p22, p23 = 6, r31		//   (7 is the case left when none of them is set)
	shladd r46 = r42, 1, r34		// r46 = 7 * stride
	cmp.eq p24, p25 = 1, r31
	cmp.eq p26, p27 = 3, r31
	cmp.eq p28, p29 = 5, r31
	;;
	mov r48 = r14				// row addresses: r32 - r39 for cur (row 0 is r32 itself),
	add r33 = r32, r40			// r48 - r55 for the aligned ref
	add r49 = r14, r40
	add r34 = r32, r41
	add r50 = r14, r41
	add r35 = r32, r42
	add r51 = r14, r42
	add r36 = r32, r43
	add r52 = r14, r43
	add r37 = r32, r44
	add r53 = r14, r44
	add r38 = r32, r45
	add r54 = r14, r45
	add r39 = r32, r46
	add r55 = r14, r46
	;;
	ld8 r32 = [r32]				// load all eight rows of both blocks:
	ld8 r33 = [r33]				// cur rows end up in r32 - r39,
	ld8 r34 = [r34]				// ref rows (aligned part) in r40 - r47
	ld8 r35 = [r35]
	ld8 r36 = [r36]
	ld8 r37 = [r37]
	ld8 r38 = [r38]
	ld8 r39 = [r39]
	ld8 r40 = [r48] ,8			// post-increment: r48 - r55 now point at the
	ld8 r41 = [r49] ,8			// 8 bytes following each aligned ref row
	ld8 r42 = [r50] ,8
	ld8 r43 = [r51] ,8
	ld8 r44 = [r52] ,8
	ld8 r45 = [r53] ,8
	ld8 r46 = [r54] ,8
	ld8 r47 = [r55] ,8
	(p16) br.cond.dptk.many .Lber2		// ref is 8-byte aligned: r40 - r47 are already the ref rows
	;;
	ld8 r48 = [r48]				// ref is misaligned: each row straddles two aligned
	ld8 r49 = [r49]				// 8-byte words, so fetch the following word as well
	ld8 r50 = [r50]
	ld8 r51 = [r51]
	ld8 r52 = [r52]
	ld8 r53 = [r53]
	ld8 r54 = [r54]
	ld8 r55 = [r55]
	(p24) br.cond.dptk.many .Lmode1		// dispatch on the misalignment (1 - 6);
	(p18) br.cond.dpnt.many .Lmode2		// misalignment 7 falls through to .Lmode7
	(p26) br.cond.dpnt.many .Lmode3
	(p20) br.cond.dpnt.many .Lmode4
	(p28) br.cond.dpnt.many .Lmode5
	(p22) br.cond.dpnt.many .Lmode6
	;;
.Lmode7:					// not a branch target; the label is only here for readability
	shrp r40 = r48, r40, 56			// misalignment 7: each ref row is the pair
	shrp r41 = r49, r41, 56			// (following word : aligned word) shifted right by 7 * 8 bits
	shrp r42 = r50, r42, 56
	shrp r43 = r51, r43, 56
	shrp r44 = r52, r44, 56
	shrp r45 = r53, r45, 56
	shrp r46 = r54, r46, 56
	shrp r47 = r55, r47, 56
	br.cond.sptk.many .Lber2
	;;
.Lmode6:					// misalignment 6: shift right by 6 * 8 bits
	shrp r40 = r48, r40, 48
	shrp r41 = r49, r41, 48
	shrp r42 = r50, r42, 48
	shrp r43 = r51, r43, 48
	shrp r44 = r52, r44, 48
	shrp r45 = r53, r45, 48
	shrp r46 = r54, r46, 48
	shrp r47 = r55, r47, 48
	br.cond.sptk.many .Lber2
	;;
.Lmode5:					// misalignment 5: shift right by 5 * 8 bits
	shrp r40 = r48, r40, 40
	shrp r41 = r49, r41, 40
	shrp r42 = r50, r42, 40
	shrp r43 = r51, r43, 40
	shrp r44 = r52, r44, 40
	shrp r45 = r53, r45, 40
	shrp r46 = r54, r46, 40
	shrp r47 = r55, r47, 40
	br.cond.sptk.many .Lber2
	;;
.Lmode4:					// misalignment 4: shift right by 4 * 8 bits
	shrp r40 = r48, r40, 32
	shrp r41 = r49, r41, 32
	shrp r42 = r50, r42, 32
	shrp r43 = r51, r43, 32
	shrp r44 = r52, r44, 32
	shrp r45 = r53, r45, 32
	shrp r46 = r54, r46, 32
	shrp r47 = r55, r47, 32
	br.cond.sptk.many .Lber2
	;;
.Lmode3:					// misalignment 3: shift right by 3 * 8 bits
	shrp r40 = r48, r40, 24
	shrp r41 = r49, r41, 24
	shrp r42 = r50, r42, 24
	shrp r43 = r51, r43, 24
	shrp r44 = r52, r44, 24
	shrp r45 = r53, r45, 24
	shrp r46 = r54, r46, 24
	shrp r47 = r55, r47, 24
	br.cond.sptk.many .Lber2
	;;
.Lmode2:					// misalignment 2: shift right by 2 * 8 bits
	shrp r40 = r48, r40, 16
	shrp r41 = r49, r41, 16
	shrp r42 = r50, r42, 16
	shrp r43 = r51, r43, 16
	shrp r44 = r52, r44, 16
	shrp r45 = r53, r45, 16
	shrp r46 = r54, r46, 16
	shrp r47 = r55, r47, 16
	br.cond.sptk.many .Lber2
	;;
.Lmode1:					// misalignment 1: shift right by 1 * 8 bits,
	shrp r40 = r48, r40, 8			// then fall through into the calculation
	shrp r41 = r49, r41, 8
	shrp r42 = r50, r42, 8
	shrp r43 = r51, r43, 8
	shrp r44 = r52, r44, 8
	shrp r45 = r53, r45, 8
	shrp r46 = r54, r46, 8
	shrp r47 = r55, r47, 8
.Lber2:
	;;
	psad1 r32 = r32, r40			// one psad1 per row: cur row against ref row
	psad1 r33 = r33, r41
	psad1 r34 = r34, r42
	psad1 r35 = r35, r43
	psad1 r36 = r36, r44
	psad1 r37 = r37, r45
	psad1 r38 = r38, r46
	psad1 r39 = r39, r47
	;;
	add r32 = r32, r33			// add the eight row results pairwise: 8 -> 4
	add r33 = r34, r35
	add r34 = r36, r37
	add r35 = r38, r39
	;;
	add r32 = r32, r33			// 4 -> 2
	add r33 = r34, r35
	;;
	add r8 = r32, r33			// 2 -> 1, the total goes into r8
	mov pr = r2, -1				// restore all predicates saved on entry
	mov ar.pfs = r1				// restore ar.pfs saved by alloc
	br.ret.sptk.many b0
	.endp sad8_ia64#

No admin address has been configured
ViewVC Help
Powered by ViewVC 1.0.4