--- trunk/xvidcore/src/motion/ia64_asm/sad_ia64.s	2002/06/20 14:25:44	230
+++ trunk/xvidcore/src/motion/ia64_asm/sad_ia64.s	2002/07/20 12:48:54	319
@@ -1,170 +1,29 @@
-.text
- .align 16
- .global sad16_ia64#
- .proc sad16_ia64#
-sad16_ia64:
-
-_LL=3
-_SL=1
-_OL=1
-_PL=1
-_AL=1
-
- alloc r9=ar.pfs,4,44,0,48
-
- mov r8 = r0
-
- mov r20 = ar.lc
- mov r21 = pr
-
- dep.z r22 = r32, 3, 3 // multiply the first 3 bits by 8
- dep.z r23 = r33, 3, 3 // into r22 and r23 -> shift counts
-
- and r14 = -8, r32 // copy the parameters into lower registers
- and r15 = -8, r33 // AND Ref and Cur with 11111...1000
- mov r16 = r34
- mov r17 = r35
- ;;
- add r18 = 8, r14 // precompute addresses
- add r19 = 8, r15
-
- sub r24 = 64, r22 // compute the shift counts
- sub r25 = 64, r23
-
- add r26 = 16, r14 // precompute addresses
- add r27 = 16, r15
-
- // initialize the loop counter
- mov ar.lc = 15 // run the loop 16 times
- mov ar.ec = _LL + _SL + _OL + _PL + _AL + _AL // run the loop another nine times at the end
-
- // reset the rotating predicate registers and set p16 to 1
- mov pr.rot = 1 << 16
- ;;
-
- // initialize the rotating register arrays
- .rotr _ald1[_LL+1], _ald2[_LL+1], _ald3[_LL+1], _ald4[_LL+1], _ald5[_LL+1], _ald6[_LL+1], _shru1[_SL+1], _shl1[_SL+1], _shru2[_SL], _shl2[_SL], _shru3[_SL], _shl3[_SL], _shru4[_SL], _shl4[_SL+1], _or1[_OL], _or2[_OL], _or3[_OL], _or4[_OL+1], _psadr1[_PL+1], _psadr2[_PL+1], _addr1[_AL+1]
- .rotp _aldp[_LL], _shp[_SL], _orp[_OL], _psadrp[_PL], _addrp1[_AL], _addrp2[_AL]
-
-.L_loop_16:
- {.mmi
- (_aldp[0]) ld8 _ald1[0] = [r14], r16 // Cur: first 8 bytes
- (_aldp[0]) ld8 _ald2[0] = [r18], r16 // Cur: second 8 bytes
- (_psadrp[0]) psad1 _psadr1[0] = _or2[0], _or4[0] // do the psad
- }
- {.mmi
- (_aldp[0]) ld8 _ald3[0] = [r26], r16 // Cur: third 8 bytes
- (_aldp[0]) ld8 _ald4[0] = [r15], r16 // Ref: first 8 bytes
- (_psadrp[0]) psad1 _psadr2[0] = _or3[0], _or4[_OL] // _or2 +1
- }
- {.mmi
- (_aldp[0]) ld8 _ald5[0] = [r19], r16 // Ref: second 8 bytes
- (_aldp[0]) ld8 _ald6[0] = [r27], r16 // Ref: third 8 bytes
- (_shp[0]) shr.u _shru1[0] = _ald1[_LL], r22
- }
- {.mii
- (_orp[0]) or _or1[0] = _shl2[0], _shru3[0] // _shru2 + 1 and _shl2 + 1
- (_shp[0]) shl _shl1[0] = _ald2[_LL], r24
- (_shp[0]) shr.u _shru2[0] = _ald2[_LL], r22
- }
- {.mii
- (_orp[0]) or _or2[0] = _shl3[0], _shru4[0] // _shru3 + 1 and _shl3 + 1
- (_shp[0]) shl _shl2[0] = _ald3[_LL], r24
- (_shp[0]) shr.u _shru3[0] = _ald4[_LL], r23
- }
- {.mii
- (_orp[0]) or _or3[0] = _shl4[0], _shl4[_SL] // _shru4 + 1 and _shl4 + 1
- (_shp[0]) shl _shl3[0] = _ald5[_LL], r25
- (_shp[0]) shr.u _shru4[0] = _ald5[_LL], r23
- }
- {.mmi
- (_orp[0]) or _or4[0] = _shru1[_SL], _shl1[_SL]
- (_shp[0]) shl _shl4[0]= _ald6[_LL], r25
- }
- {.mmb
- (_addrp1[0]) add _addr1[0] = _psadr1[_PL], _psadr2[_PL] // sum up
- (_addrp2[0]) add r8 = r8, _addr1[_AL]
- br.ctop.sptk.few .L_loop_16
- ;;
- }
- // restore registers
- mov ar.lc = r20
- mov pr = r21,-1
- br.ret.sptk.many rp
- .endp sad16_ia64#
+// ------------------------------------------------------------------------------
+// *
+// * Optimized Assembler Versions of sad8 and sad16
+// *
+// ------------------------------------------------------------------------------
+// *
+// * Hannes Jütting and Christopher Özbek
+// * {s_juetti,s_oezbek}@ira.uka.de
+// *
+// * Programmed for the IA64 laboratory held at the University of Karlsruhe, 2002
+// * http://www.info.uni-karlsruhe.de/~rubino/ia64p/
+// *
+// ------------------------------------------------------------------------------
+// *
+// * These are the optimized assembler versions of sad8 and sad16, which calculate
+// * the sum of absolute differences between two 8x8/16x16 block matrices.
+// *
+// * Our approach uses:
+// *  - The Itanium instruction psad1, which solves the problem in hardware.
+// *  - Modulo-scheduled loops as the best way of unrolling loops on the IA64
+// *    EPIC architecture
+// *  - Alignment resolving to avoid memory faults
+// *
+// ------------------------------------------------------------------------------

- .align 16
- .global sad8_ia64#
- .proc sad8_ia64#
-
-sad8_ia64:
-
-LL=3
-SL=1
-OL=1
-PL=1
-AL=1
-
- alloc r9=ar.pfs,3,29,0,32
- mov r20 = ar.lc
- mov r21 = pr
-
- dep.z r22 = r32, 3, 3 // multiply the first 3 bits by 8
- dep.z r23 = r33, 3, 3 // into r22 and r23 -> shift counts
-
- mov r8 = r0 // . . . .
- and r14 = -8, r32 // 0xFFFFFFFFFFFFFFF8, r32
- and r15 = -8, r33 // 0xFFFFFFFFFFFFFFF8, r33
- mov r16 = r34
-// mov r17 = r35
- ;;
-
- add r18 = 8, r14
- add r19 = 8, r15
-
- sub r24 = 64, r22
- sub r25 = 64, r23
-
- // initialize the loop counter
- mov ar.lc = 7 // run the loop 8 times
- mov ar.ec = LL + SL + OL + PL + AL // run the loop another ten times at the end
-
- // reset the rotating predicate registers and set p16 to 1
- mov pr.rot = 1 << 16
- ;;
- .rotr ald1[LL+1], ald2[LL+1], ald3[LL+1], ald4[LL+1], shru1[SL+1], shl1[SL+1], shru2[SL+1], shl2[SL+1], or1[OL+1], or2[OL+1], psadr[PL+1], addr[AL+1]
- .rotp aldp[LL], shp[SL], orp[OL], psadrp[PL], addrp[AL]
-.L_loop_8:
- {.mmi
- (aldp[0]) ld8 ald1[0] = [r14], r16 // load Cur
- (aldp[0]) ld8 ald2[0] = [r18], r16
- (shp[0]) shr.u shru1[0] = ald1[LL], r22 // merge
- }
- {.mii
- (orp[0]) or or1[0] = shru1[SL], shl1[SL]
- (shp[0]) shl shl1[0] = ald2[LL], r24
- (shp[0]) shr.u shru2[0] = ald3[LL], r23 // merge
- }
- {.mmi
- (aldp[0]) ld8 ald3[0] = [r15], r16 // load Ref
- (aldp[0]) ld8 ald4[0] = [r19], r16
- (shp[0]) shl shl2[0] = ald4[LL], r25
- }
- {.mmi
- (orp[0]) or or2[0] = shru2[SL], shl2[SL]
- (addrp[0]) add r8 = r8, psadr[PL]
- (psadrp[0]) psad1 psadr[0] = or1[OL], or2[OL]
- }
- {.mbb
- br.ctop.sptk.few .L_loop_8
- ;;
- }
-
- mov ar.lc = r20
- mov pr = r21,-1
- br.ret.sptk.many b0
- .endp sad8_ia64#
 .common sad16bi#,8,8
@@ -494,7 +353,7 @@
 }
 {.mib
 add mean0 = mean0, mean1
- // add mean2 = 127, mean2 // this could make our division more exact, but does not help much
+ // add mean2 = 127, mean2 // this could make our division more exact, but it does not help much
 ;;
 }
 {.mib
@@ -644,3 +503,664 @@
 br.ret.sptk.many b0
 }
 .endp dev16_ia64#
+
+
+// ###########################################################
+// ###########################################################
+// New version by group 01 ###################################
+// ###########################################################
+// ###########################################################
+
+
+
+ .text
+ .align 16
+ .global sad16_ia64#
+ .proc sad16_ia64#
+sad16_ia64:
+ alloc r1 = ar.pfs, 4, 76, 0, 0
+ mov r2 = pr
+ dep r14 = r0, r33, 0, 3 // r14 = (r33 div 8)*8 (aligned version of ref)
+ dep.z r31 = r33, 0, 3 // r31 = r33 mod 8 (misalignment of ref)
+ ;;
+ mov r64 = r34 //(1) calculate multiples of the stride
+ shl r65 = r34, 1 //(2) so that all the data can be
+ shladd r66 = r34, 1, r34 //(3) loaded at once
+ shl r67 = r34, 2 //(4)
+ shladd r68 = r34, 2, r34 //(5)
+ shl r71 = r34, 3 //(8)
+ shladd r72 = r34, 3, r34 //(9)
+ ;;
+ shl r69 = r66, 1 //(6)
+ shladd r70 = r66, 1, r34 //(7)
+ shl r73 = r68, 1 //(10)
+ shladd r74 = r68, 1, r34 //(11)
+ shl r75 = r66, 2 //(12)
+ shladd r76 = r66, 2, r34 //(13)
+ shladd r77 = r66, 2, r65 //(14)
+ shladd r78 = r66, 2, r66 //(15)
+ ;;
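+ // r64 - r78 now hold stride*1 up to stride*15: shl and shladd build
+ // all the row offsets without an integer multiply, e.g.
+ // stride*13 = ((stride*3) << 2) + stride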
+ cmp.eq p16, p17 = 0, r31 // prepare predicates according to the
+ cmp.eq p18, p19 = 2, r31 // misalignment of ref
+ cmp.eq p20, p21 = 4, r31
+ cmp.eq p22, p23 = 6, r31
+ cmp.eq p24, p25 = 1, r31
+ cmp.eq p26, p27 = 3, r31
+ cmp.eq p28, p29 = 5, r31
+ mov r96 = r14 // and calculate all the addresses we have
+ mov r33 = r32 // to load from
+ add r97 = r14, r64
+ add r35 = r32, r64
+ add r98 = r14, r65
+ add r37 = r32, r65
+ add r99 = r14, r66
+ add r39 = r32, r66
+ add r100 = r14, r67
+ add r41 = r32, r67
+ add r101 = r14, r68
+ add r43 = r32, r68
+ add r102 = r14, r69
+ add r45 = r32, r69
+ add r103 = r14, r70
+ add r47 = r32, r70
+ add r104 = r14, r71
+ add r49 = r32, r71
+ add r105 = r14, r72
+ add r51 = r32, r72
+ add r106 = r14, r73
+ add r53 = r32, r73
+ add r107 = r14, r74
+ add r55 = r32, r74
+ add r108 = r14, r75
+ add r57 = r32, r75
+ add r109 = r14, r76
+ add r59 = r32, r76
+ add r110 = r14, r77
+ add r61 = r32, r77
+ add r111 = r14, r78
+ add r63 = r32, r78
+ ;;
+ ld8 r32 = [r33], 8 // Load all the data needed for the sad
+ ld8 r34 = [r35], 8 // into the registers. The goal is to have the array
+ ld8 r36 = [r37], 8 // addressed by cur in the registers r32 - r63 and
+ ld8 r38 = [r39], 8 // the array addressed by ref in the registers
+ ld8 r40 = [r41], 8 // r64 - r95. The registers r96 - r111 are needed
+ ld8 r42 = [r43], 8 // to load the aligned 24 bytes in which the
+ ld8 r44 = [r45], 8 // needed misaligned 16 bytes lie.
+ ld8 r46 = [r47], 8 // After loading we run a preprocessing step which
+ ld8 r48 = [r49], 8 // guarantees that the data addressed by ref ends up
+ ld8 r50 = [r51], 8 // in the registers r64 - r95.
+ ld8 r52 = [r53], 8
+ ld8 r54 = [r55], 8
+ ld8 r56 = [r57], 8
+ ld8 r58 = [r59], 8
+ ld8 r60 = [r61], 8
+ ld8 r62 = [r63], 8
+ ld8 r64 = [r96], 8
+ ld8 r66 = [r97], 8
+ ld8 r68 = [r98], 8
+ ld8 r70 = [r99], 8
+ ld8 r72 = [r100], 8
+ ld8 r74 = [r101], 8
+ ld8 r76 = [r102], 8
+ ld8 r78 = [r103], 8
+ ld8 r80 = [r104], 8
+ ld8 r82 = [r105], 8
+ ld8 r84 = [r106], 8
+ ld8 r86 = [r107], 8
+ ld8 r88 = [r108], 8
+ ld8 r90 = [r109], 8
+ ld8 r92 = [r110], 8
+ ld8 r94 = [r111], 8
+ ;;
+ ld8 r33 = [r33]
+ ld8 r35 = [r35]
+ ld8 r37 = [r37]
+ ld8 r39 = [r39]
+ ld8 r41 = [r41]
+ ld8 r43 = [r43]
+ ld8 r45 = [r45]
+ ld8 r47 = [r47]
+ ld8 r49 = [r49]
+ ld8 r51 = [r51]
+ ld8 r53 = [r53]
+ ld8 r55 = [r55]
+ ld8 r57 = [r57]
+ ld8 r59 = [r59]
+ ld8 r61 = [r61]
+ ld8 r63 = [r63]
+ ld8 r65 = [r96], 8
+ ld8 r67 = [r97], 8
+ ld8 r69 = [r98], 8
+ ld8 r71 = [r99], 8
+ ld8 r73 = [r100], 8
+ ld8 r75 = [r101], 8
+ ld8 r77 = [r102], 8
+ ld8 r79 = [r103], 8
+ ld8 r81 = [r104], 8
+ ld8 r83 = [r105], 8
+ ld8 r85 = [r106], 8
+ ld8 r87 = [r107], 8
+ ld8 r89 = [r108], 8
+ ld8 r91 = [r109], 8
+ ld8 r93 = [r110], 8
+ ld8 r95 = [r111], 8
+ (p16) br.cond.dptk.many .Lber // If ref is aligned, everything is loaded and we can start the calculation
+ ;;
+ ld8 r96 = [r96] // If not, we have to load a bit more
+ ld8 r97 = [r97]
+ ld8 r98 = [r98]
+ ld8 r99 = [r99]
+ ld8 r100 = [r100]
+ ld8 r101 = [r101]
+ ld8 r102 = [r102]
+ ld8 r103 = [r103]
+ ld8 r104 = [r104]
+ ld8 r105 = [r105]
+ ld8 r106 = [r106]
+ ld8 r107 = [r107]
+ ld8 r108 = [r108]
+ ld8 r109 = [r109]
+ ld8 r110 = [r110]
+ ld8 r111 = [r111]
+ (p24) br.cond.dptk.many .Lmod1 // according to the misalignment, we have
+ (p18) br.cond.dpnt.many .Lmod2 // to jump to different preprocessing routines
+ (p26) br.cond.dpnt.many .Lmod3
+ (p20) br.cond.dpnt.many .Lmod4
+ (p28)
br.cond.dpnt.many .Lmod5 + (p22) br.cond.dpnt.many .Lmod6 + ;; +.Lmod7: // this jump point is not needed + shrp r64 = r65, r64, 56 // in these blocks, we do the preprocessing + shrp r65 = r96, r65, 56 + shrp r66 = r67, r66, 56 + shrp r67 = r97, r67, 56 + shrp r68 = r69, r68, 56 + shrp r69 = r98, r69, 56 + shrp r70 = r71, r70, 56 + shrp r71 = r99, r71, 56 + shrp r72 = r73, r72, 56 + shrp r73 = r100, r73, 56 + shrp r74 = r75, r74, 56 + shrp r75 = r101, r75, 56 + shrp r76 = r77, r76, 56 + shrp r77 = r102, r77, 56 + shrp r78 = r79, r78, 56 + shrp r79 = r103, r79, 56 + shrp r80 = r81, r80, 56 + shrp r81 = r104, r81, 56 + shrp r82 = r83, r82, 56 + shrp r83 = r105, r83, 56 + shrp r84 = r85, r84, 56 + shrp r85 = r106, r85, 56 + shrp r86 = r87, r86, 56 + shrp r87 = r107, r87, 56 + shrp r88 = r89, r88, 56 + shrp r89 = r108, r89, 56 + shrp r90 = r91, r90, 56 + shrp r91 = r109, r91, 56 + shrp r92 = r93, r92, 56 + shrp r93 = r110, r93, 56 + shrp r94 = r95, r94, 56 + shrp r95 = r111, r95, 56 + br.cond.sptk.many .Lber // and then we jump to the calculation + ;; +.Lmod6: + shrp r64 = r65, r64, 48 + shrp r65 = r96, r65, 48 + shrp r66 = r67, r66, 48 + shrp r67 = r97, r67, 48 + shrp r68 = r69, r68, 48 + shrp r69 = r98, r69, 48 + shrp r70 = r71, r70, 48 + shrp r71 = r99, r71, 48 + shrp r72 = r73, r72, 48 + shrp r73 = r100, r73, 48 + shrp r74 = r75, r74, 48 + shrp r75 = r101, r75, 48 + shrp r76 = r77, r76, 48 + shrp r77 = r102, r77, 48 + shrp r78 = r79, r78, 48 + shrp r79 = r103, r79, 48 + shrp r80 = r81, r80, 48 + shrp r81 = r104, r81, 48 + shrp r82 = r83, r82, 48 + shrp r83 = r105, r83, 48 + shrp r84 = r85, r84, 48 + shrp r85 = r106, r85, 48 + shrp r86 = r87, r86, 48 + shrp r87 = r107, r87, 48 + shrp r88 = r89, r88, 48 + shrp r89 = r108, r89, 48 + shrp r90 = r91, r90, 48 + shrp r91 = r109, r91, 48 + shrp r92 = r93, r92, 48 + shrp r93 = r110, r93, 48 + shrp r94 = r95, r94, 48 + shrp r95 = r111, r95, 48 + br.cond.sptk.many .Lber + ;; +.Lmod5: + shrp r64 = r65, r64, 40 + shrp r65 = r96, r65, 40 + shrp r66 = r67, r66, 40 + shrp r67 = r97, r67, 40 + shrp r68 = r69, r68, 40 + shrp r69 = r98, r69, 40 + shrp r70 = r71, r70, 40 + shrp r71 = r99, r71, 40 + shrp r72 = r73, r72, 40 + shrp r73 = r100, r73, 40 + shrp r74 = r75, r74, 40 + shrp r75 = r101, r75, 40 + shrp r76 = r77, r76, 40 + shrp r77 = r102, r77, 40 + shrp r78 = r79, r78, 40 + shrp r79 = r103, r79, 40 + shrp r80 = r81, r80, 40 + shrp r81 = r104, r81, 40 + shrp r82 = r83, r82, 40 + shrp r83 = r105, r83, 40 + shrp r84 = r85, r84, 40 + shrp r85 = r106, r85, 40 + shrp r86 = r87, r86, 40 + shrp r87 = r107, r87, 40 + shrp r88 = r89, r88, 40 + shrp r89 = r108, r89, 40 + shrp r90 = r91, r90, 40 + shrp r91 = r109, r91, 40 + shrp r92 = r93, r92, 40 + shrp r93 = r110, r93, 40 + shrp r94 = r95, r94, 40 + shrp r95 = r111, r95, 40 + br.cond.sptk.many .Lber + ;; +.Lmod4: + shrp r64 = r65, r64, 32 + shrp r65 = r96, r65, 32 + shrp r66 = r67, r66, 32 + shrp r67 = r97, r67, 32 + shrp r68 = r69, r68, 32 + shrp r69 = r98, r69, 32 + shrp r70 = r71, r70, 32 + shrp r71 = r99, r71, 32 + shrp r72 = r73, r72, 32 + shrp r73 = r100, r73, 32 + shrp r74 = r75, r74, 32 + shrp r75 = r101, r75, 32 + shrp r76 = r77, r76, 32 + shrp r77 = r102, r77, 32 + shrp r78 = r79, r78, 32 + shrp r79 = r103, r79, 32 + shrp r80 = r81, r80, 32 + shrp r81 = r104, r81, 32 + shrp r82 = r83, r82, 32 + shrp r83 = r105, r83, 32 + shrp r84 = r85, r84, 32 + shrp r85 = r106, r85, 32 + shrp r86 = r87, r86, 32 + shrp r87 = r107, r87, 32 + shrp r88 = r89, r88, 32 + shrp r89 = r108, r89, 32 + shrp r90 = r91, r90, 32 + 
shrp r91 = r109, r91, 32 + shrp r92 = r93, r92, 32 + shrp r93 = r110, r93, 32 + shrp r94 = r95, r94, 32 + shrp r95 = r111, r95, 32 + br.cond.sptk.many .Lber + ;; +.Lmod3: + shrp r64 = r65, r64, 24 + shrp r65 = r96, r65, 24 + shrp r66 = r67, r66, 24 + shrp r67 = r97, r67, 24 + shrp r68 = r69, r68, 24 + shrp r69 = r98, r69, 24 + shrp r70 = r71, r70, 24 + shrp r71 = r99, r71, 24 + shrp r72 = r73, r72, 24 + shrp r73 = r100, r73, 24 + shrp r74 = r75, r74, 24 + shrp r75 = r101, r75, 24 + shrp r76 = r77, r76, 24 + shrp r77 = r102, r77, 24 + shrp r78 = r79, r78, 24 + shrp r79 = r103, r79, 24 + shrp r80 = r81, r80, 24 + shrp r81 = r104, r81, 24 + shrp r82 = r83, r82, 24 + shrp r83 = r105, r83, 24 + shrp r84 = r85, r84, 24 + shrp r85 = r106, r85, 24 + shrp r86 = r87, r86, 24 + shrp r87 = r107, r87, 24 + shrp r88 = r89, r88, 24 + shrp r89 = r108, r89, 24 + shrp r90 = r91, r90, 24 + shrp r91 = r109, r91, 24 + shrp r92 = r93, r92, 24 + shrp r93 = r110, r93, 24 + shrp r94 = r95, r94, 24 + shrp r95 = r111, r95, 24 + br.cond.sptk.many .Lber + ;; +.Lmod2: + shrp r64 = r65, r64, 16 + shrp r65 = r96, r65, 16 + shrp r66 = r67, r66, 16 + shrp r67 = r97, r67, 16 + shrp r68 = r69, r68, 16 + shrp r69 = r98, r69, 16 + shrp r70 = r71, r70, 16 + shrp r71 = r99, r71, 16 + shrp r72 = r73, r72, 16 + shrp r73 = r100, r73, 16 + shrp r74 = r75, r74, 16 + shrp r75 = r101, r75, 16 + shrp r76 = r77, r76, 16 + shrp r77 = r102, r77, 16 + shrp r78 = r79, r78, 16 + shrp r79 = r103, r79, 16 + shrp r80 = r81, r80, 16 + shrp r81 = r104, r81, 16 + shrp r82 = r83, r82, 16 + shrp r83 = r105, r83, 16 + shrp r84 = r85, r84, 16 + shrp r85 = r106, r85, 16 + shrp r86 = r87, r86, 16 + shrp r87 = r107, r87, 16 + shrp r88 = r89, r88, 16 + shrp r89 = r108, r89, 16 + shrp r90 = r91, r90, 16 + shrp r91 = r109, r91, 16 + shrp r92 = r93, r92, 16 + shrp r93 = r110, r93, 16 + shrp r94 = r95, r94, 16 + shrp r95 = r111, r95, 16 + br.cond.sptk.many .Lber + ;; +.Lmod1: + shrp r64 = r65, r64, 8 + shrp r65 = r96, r65, 8 + shrp r66 = r67, r66, 8 + shrp r67 = r97, r67, 8 + shrp r68 = r69, r68, 8 + shrp r69 = r98, r69, 8 + shrp r70 = r71, r70, 8 + shrp r71 = r99, r71, 8 + shrp r72 = r73, r72, 8 + shrp r73 = r100, r73, 8 + shrp r74 = r75, r74, 8 + shrp r75 = r101, r75, 8 + shrp r76 = r77, r76, 8 + shrp r77 = r102, r77, 8 + shrp r78 = r79, r78, 8 + shrp r79 = r103, r79, 8 + shrp r80 = r81, r80, 8 + shrp r81 = r104, r81, 8 + shrp r82 = r83, r82, 8 + shrp r83 = r105, r83, 8 + shrp r84 = r85, r84, 8 + shrp r85 = r106, r85, 8 + shrp r86 = r87, r86, 8 + shrp r87 = r107, r87, 8 + shrp r88 = r89, r88, 8 + shrp r89 = r108, r89, 8 + shrp r90 = r91, r90, 8 + shrp r91 = r109, r91, 8 + shrp r92 = r93, r92, 8 + shrp r93 = r110, r93, 8 + shrp r94 = r95, r94, 8 + shrp r95 = r111, r95, 8 +.Lber: + ;; + psad1 r32 = r32, r64 // Here we do the calculation. 
+ psad1 r33 = r33, r65 // The machine provides a fast method
+ psad1 r34 = r34, r66 // for calculating the sad, so we use it
+ psad1 r35 = r35, r67
+ psad1 r36 = r36, r68
+ psad1 r37 = r37, r69
+ psad1 r38 = r38, r70
+ psad1 r39 = r39, r71
+ psad1 r40 = r40, r72
+ psad1 r41 = r41, r73
+ psad1 r42 = r42, r74
+ psad1 r43 = r43, r75
+ psad1 r44 = r44, r76
+ psad1 r45 = r45, r77
+ psad1 r46 = r46, r78
+ psad1 r47 = r47, r79
+ psad1 r48 = r48, r80
+ psad1 r49 = r49, r81
+ psad1 r50 = r50, r82
+ psad1 r51 = r51, r83
+ psad1 r52 = r52, r84
+ psad1 r53 = r53, r85
+ psad1 r54 = r54, r86
+ psad1 r55 = r55, r87
+ psad1 r56 = r56, r88
+ psad1 r57 = r57, r89
+ psad1 r58 = r58, r90
+ psad1 r59 = r59, r91
+ psad1 r60 = r60, r92
+ psad1 r61 = r61, r93
+ psad1 r62 = r62, r94
+ psad1 r63 = r63, r95
+ ;;
+ add r32 = r32, r63 // finally, we have to sum everything up
+ add r33 = r33, r62 // in five stages
+ add r34 = r34, r61
+ add r35 = r35, r60
+ add r36 = r36, r59
+ add r37 = r37, r58
+ add r38 = r38, r57
+ add r39 = r39, r56
+ add r40 = r40, r55
+ add r41 = r41, r54
+ add r42 = r42, r53
+ add r43 = r43, r52
+ add r44 = r44, r51
+ add r45 = r45, r50
+ add r46 = r46, r49
+ add r47 = r47, r48
+ ;;
+ add r32 = r32, r47
+ add r33 = r33, r46
+ add r34 = r34, r45
+ add r35 = r35, r44
+ add r36 = r36, r43
+ add r37 = r37, r42
+ add r38 = r38, r41
+ add r39 = r39, r40
+ ;;
+ add r32 = r32, r39
+ add r33 = r33, r38
+ add r34 = r34, r37
+ add r35 = r35, r36
+ ;;
+ add r32 = r32, r35
+ add r33 = r33, r34
+ ;;
+ add r8 = r32, r33 // and store the result in r8
+ mov pr = r2, -1
+ mov ar.pfs = r1
+ br.ret.sptk.many b0
+ .endp sad16_ia64#
+
+
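+// sad8 below follows the same scheme as sad16: load cur and the aligned
+// 8-byte words covering ref, and if ref is misaligned by m bytes (m = 1..7)
+// rebuild each ref word with shrp, which concatenates two 8-byte words and
+// extracts the misaligned one:
+//   shrp dst = hi, lo, 8*m   <=>   dst = (lo >> 8*m) | (hi << (64 - 8*m))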
+
+ .align 16
+ .global sad8_ia64#
+ .proc sad8_ia64#
+sad8_ia64:
+ alloc r1 = ar.pfs, 3, 21, 0, 0
+ mov r2 = pr
+ dep r14 = r0, r33, 0, 3 // calculate the aligned version of ref
+ dep.z r31 = r33, 0, 3 // calculate the misalignment of ref
+ ;;
+ mov r40 = r34 //(1) calculate multiples of the stride
+ shl r41 = r34, 1 //(2)
+ shladd r42 = r34, 1, r34 //(3)
+ shl r43 = r34, 2 //(4)
+ shladd r44 = r34, 2, r34 //(5)
+ ;;
+ cmp.eq p16, p17 = 0, r31 // set predicates according to the misalignment of ref
+ cmp.eq p18, p19 = 2, r31
+ shl r45 = r42, 1 //(6)
+ cmp.eq p20, p21 = 4, r31
+ cmp.eq p22, p23 = 6, r31
+ shladd r46 = r42, 1, r34 //(7)
+ cmp.eq p24, p25 = 1, r31
+ cmp.eq p26, p27 = 3, r31
+ cmp.eq p28, p29 = 5, r31
+ ;;
+ mov r48 = r14 // calculate the memory addresses of the data
+ add r33 = r32, r40
+ add r49 = r14, r40
+ add r34 = r32, r41
+ add r50 = r14, r41
+ add r35 = r32, r42
+ add r51 = r14, r42
+ add r36 = r32, r43
+ add r52 = r14, r43
+ add r37 = r32, r44
+ add r53 = r14, r44
+ add r38 = r32, r45
+ add r54 = r14, r45
+ add r39 = r32, r46
+ add r55 = r14, r46
+ ;;
+ ld8 r32 = [r32] // load everything:
+ ld8 r33 = [r33] // cur is located in r32 - r39,
+ ld8 r34 = [r34] // ref in r40 - r47
+ ld8 r35 = [r35]
+ ld8 r36 = [r36]
+ ld8 r37 = [r37]
+ ld8 r38 = [r38]
+ ld8 r39 = [r39]
+ ld8 r40 = [r48] ,8
+ ld8 r41 = [r49] ,8
+ ld8 r42 = [r50] ,8
+ ld8 r43 = [r51] ,8
+ ld8 r44 = [r52] ,8
+ ld8 r45 = [r53] ,8
+ ld8 r46 = [r54] ,8
+ ld8 r47 = [r55] ,8
+ (p16) br.cond.dptk.many .Lber2 // if ref is aligned, we can start the calculation
+ ;;
+ ld8 r48 = [r48] // if not, we have to load some more
+ ld8 r49 = [r49] // because ld8 needs aligned addresses
+ ld8 r50 = [r50]
+ ld8 r51 = [r51]
+ ld8 r52 = [r52]
+ ld8 r53 = [r53]
+ ld8 r54 = [r54]
+ ld8 r55 = [r55]
+ (p24) br.cond.dptk.many .Lmode1
+ (p18) br.cond.dpnt.many .Lmode2
+ (p26) br.cond.dpnt.many .Lmode3
+ (p20) br.cond.dpnt.many .Lmode4
+ (p28) br.cond.dpnt.many .Lmode5
+ (p22) br.cond.dpnt.many .Lmode6
+ ;;
+.Lmode7: // this label is never jumped to; it is only here for better understanding
+ shrp r40 = r48, r40, 56 // here we do some preprocessing on the data
+ shrp r41 = r49, r41, 56 // because of the alignment problem of ref
+ shrp r42 = r50, r42, 56
+ shrp r43 = r51, r43, 56
+ shrp r44 = r52, r44, 56
+ shrp r45 = r53, r45, 56
+ shrp r46 = r54, r46, 56
+ shrp r47 = r55, r47, 56
+ br.cond.sptk.many .Lber2
+ ;;
+.Lmode6:
+ shrp r40 = r48, r40, 48
+ shrp r41 = r49, r41, 48
+ shrp r42 = r50, r42, 48
+ shrp r43 = r51, r43, 48
+ shrp r44 = r52, r44, 48
+ shrp r45 = r53, r45, 48
+ shrp r46 = r54, r46, 48
+ shrp r47 = r55, r47, 48
+ br.cond.sptk.many .Lber2
+ ;;
+.Lmode5:
+ shrp r40 = r48, r40, 40
+ shrp r41 = r49, r41, 40
+ shrp r42 = r50, r42, 40
+ shrp r43 = r51, r43, 40
+ shrp r44 = r52, r44, 40
+ shrp r45 = r53, r45, 40
+ shrp r46 = r54, r46, 40
+ shrp r47 = r55, r47, 40
+ br.cond.sptk.many .Lber2
+ ;;
+.Lmode4:
+ shrp r40 = r48, r40, 32
+ shrp r41 = r49, r41, 32
+ shrp r42 = r50, r42, 32
+ shrp r43 = r51, r43, 32
+ shrp r44 = r52, r44, 32
+ shrp r45 = r53, r45, 32
+ shrp r46 = r54, r46, 32
+ shrp r47 = r55, r47, 32
+ br.cond.sptk.many .Lber2
+ ;;
+.Lmode3:
+ shrp r40 = r48, r40, 24
+ shrp r41 = r49, r41, 24
+ shrp r42 = r50, r42, 24
+ shrp r43 = r51, r43, 24
+ shrp r44 = r52, r44, 24
+ shrp r45 = r53, r45, 24
+ shrp r46 = r54, r46, 24
+ shrp r47 = r55, r47, 24
+ br.cond.sptk.many .Lber2
+ ;;
+.Lmode2:
+ shrp r40 = r48, r40, 16
+ shrp r41 = r49, r41, 16
+ shrp r42 = r50, r42, 16
+ shrp r43 = r51, r43, 16
+ shrp r44 = r52, r44, 16
+ shrp r45 = r53, r45, 16
+ shrp r46 = r54, r46, 16
+ shrp r47 = r55, r47, 16
+ br.cond.sptk.many .Lber2
+ ;;
+.Lmode1:
+ shrp r40 = r48, r40, 8
+ shrp r41 = r49, r41, 8
+ shrp r42 = r50, r42, 8
+ shrp r43 = r51, r43, 8
+ shrp r44 = r52, r44, 8
+ shrp r45 = r53, r45, 8
+ shrp r46 = r54, r46, 8
+ shrp r47 = r55, r47, 8
+.Lber2:
+ ;;
+ psad1 r32 = r32, r40 // we start calculating the sad
+ psad1 r33 = r33, r41 // using the psad1 instruction of the IA64
+ psad1 r34 = r34, r42
+ psad1 r35 = r35, r43
+ psad1 r36 = r36, r44
+ psad1 r37 = r37, r45
+ psad1 r38 = r38, r46
+ psad1 r39 = r39, r47
+ ;;
+ add r32 = r32, r33 // then we sum up everything
+ add r33 = r34, r35
+ add r34 = r36, r37
+ add r35 = r38, r39
+ ;;
+ add r32 = r32, r33
+ add r33 = r34, r35
+ ;;
+ add r8 = r32, r33 // and store the result in r8
+ mov pr = r2, -1
+ mov ar.pfs = r1
+ br.ret.sptk.many b0
+ .endp sad8_ia64#
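
For reference, what the new routines compute can be written in a few lines of C. This sketch is not part of the patch; the names sad16_ref and load_misaligned are only illustrative, and it assumes the little-endian byte order that the shrp merging above relies on (sad8 is the same over an 8x8 block):

#include <stdint.h>
#include <stdlib.h>

/* Scalar equivalent of sad16_ia64: sum of absolute differences over a
 * 16x16 block of bytes; cur and ref advance by stride bytes per row. */
uint32_t sad16_ref(const uint8_t *cur, const uint8_t *ref, uint32_t stride)
{
    uint32_t sad = 0;
    for (int y = 0; y < 16; y++, cur += stride, ref += stride)
        for (int x = 0; x < 16; x++)
            sad += abs((int)cur[x] - (int)ref[x]); /* psad1 does 8 of these at once */
    return sad;
}

/* The alignment resolving in portable form: p points to the aligned 64-bit
 * words covering a misaligned address, mod = address % 8 with 1 <= mod <= 7.
 * This is what shrp computes with a shift count of 8*mod. */
uint64_t load_misaligned(const uint64_t *p, unsigned mod)
{
    return (p[0] >> (8 * mod)) | (p[1] << (64 - 8 * mod));
}

Two aligned loads plus a funnel shift never fault, whereas a single misaligned ld8 would; that is the point of the .Lmod/.Lmode preprocessing blocks.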