--- trunk/xvidcore/src/image/ia64_asm/interpolate8x8_ia64.s 2002/07/11 00:15:59 290 +++ trunk/xvidcore/src/image/ia64_asm/interpolate8x8_ia64.s 2002/07/11 14:03:39 291 @@ -25,23 +25,24 @@ and r14 = -8,r33 // align src mov r15 = r32 // get dest mov r16 = r34 // stride - sub r17 = 1,r35 // 1-rounding +// sub r17 = 0,r0 // 1-rounding + ;; add r18 = 8,r14 - mux1 r17 = r17, @brcst // broadcast 1-rounding +// mux1 r17 = r17, @brcst // broadcast 1-rounding sub r24 = 64,r22 // lshift of src add r26 = 8,r22 // rshift of src+1 sub r27 = 56,r22 // lshift of src+1 mov ar.lc = 7 // loopcounter - mov ar.ec = LL + SL +OL + AVL + AL + STL // sum of latencies + mov ar.ec = LL + SL +OL + AVL + STL // sum of latencies mov pr.rot = 1 << 16 // init pr regs for sw-pipeling ;; - .rotr ald1[LL+1],ald2[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],add1[AL+1],avg[AVL+1] - .rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL], pavg1p[AVL],stp[STL] + .rotr ald1[LL+1],ald2[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1] + .rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL] .Lloop_interpolate: @@ -56,9 +57,9 @@ (or1p[0]) or or1[0] = shru1[SL],shl2[SL] // merge things (or1p[0]) or or2[0] = shru2[SL],shl1[SL] - (addp[0]) padd1.uus add1[0] = or1[OL],r17 // add 1-rounding +// (addp[0]) padd1.uus add1[0] = or1[OL],r17 // add 1-rounding - (pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL] // parallel average + (pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] // parallel average (stp[0]) st8 [r15] = avg[AVL] // store results (stp[0]) add r15 = r15,r16 @@ -96,24 +97,24 @@ and r14 = -8,r33 mov r15 = r32 mov r16 = r34 - sub r17 = 1,r35 +// sub r17 = 0,r0 ;; add r18 = 8,r14 add r19 = r14,r16 // src + stride - mux1 r17 = r17, @brcst +// mux1 r17 = r17, @brcst sub r24 = 64,r22 ;; add r26 = 8,r19 // src + stride + 8 mov ar.lc = 7 - mov ar.ec = LL + SL +OL + AVL + AL + STL + mov ar.ec = LL + SL +OL + AVL + STL mov pr.rot = 1 << 16 ;; - .rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],add1[AL+1],avg[AVL+1] - .rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL], pavg1p[AVL],stp[STL] + .rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1] + .rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL] .Lloop_interpolate2: @@ -130,9 +131,9 @@ (or1p[0]) or or1[0] = shru1[SL],shl1[SL] (or1p[0]) or or2[0] = shru2[SL],shl2[SL] - (addp[0]) padd1.uus add1[0] = or1[OL],r17 +// (addp[0]) padd1.uus add1[0] = or1[OL],r17 - (pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL] + (pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] (stp[0]) st8 [r15] = avg[AVL] (stp[0]) add r15 = r15,r16 @@ -170,12 +171,12 @@ and r14 = -8,r33 mov r15 = r32 mov r16 = r34 - sub r17 = 1,r35 +// sub r17 = 0,r0 ;; add r18 = 8,r14 add r19 = r14,r16 - mux1 r17 = r17, @brcst +// mux1 r17 = r17, @brcst add r27 = 8,r22 sub r28 = 56,r22 @@ -184,12 +185,12 @@ add r26 = 8,r19 mov ar.lc = 7 - mov ar.ec = LL + SL +OL + 2*AVL + AL + STL + mov ar.ec = LL + SL +OL + 2*AVL + STL mov pr.rot = 1 << 16 ;; - .rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],shl3[SL+1],shru3[SL+1],shl4[SL+1],shru4[SL+1],or1[OL+1],or2[OL+1+AL],or3[OL+AL+1],or4[OL+AL+1],add1[AL+1],avg[AVL+1],avg1[AVL+1],avg2[AVL+1] - .rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL],pavg1p[AVL],pavg2p[AVL],stp[STL] + .rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],shl3[SL+1],shru3[SL+1],shl4[SL+1],shru4[SL+1],or1[OL+1],or2[OL+1+AL],or3[OL+AL+1],or4[OL+AL+1],avg[AVL+1],avg1[AVL+1],avg2[AVL+1] + .rotp aldp[LL], sh1p[SL], or1p[OL],pavg1p[AVL],pavg2p[AVL],stp[STL] .Lloop_interpolate3: @@ -213,10 +214,10 @@ (or1p[0]) or or3[0] = shru3[SL],shl3[SL] (or1p[0]) or or4[0] = shru4[SL],shl4[SL] - (addp[0]) padd1.uus add1[0] = or1[OL],r17 +// (addp[0]) padd1.uus add1[0] = or1[OL],r17 - (pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL] - (pavg1p[0]) pavg1 avg1[0] = or3[OL+AL],or4[OL+AL] + (pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] + (pavg1p[0]) pavg1 avg1[0] = or3[OL],or4[OL] (pavg2p[0]) pavg1 avg2[0] = avg[AVL],avg1[AVL]