--- trunk/xvidcore/src/quant/ia64_asm/quant_h263_ia64.s 2002/06/26 15:26:50 243 +++ trunk/xvidcore/src/quant/ia64_asm/quant_h263_ia64.s 2002/06/26 15:56:35 244 @@ -1,517 +1,517 @@ - .file "quant_h263.1.c" - .pred.safe_across_calls p1-p5,p16-p63 - .section .rodata - .align 4 - .type multipliers#,@object - .size multipliers#,128 -multipliers: - data4 0 - data4 32769 - data4 16385 - data4 10923 - data4 8193 - data4 6554 - data4 5462 - data4 4682 - data4 4097 - data4 3641 - data4 3277 - data4 2979 - data4 2731 - data4 2521 - data4 2341 - data4 2185 - data4 2049 - data4 1928 - data4 1821 - data4 1725 - data4 1639 - data4 1561 - data4 1490 - data4 1425 - data4 1366 - data4 1311 - data4 1261 - data4 1214 - data4 1171 - data4 1130 - data4 1093 - data4 1058 - .global __divdi3# -.text - .align 16 - .global quant_intra_ia64# - .proc quant_intra_ia64# -quant_intra_ia64: - .prologue 12, 37 - .save ar.pfs, r38 - alloc r38 = ar.pfs, 4, 3, 2, 0 - adds r16 = -8, r12 - .fframe 32 - adds r12 = -32, r12 - mov r17 = ar.lc - addl r14 = @ltoff(multipliers#), gp - ld2 r15 = [r33] - ;; - .savesp ar.lc, 24 - st8 [r16] = r17, 8 - ld8 r14 = [r14] - sxt2 r15 = r15 - ;; - .save.f 0x1 - stf.spill [r16] = f2 - .save rp, r37 - mov r37 = b0 - .body - dep.z r36 = r34, 1, 15 - dep.z r16 = r34, 2, 32 - cmp4.ge p6, p7 = 0, r15 - ;; - add r16 = r16, r14 - ;; - ld4 r16 = [r16] - ;; - setf.sig f2 = r16 - (p6) br.cond.dptk .L8 - extr r39 = r35, 1, 31 - sxt4 r40 = r35 - ;; - add r39 = r39, r15 - br .L21 - ;; -.L8: - extr r39 = r35, 1, 31 - sxt4 r40 = r35 - ;; - sub r39 = r15, r39 - ;; -.L21: - sxt4 r39 = r39 - br.call.sptk.many b0 = __divdi3# - ;; - addl r14 = 62, r0 - st2 [r32] = r8 - addl r19 = 1, r0 - ;; - mov ar.lc = r14 - ;; -.L20: - dep.z r17 = r19, 1, 32 - ;; - add r15 = r17, r33 - adds r19 = 1, r19 - ;; - ld2 r14 = [r15] - ;; - sxt2 r14 = r14 - ;; - mov r16 = r14 - mov r18 = r14 - ;; - sub r15 = r0, r16 - cmp4.le p8, p9 = r36, r16 - cmp4.le p6, p7 = r0, r16 - ;; - sxt2 r14 = r15 - (p6) br.cond.dptk .L14 - ;; - mov r16 = r14 - add r18 = r17, r32 - ;; - setf.sig f6 = r16 - cmp4.le p6, p7 = r36, r16 - mov r15 = r18 - ;; - xma.l f6 = f6, f2, f0 - (p7) st2 [r18] = r0 - ;; - getf.sig r14 = f6 - ;; - extr r14 = r14, 16, 16 - ;; - sub r14 = r0, r14 - ;; - (p6) st2 [r15] = r14 - br .L12 -.L14: - .pred.rel.mutex p8, p9 - setf.sig f6 = r18 - add r16 = r17, r32 - ;; - xma.l f6 = f6, f2, f0 - mov r15 = r16 - (p9) st2 [r16] = r0 - ;; - getf.sig r14 = f6 - ;; - extr r14 = r14, 16, 16 - ;; - (p8) st2 [r15] = r14 -.L12: - br.cloop.sptk.few .L20 - adds r18 = 24, r12 - ;; - ld8 r19 = [r18], 8 - mov ar.pfs = r38 - mov b0 = r37 - ;; - mov ar.lc = r19 - ldf.fill f2 = [r18] - .restore sp - adds r12 = 32, r12 - br.ret.sptk.many b0 - .endp quant_intra_ia64# - .common quant_intra#,8,8 - .common dequant_intra#,8,8 - .align 16 - .global dequant_intra_ia64# - .proc dequant_intra_ia64# -dequant_intra_ia64: - .prologue - ld2 r14 = [r33] - andcm r15 = 1, r34 - setf.sig f8 = r35 - ;; - sxt2 r14 = r14 - sub r15 = r34, r15 - addl r16 = -2048, r0 - ;; - setf.sig f6 = r14 - setf.sig f7 = r15 - shladd r34 = r34, 1, r0 - ;; - xma.l f8 = f6, f8, f0 - .save ar.lc, r2 - mov r2 = ar.lc - ;; - .body - getf.sig r14 = f8 - setf.sig f6 = r34 - ;; - sxt2 r15 = r14 - st2 [r32] = r14 - ;; - cmp4.le p6, p7 = r16, r15 - ;; - (p7) st2 [r32] = r16 - (p7) br.cond.dptk .L32 - addl r14 = 2047, r0 - ;; - cmp4.ge p6, p7 = r14, r15 - ;; - (p7) st2 [r32] = r14 -.L32: - addl r14 = 62, r0 - addl r19 = 1, r0 - addl r22 = 2048, r0 - addl r21 = -2048, r0 - addl r20 = 2047, r0 - ;; - mov ar.lc = r14 - ;; -.L56: - dep.z r16 = r19, 1, 32 - ;; - add r14 = r16, r33 - add r17 = r16, r32 - adds r19 = 1, r19 - ;; - ld2 r15 = [r14] - ;; - sxt2 r15 = r15 - ;; - cmp4.ne p6, p7 = 0, r15 - cmp4.le p8, p9 = r0, r15 - ;; - (p7) st2 [r17] = r0 - (p7) br.cond.dpnt .L36 - add r18 = r16, r32 - sub r17 = r0, r15 - ;; - mov r14 = r18 - (p8) br.cond.dptk .L40 - setf.sig f8 = r17 - ;; - xma.l f8 = f6, f8, f7 - ;; - getf.sig r15 = f8 - ;; - cmp4.lt p6, p7 = r22, r15 - sub r16 = r0, r15 - ;; - (p7) st2 [r14] = r16 - (p6) st2 [r14] = r21 - br .L36 -.L40: - setf.sig f8 = r15 - ;; - xma.l f8 = f6, f8, f7 - ;; - getf.sig r15 = f8 - ;; - cmp4.le p6, p7 = r20, r15 - ;; - (p6) mov r14 = r20 - (p7) mov r14 = r15 - ;; - st2 [r18] = r14 -.L36: - br.cloop.sptk.few .L56 - ;; - mov ar.lc = r2 - br.ret.sptk.many b0 - .endp dequant_intra_ia64# - - - -//uint32_t quant_inter_ia64(int16_t *coeff, const int16_t *data, const uint32_t quant) - - - - .common quant_inter#,8,8 - .align 16 - .global quant_inter_ia64# - .proc quant_inter_ia64# -quant_inter_ia64: - - -/******************************************************** - * * - * const uint32_t mult = multipliers[quant]; * - * const uint16_t quant_m_2 = quant << 1; * - * const uint16_t quant_d_2 = quant >> 1; * - * int sum = 0; * - * uint32_t i; * - * int16_t acLevel,acL; * - * * - ********************************************************/ - - - - LL=3 // LL = load latency - - .prologue - addl r14 = @ltoff(multipliers#), gp - dep.z r15 = r34, 2, 32 - .save ar.lc, r2 - mov r2 = ar.lc - ;; - .body - alloc r9=ar.pfs,0,24,0,24 - mov r17 = ar.ec - mov r10 = pr - ld8 r14 = [r14] - extr.u r16 = r34, 1, 16 //r16 = quant_d_2 - dep.z r20 = r34, 1, 15 //r20 = quant_m_2 - ;; - add r15 = r15, r14 - mov r21 = r16 //r21 = quant_d_2 - mov r8 = r0 //r8 = sum = 0 - mov pr.rot = 0 //p16-p63 = 0 - ;; - ld4 r15 = [r15] - addl r14 = 63, r0 - mov pr.rot = 1 << 16 //p16=1 - ;; - mov ar.lc = r14 - mov ar.ec = LL+9 - mov r29 = r15 - ;; - mov r15 = r33 //r15 = data - mov r18 = r32 //r18 = coeff - ;; - - - .rotr ac1[LL+3], ac2[8], ac3[2] - .rotp p[LL+9], cmp1[8], cmp1neg[8],cmp2[5], cmp2neg[2] - - - -/******************************************************************************** - * * - * for (i = 0; i < 64; i++) { * - * acL=acLevel = data[i]; * - * acLevel = ((acLevel < 0)?-acLevel:acLevel) - quant_d_2; * - * if (acLevel < quant_m_2){ * - * acLevel = 0; * - * } * - * acLevel = (acLevel * mult) >> SCALEBITS; * - * sum += acLevel; * - * coeff[i] = ((acL < 0)?-acLevel:acLevel); * - * } * - * * - ********************************************************************************/ - - - -.explicit -.L58: - //pipeline stage -{.mmi - (p[0]) ld2 ac1[0] = [r15],2 // 0 acL=acLevel = data[i]; - (p[LL+1]) sub ac2[0] = r0, ac1[LL+1] // LL+1 ac2=-acLevel - (p[LL]) sxt2 ac1[LL] = ac1[LL] // LL -} -{.mmi - (p[LL+1]) cmp4.le cmp1[0], cmp1neg[0] = r0, ac1[LL+1] // LL+1 cmp1 = (0<=acLevel) ; cmp1neg = !(0<=acLevel) - (p[LL+4]) cmp4.le cmp2[0], cmp2neg[0] = r20, ac2[3] // LL+4 cmp2 = (quant_m_2 < acLevel) ; cmp2neg = !(quant_m_2 < acLevel) - (cmp1[1]) sub ac2[1] = ac1[LL+2], r21 // LL+2 acLevel = acLevel - quant_d_2; -} -{.mmi - (cmp2neg[1]) mov ac2[4] = r0 // LL+5 if (acLevel < quant_m_2) acLevel=0; - (cmp1neg[1]) sub ac2[1] = ac2[1], r21 // LL+2 acLevel = ac2 - quant_d_2; - (p[LL+3]) sxt2 ac2[2] = ac2[2] // LL+3 -} -{.mmi - (cmp1[6]) mov ac3[0] = ac2[6] // LL+7 ac3 = acLevel; - (cmp1neg[6]) sub ac3[0] = r0, ac2[6] // LL+7 ac3 = -acLevel; - (p[LL+6]) pmpyshr2.u ac2[5] = r29, ac2[5], 16 // LL+6 acLevel = (acLevel * mult) >> SCALEBITS; -} -{.mib - (p[LL+8]) st2 [r18] = ac3[1] , 2 // LL+8 coeff[i] = ac3; - (cmp2[4]) add r8 = r8, ac2[7] // LL+8 sum += acLevel; - br.ctop.sptk.few .L58 - ;; -} -.default - mov ar.ec = r17 - ;; - mov ar.lc = r2 - mov pr = r10, -1 - mov ar.pfs = r9 - br.ret.sptk.many b0 - .endp quant_inter_ia64# - - - - - - - -// void dequant_inter_ia64(int16_t *data, const int16_t *coeff, const uint32_t quant) - - .common dequant_inter#,8,8 - .align 16 - .global dequant_inter_ia64# - .proc dequant_inter_ia64# -dequant_inter_ia64: - -//*********************************************************************** -// * -// const uint16_t quant_m_2 = quant << 1; * -// const uint16_t quant_add = (quant & 1 ? quant : quant - 1); * -// uint32_t i; * -// * -//*********************************************************************** * - - - - - .prologue - andcm r14 = 1, r34 - dep.z r29 = r34, 1, 15 - alloc r9=ar.pfs,0,32,0,32 - .save ar.lc, r2 - mov r2 = ar.lc - ;; - .body - sub r15 = r34, r14 // r15 = quant - addl r14 = 63, r0 - addl r21 = -2048, r0 - addl r20 = 2047, r0 - mov r16 = ar.ec - mov r17 = pr - ;; - zxt2 r15 = r15 - mov ar.lc = r14 - mov pr.rot = 0 - ;; - adds r14 = 0, r33 // r14 = coeff - mov r18 = r32 // r18 = data - mov ar.ec = LL+10 - mov pr.rot = 1 << 16 - ;; - -/******************************************************************************** - * * - *for (i = 0; i < 64; i++) { * - * int16_t acLevel = coeff[i]; * - * * - * if (acLevel == 0) * - * { * - * data[i] = 0; * - * } * - * else if (acLevel < 0) * - * { * - * acLevel = acLevel * quant_m_2 - quant_add; * - * data[i] = (acLevel >= -2048 ? acLevel : -2048); * - * } * - * else // if (acLevel > 0) * - * { * - * acLevel = acLevel * quant_m_2 + quant_add; * - * data[i] = (acLevel <= 2047 ? acLevel : 2047); * - * } * - * } * - * * - ********************************************************************************/ - - - - LL=2 // LL := load latency - - - .rotr ac1[LL+10], x[5], y1[3], y2[3] - .rotp p[LL+10] , cmp1neg[8], cmp2[5], cmp2neg[5],cmp3[2], cmp3neg[2] - -.explicit - //pipeline stage - -.L60: -{.mmi - (p[0])ld2 ac1[0] = [r14] ,2 // 0 acLevel = coeff[i]; - (p[LL+1])cmp4.ne p6, cmp1neg[0] = 0, ac1[LL+1] // LL+1 - (p[LL])sxt2 ac1[LL] = ac1[LL] // LL - -} -{.mmi - (p[LL+1])cmp4.le cmp2[0], cmp2neg[0] = r0, ac1[LL+1] // LL+1 - (cmp2[1]) mov x[0] = r20 // LL+2 - (p[LL+2])pmpyshr2.u ac1[LL+2] = r29, ac1[LL+2], 0 // LL+2 -} -{.mmi - (cmp2neg[1]) mov x[0] = r21 // LL+2 - (cmp2[2]) add ac1[LL+3] = ac1[LL+3], r15 // LL+3 - (cmp2neg[2]) sub ac1[LL+3] = ac1[LL+3], r15 // LL+3 - -} -{.mmi - (cmp2neg[4]) mov y1[0] = ac1[LL+5] // LL+5 - (cmp2neg[4]) mov y2[0] = x[3] // LL+5 - (p[LL+4])sxt2 ac1[LL+4] = ac1[LL+4] // LL+4 -} -{.mmi - (cmp2[4]) mov y1[0] = x[3] // LL+4 - (cmp2[4]) mov y2[0] = ac1[LL+5] // LL+4 - (p[LL+6])cmp4.le cmp3[0], cmp3neg[0] = x[4], ac1[LL+6] // LL+6 -} -{.mmi - (cmp3[1]) mov ac1[LL+7] = y1[2] // LL+7 - (cmp3neg[1]) mov ac1[LL+7] = y2[2] // LL+7 - (cmp1neg[7]) mov ac1[LL+8] = r0 // LL+8 -} -{.mbb - (p[LL+9])st2 [r18] = ac1[LL+9] ,2 // LL+9 - nop.b 0x0 - br.ctop.sptk.few .L60 - ;; -} -.default - mov ar.lc = r2 - mov ar.pfs = r9 - mov ar.ec = r16 - mov pr = r17, -1 - ;; - mov ar.lc = r2 - br.ret.sptk.many b0 - .endp dequant_inter_ia64# - .ident "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.1 2.96-85)" + .file "quant_h263.1.c" + .pred.safe_across_calls p1-p5,p16-p63 + .section .rodata + .align 4 + .type multipliers#,@object + .size multipliers#,128 +multipliers: + data4 0 + data4 32769 + data4 16385 + data4 10923 + data4 8193 + data4 6554 + data4 5462 + data4 4682 + data4 4097 + data4 3641 + data4 3277 + data4 2979 + data4 2731 + data4 2521 + data4 2341 + data4 2185 + data4 2049 + data4 1928 + data4 1821 + data4 1725 + data4 1639 + data4 1561 + data4 1490 + data4 1425 + data4 1366 + data4 1311 + data4 1261 + data4 1214 + data4 1171 + data4 1130 + data4 1093 + data4 1058 + .global __divdi3# +.text + .align 16 + .global quant_intra_ia64# + .proc quant_intra_ia64# +quant_intra_ia64: + .prologue 12, 37 + .save ar.pfs, r38 + alloc r38 = ar.pfs, 4, 3, 2, 0 + adds r16 = -8, r12 + .fframe 32 + adds r12 = -32, r12 + mov r17 = ar.lc + addl r14 = @ltoff(multipliers#), gp + ld2 r15 = [r33] + ;; + .savesp ar.lc, 24 + st8 [r16] = r17, 8 + ld8 r14 = [r14] + sxt2 r15 = r15 + ;; + .save.f 0x1 + stf.spill [r16] = f2 + .save rp, r37 + mov r37 = b0 + .body + dep.z r36 = r34, 1, 15 + dep.z r16 = r34, 2, 32 + cmp4.ge p6, p7 = 0, r15 + ;; + add r16 = r16, r14 + ;; + ld4 r16 = [r16] + ;; + setf.sig f2 = r16 + (p6) br.cond.dptk .L8 + extr r39 = r35, 1, 31 + sxt4 r40 = r35 + ;; + add r39 = r39, r15 + br .L21 + ;; +.L8: + extr r39 = r35, 1, 31 + sxt4 r40 = r35 + ;; + sub r39 = r15, r39 + ;; +.L21: + sxt4 r39 = r39 + br.call.sptk.many b0 = __divdi3# + ;; + addl r14 = 62, r0 + st2 [r32] = r8 + addl r19 = 1, r0 + ;; + mov ar.lc = r14 + ;; +.L20: + dep.z r17 = r19, 1, 32 + ;; + add r15 = r17, r33 + adds r19 = 1, r19 + ;; + ld2 r14 = [r15] + ;; + sxt2 r14 = r14 + ;; + mov r16 = r14 + mov r18 = r14 + ;; + sub r15 = r0, r16 + cmp4.le p8, p9 = r36, r16 + cmp4.le p6, p7 = r0, r16 + ;; + sxt2 r14 = r15 + (p6) br.cond.dptk .L14 + ;; + mov r16 = r14 + add r18 = r17, r32 + ;; + setf.sig f6 = r16 + cmp4.le p6, p7 = r36, r16 + mov r15 = r18 + ;; + xma.l f6 = f6, f2, f0 + (p7) st2 [r18] = r0 + ;; + getf.sig r14 = f6 + ;; + extr r14 = r14, 16, 16 + ;; + sub r14 = r0, r14 + ;; + (p6) st2 [r15] = r14 + br .L12 +.L14: + .pred.rel.mutex p8, p9 + setf.sig f6 = r18 + add r16 = r17, r32 + ;; + xma.l f6 = f6, f2, f0 + mov r15 = r16 + (p9) st2 [r16] = r0 + ;; + getf.sig r14 = f6 + ;; + extr r14 = r14, 16, 16 + ;; + (p8) st2 [r15] = r14 +.L12: + br.cloop.sptk.few .L20 + adds r18 = 24, r12 + ;; + ld8 r19 = [r18], 8 + mov ar.pfs = r38 + mov b0 = r37 + ;; + mov ar.lc = r19 + ldf.fill f2 = [r18] + .restore sp + adds r12 = 32, r12 + br.ret.sptk.many b0 + .endp quant_intra_ia64# + .common quant_intra#,8,8 + .common dequant_intra#,8,8 + .align 16 + .global dequant_intra_ia64# + .proc dequant_intra_ia64# +dequant_intra_ia64: + .prologue + ld2 r14 = [r33] + andcm r15 = 1, r34 + setf.sig f8 = r35 + ;; + sxt2 r14 = r14 + sub r15 = r34, r15 + addl r16 = -2048, r0 + ;; + setf.sig f6 = r14 + setf.sig f7 = r15 + shladd r34 = r34, 1, r0 + ;; + xma.l f8 = f6, f8, f0 + .save ar.lc, r2 + mov r2 = ar.lc + ;; + .body + getf.sig r14 = f8 + setf.sig f6 = r34 + ;; + sxt2 r15 = r14 + st2 [r32] = r14 + ;; + cmp4.le p6, p7 = r16, r15 + ;; + (p7) st2 [r32] = r16 + (p7) br.cond.dptk .L32 + addl r14 = 2047, r0 + ;; + cmp4.ge p6, p7 = r14, r15 + ;; + (p7) st2 [r32] = r14 +.L32: + addl r14 = 62, r0 + addl r19 = 1, r0 + addl r22 = 2048, r0 + addl r21 = -2048, r0 + addl r20 = 2047, r0 + ;; + mov ar.lc = r14 + ;; +.L56: + dep.z r16 = r19, 1, 32 + ;; + add r14 = r16, r33 + add r17 = r16, r32 + adds r19 = 1, r19 + ;; + ld2 r15 = [r14] + ;; + sxt2 r15 = r15 + ;; + cmp4.ne p6, p7 = 0, r15 + cmp4.le p8, p9 = r0, r15 + ;; + (p7) st2 [r17] = r0 + (p7) br.cond.dpnt .L36 + add r18 = r16, r32 + sub r17 = r0, r15 + ;; + mov r14 = r18 + (p8) br.cond.dptk .L40 + setf.sig f8 = r17 + ;; + xma.l f8 = f6, f8, f7 + ;; + getf.sig r15 = f8 + ;; + cmp4.lt p6, p7 = r22, r15 + sub r16 = r0, r15 + ;; + (p7) st2 [r14] = r16 + (p6) st2 [r14] = r21 + br .L36 +.L40: + setf.sig f8 = r15 + ;; + xma.l f8 = f6, f8, f7 + ;; + getf.sig r15 = f8 + ;; + cmp4.le p6, p7 = r20, r15 + ;; + (p6) mov r14 = r20 + (p7) mov r14 = r15 + ;; + st2 [r18] = r14 +.L36: + br.cloop.sptk.few .L56 + ;; + mov ar.lc = r2 + br.ret.sptk.many b0 + .endp dequant_intra_ia64# + + + +//uint32_t quant_inter_ia64(int16_t *coeff, const int16_t *data, const uint32_t quant) + + + + .common quant_inter#,8,8 + .align 16 + .global quant_inter_ia64# + .proc quant_inter_ia64# +quant_inter_ia64: + + +/******************************************************** + * * + * const uint32_t mult = multipliers[quant]; * + * const uint16_t quant_m_2 = quant << 1; * + * const uint16_t quant_d_2 = quant >> 1; * + * int sum = 0; * + * uint32_t i; * + * int16_t acLevel,acL; * + * * + ********************************************************/ + + + + LL=3 // LL = load latency + + .prologue + addl r14 = @ltoff(multipliers#), gp + dep.z r15 = r34, 2, 32 + .save ar.lc, r2 + mov r2 = ar.lc + ;; + .body + alloc r9=ar.pfs,0,24,0,24 + mov r17 = ar.ec + mov r10 = pr + ld8 r14 = [r14] + extr.u r16 = r34, 1, 16 //r16 = quant_d_2 + dep.z r20 = r34, 1, 15 //r20 = quant_m_2 + ;; + add r15 = r15, r14 + mov r21 = r16 //r21 = quant_d_2 + mov r8 = r0 //r8 = sum = 0 + mov pr.rot = 0 //p16-p63 = 0 + ;; + ld4 r15 = [r15] + addl r14 = 63, r0 + mov pr.rot = 1 << 16 //p16=1 + ;; + mov ar.lc = r14 + mov ar.ec = LL+9 + mov r29 = r15 + ;; + mov r15 = r33 //r15 = data + mov r18 = r32 //r18 = coeff + ;; + + + .rotr ac1[LL+3], ac2[8], ac3[2] + .rotp p[LL+9], cmp1[8], cmp1neg[8],cmp2[5], cmp2neg[2] + + + +/******************************************************************************** + * * + * for (i = 0; i < 64; i++) { * + * acL=acLevel = data[i]; * + * acLevel = ((acLevel < 0)?-acLevel:acLevel) - quant_d_2; * + * if (acLevel < quant_m_2){ * + * acLevel = 0; * + * } * + * acLevel = (acLevel * mult) >> SCALEBITS; * + * sum += acLevel; * + * coeff[i] = ((acL < 0)?-acLevel:acLevel); * + * } * + * * + ********************************************************************************/ + + + +.explicit +.L58: + //pipeline stage +{.mmi + (p[0]) ld2 ac1[0] = [r15],2 // 0 acL=acLevel = data[i]; + (p[LL+1]) sub ac2[0] = r0, ac1[LL+1] // LL+1 ac2=-acLevel + (p[LL]) sxt2 ac1[LL] = ac1[LL] // LL +} +{.mmi + (p[LL+1]) cmp4.le cmp1[0], cmp1neg[0] = r0, ac1[LL+1] // LL+1 cmp1 = (0<=acLevel) ; cmp1neg = !(0<=acLevel) + (p[LL+4]) cmp4.le cmp2[0], cmp2neg[0] = r20, ac2[3] // LL+4 cmp2 = (quant_m_2 < acLevel) ; cmp2neg = !(quant_m_2 < acLevel) + (cmp1[1]) sub ac2[1] = ac1[LL+2], r21 // LL+2 acLevel = acLevel - quant_d_2; +} +{.mmi + (cmp2neg[1]) mov ac2[4] = r0 // LL+5 if (acLevel < quant_m_2) acLevel=0; + (cmp1neg[1]) sub ac2[1] = ac2[1], r21 // LL+2 acLevel = ac2 - quant_d_2; + (p[LL+3]) sxt2 ac2[2] = ac2[2] // LL+3 +} +{.mmi + (cmp1[6]) mov ac3[0] = ac2[6] // LL+7 ac3 = acLevel; + (cmp1neg[6]) sub ac3[0] = r0, ac2[6] // LL+7 ac3 = -acLevel; + (p[LL+6]) pmpyshr2.u ac2[5] = r29, ac2[5], 16 // LL+6 acLevel = (acLevel * mult) >> SCALEBITS; +} +{.mib + (p[LL+8]) st2 [r18] = ac3[1] , 2 // LL+8 coeff[i] = ac3; + (cmp2[4]) add r8 = r8, ac2[7] // LL+8 sum += acLevel; + br.ctop.sptk.few .L58 + ;; +} +.default + mov ar.ec = r17 + ;; + mov ar.lc = r2 + mov pr = r10, -1 + mov ar.pfs = r9 + br.ret.sptk.many b0 + .endp quant_inter_ia64# + + + + + + + +// void dequant_inter_ia64(int16_t *data, const int16_t *coeff, const uint32_t quant) + + .common dequant_inter#,8,8 + .align 16 + .global dequant_inter_ia64# + .proc dequant_inter_ia64# +dequant_inter_ia64: + +//*********************************************************************** +// * +// const uint16_t quant_m_2 = quant << 1; * +// const uint16_t quant_add = (quant & 1 ? quant : quant - 1); * +// uint32_t i; * +// * +//*********************************************************************** * + + + + + .prologue + andcm r14 = 1, r34 + dep.z r29 = r34, 1, 15 + alloc r9=ar.pfs,0,32,0,32 + .save ar.lc, r2 + mov r2 = ar.lc + ;; + .body + sub r15 = r34, r14 // r15 = quant + addl r14 = 63, r0 + addl r21 = -2048, r0 + addl r20 = 2047, r0 + mov r16 = ar.ec + mov r17 = pr + ;; + zxt2 r15 = r15 + mov ar.lc = r14 + mov pr.rot = 0 + ;; + adds r14 = 0, r33 // r14 = coeff + mov r18 = r32 // r18 = data + mov ar.ec = LL+10 + mov pr.rot = 1 << 16 + ;; + +/******************************************************************************** + * * + *for (i = 0; i < 64; i++) { * + * int16_t acLevel = coeff[i]; * + * * + * if (acLevel == 0) * + * { * + * data[i] = 0; * + * } * + * else if (acLevel < 0) * + * { * + * acLevel = acLevel * quant_m_2 - quant_add; * + * data[i] = (acLevel >= -2048 ? acLevel : -2048); * + * } * + * else // if (acLevel > 0) * + * { * + * acLevel = acLevel * quant_m_2 + quant_add; * + * data[i] = (acLevel <= 2047 ? acLevel : 2047); * + * } * + * } * + * * + ********************************************************************************/ + + + + LL=2 // LL := load latency + + + .rotr ac1[LL+10], x[5], y1[3], y2[3] + .rotp p[LL+10] , cmp1neg[8], cmp2[5], cmp2neg[5],cmp3[2], cmp3neg[2] + +.explicit + //pipeline stage + +.L60: +{.mmi + (p[0])ld2 ac1[0] = [r14] ,2 // 0 acLevel = coeff[i]; + (p[LL+1])cmp4.ne p6, cmp1neg[0] = 0, ac1[LL+1] // LL+1 + (p[LL])sxt2 ac1[LL] = ac1[LL] // LL + +} +{.mmi + (p[LL+1])cmp4.le cmp2[0], cmp2neg[0] = r0, ac1[LL+1] // LL+1 + (cmp2[1]) mov x[0] = r20 // LL+2 + (p[LL+2])pmpyshr2.u ac1[LL+2] = r29, ac1[LL+2], 0 // LL+2 +} +{.mmi + (cmp2neg[1]) mov x[0] = r21 // LL+2 + (cmp2[2]) add ac1[LL+3] = ac1[LL+3], r15 // LL+3 + (cmp2neg[2]) sub ac1[LL+3] = ac1[LL+3], r15 // LL+3 + +} +{.mmi + (cmp2neg[4]) mov y1[0] = ac1[LL+5] // LL+5 + (cmp2neg[4]) mov y2[0] = x[3] // LL+5 + (p[LL+4])sxt2 ac1[LL+4] = ac1[LL+4] // LL+4 +} +{.mmi + (cmp2[4]) mov y1[0] = x[3] // LL+4 + (cmp2[4]) mov y2[0] = ac1[LL+5] // LL+4 + (p[LL+6])cmp4.le cmp3[0], cmp3neg[0] = x[4], ac1[LL+6] // LL+6 +} +{.mmi + (cmp3[1]) mov ac1[LL+7] = y1[2] // LL+7 + (cmp3neg[1]) mov ac1[LL+7] = y2[2] // LL+7 + (cmp1neg[7]) mov ac1[LL+8] = r0 // LL+8 +} +{.mbb + (p[LL+9])st2 [r18] = ac1[LL+9] ,2 // LL+9 + nop.b 0x0 + br.ctop.sptk.few .L60 + ;; +} +.default + mov ar.lc = r2 + mov ar.pfs = r9 + mov ar.ec = r16 + mov pr = r17, -1 + ;; + mov ar.lc = r2 + br.ret.sptk.many b0 + .endp dequant_inter_ia64# + .ident "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.1 2.96-85)"