--- trunk/xvidcore/src/quant/ia64_asm/quant_h263_ia64.s 2002/06/30 10:46:29 252 +++ trunk/xvidcore/src/quant/ia64_asm/quant_h263_ia64.s 2002/07/01 13:50:39 253 @@ -1,4 +1,4 @@ - .file "quant_h263.1.c" + .file "quant_h263_ia64.s" .pred.safe_across_calls p1-p5,p16-p63 .section .rodata .align 4 @@ -43,7 +43,7 @@ .global quant_intra_ia64# .proc quant_intra_ia64# quant_intra_ia64: - .prologue 12, 37 + .prologue .save ar.pfs, r38 alloc r38 = ar.pfs, 4, 3, 2, 0 adds r16 = -8, r12 @@ -134,7 +134,7 @@ (p6) st2 [r15] = r14 br .L12 .L14: - .pred.rel.mutex p8, p9 + .pred.rel "mutex", p8, p9 setf.sig f6 = r18 add r16 = r17, r32 ;; @@ -275,21 +275,21 @@ quant_inter_ia64: -/******************************************************** - * * - * const uint32_t mult = multipliers[quant]; * - * const uint16_t quant_m_2 = quant << 1; * - * const uint16_t quant_d_2 = quant >> 1; * - * int sum = 0; * - * uint32_t i; * - * int16_t acLevel,acL; * - * * - ********************************************************/ +//******************************************************* +//* * +//* const uint32_t mult = multipliers[quant]; * +//* const uint16_t quant_m_2 = quant << 1; * +//* const uint16_t quant_d_2 = quant >> 1; * +//* int sum = 0; * +//* uint32_t i; * +//* int16_t acLevel,acL; * +//* * +//*******************************************************/ LL=3 // LL = load latency - + //if LL is changed, you'll also have to change the .pred.rel... parts below! .prologue addl r14 = @ltoff(multipliers#), gp dep.z r15 = r34, 2, 32 @@ -327,25 +327,28 @@ -/******************************************************************************** - * * - * for (i = 0; i < 64; i++) { * - * acL=acLevel = data[i]; * - * acLevel = ((acLevel < 0)?-acLevel:acLevel) - quant_d_2; * - * if (acLevel < quant_m_2){ * - * acLevel = 0; * - * } * - * acLevel = (acLevel * mult) >> SCALEBITS; * - * sum += acLevel; * - * coeff[i] = ((acL < 0)?-acLevel:acLevel); * - * } * - * * - ********************************************************************************/ +//******************************************************************************* +//* * +//* for (i = 0; i < 64; i++) { * +//* acL=acLevel = data[i]; * +//* acLevel = ((acLevel < 0)?-acLevel:acLevel) - quant_d_2; * +//* if (acLevel < quant_m_2){ * +//* acLevel = 0; * +//* } * +//* acLevel = (acLevel * mult) >> SCALEBITS; * +//* sum += acLevel; * +//* coeff[i] = ((acL < 0)?-acLevel:acLevel); * +//* } * +//* * +//*******************************************************************************/ .explicit .L58: + .pred.rel "clear", p29, p37 + .pred.rel "mutex", p29, p37 + //pipeline stage {.mmi (p[0]) ld2 ac1[0] = [r15],2 // 0 acL=acLevel = data[i]; @@ -363,6 +366,7 @@ (p[LL+3]) sxt2 ac2[2] = ac2[2] // LL+3 } {.mmi + .pred.rel "mutex", p34, p42 (cmp1[6]) mov ac3[0] = ac2[6] // LL+7 ac3 = acLevel; (cmp1neg[6]) sub ac3[0] = r0, ac2[6] // LL+7 ac3 = -acLevel; (p[LL+6]) pmpyshr2.u ac2[5] = r29, ac2[5], 16 // LL+6 acLevel = (acLevel * mult) >> SCALEBITS; @@ -373,6 +377,8 @@ br.ctop.sptk.few .L58 ;; } + + .pred.rel "clear", p29, p37 .default mov ar.ec = r17 ;; @@ -397,12 +403,12 @@ dequant_inter_ia64: //*********************************************************************** -// * -// const uint16_t quant_m_2 = quant << 1; * -// const uint16_t quant_add = (quant & 1 ? quant : quant - 1); * -// uint32_t i; * -// * -//*********************************************************************** * +//* * +//* const uint16_t quant_m_2 = quant << 1; * +//* const uint16_t quant_add = (quant & 1 ? quant : quant - 1); * +//* uint32_t i; * +//* * +//*********************************************************************** @@ -432,32 +438,33 @@ mov pr.rot = 1 << 16 ;; -/******************************************************************************** - * * - *for (i = 0; i < 64; i++) { * - * int16_t acLevel = coeff[i]; * - * * - * if (acLevel == 0) * - * { * - * data[i] = 0; * - * } * - * else if (acLevel < 0) * - * { * - * acLevel = acLevel * quant_m_2 - quant_add; * - * data[i] = (acLevel >= -2048 ? acLevel : -2048); * - * } * - * else // if (acLevel > 0) * - * { * - * acLevel = acLevel * quant_m_2 + quant_add; * - * data[i] = (acLevel <= 2047 ? acLevel : 2047); * - * } * - * } * - * * - ********************************************************************************/ +//******************************************************************************* +//* * +//*for (i = 0; i < 64; i++) { * +//* int16_t acLevel = coeff[i]; * +//* * +//* if (acLevel == 0) * +//* { * +//* data[i] = 0; * +//* } * +//* else if (acLevel < 0) * +//* { * +//* acLevel = acLevel * quant_m_2 - quant_add; * +//* data[i] = (acLevel >= -2048 ? acLevel : -2048); * +//* } * +//* else // if (acLevel > 0) * +//* { * +//* acLevel = acLevel * quant_m_2 + quant_add; * +//* data[i] = (acLevel <= 2047 ? acLevel : 2047); * +//* } * +//* } * +//* * +//*******************************************************************************/ LL=2 // LL := load latency + //if LL is changed, you'll also have to change the .pred.rel... parts below! .rotr ac1[LL+10], x[5], y1[3], y2[3] @@ -467,6 +474,14 @@ //pipeline stage .L60: + .pred.rel "clear", p36 + .pred.rel "mutex", p47, p49 + .pred.rel "mutex", p46, p48 + .pred.rel "mutex", p40, p45 + .pred.rel "mutex", p39, p44 + .pred.rel "mutex", p38, p43 + .pred.rel "mutex", p37, p42 + .pred.rel "mutex", p36, p41 {.mmi (p[0])ld2 ac1[0] = [r14] ,2 // 0 acLevel = coeff[i]; (p[LL+1])cmp4.ne p6, cmp1neg[0] = 0, ac1[LL+1] // LL+1 @@ -490,8 +505,8 @@ (p[LL+4])sxt2 ac1[LL+4] = ac1[LL+4] // LL+4 } {.mmi - (cmp2[4]) mov y1[0] = x[3] // LL+4 - (cmp2[4]) mov y2[0] = ac1[LL+5] // LL+4 + (cmp2[4]) mov y1[0] = x[3] // LL+5 + (cmp2[4]) mov y2[0] = ac1[LL+5] // LL+5 (p[LL+6])cmp4.le cmp3[0], cmp3neg[0] = x[4], ac1[LL+6] // LL+6 } {.mmi @@ -505,6 +520,7 @@ br.ctop.sptk.few .L60 ;; } + .pred.rel "clear", p36 .default mov ar.lc = r2 mov ar.pfs = r9