1 |
.file "quant_h263.c" |
.file "quant_h263.1.c" |
2 |
.pred.safe_across_calls p1-p5,p16-p63 |
.pred.safe_across_calls p1-p5,p16-p63 |
3 |
.section .rodata |
.section .rodata |
4 |
.align 4 |
.align 4 |
43 |
.global quant_intra_ia64# |
.global quant_intra_ia64# |
44 |
.proc quant_intra_ia64# |
.proc quant_intra_ia64# |
45 |
quant_intra_ia64: |
quant_intra_ia64: |
46 |
.prologue //12, 37 |
.prologue 12, 37 |
47 |
.save ar.pfs, r38 |
.save ar.pfs, r38 |
48 |
alloc r38 = ar.pfs, 4, 3, 2, 0 |
alloc r38 = ar.pfs, 4, 3, 2, 0 |
49 |
adds r16 = -8, r12 |
adds r16 = -8, r12 |
72 |
ld4 r16 = [r16] |
ld4 r16 = [r16] |
73 |
;; |
;; |
74 |
setf.sig f2 = r16 |
setf.sig f2 = r16 |
75 |
(p6) br.cond.dptk .L4 |
(p6) br.cond.dptk .L8 |
76 |
extr r39 = r35, 1, 31 |
extr r39 = r35, 1, 31 |
77 |
sxt4 r40 = r35 |
sxt4 r40 = r35 |
78 |
;; |
;; |
79 |
add r39 = r39, r15 |
add r39 = r39, r15 |
80 |
br .L38 |
br .L21 |
81 |
;; |
;; |
82 |
.L4: |
.L8: |
83 |
extr r39 = r35, 1, 31 |
extr r39 = r35, 1, 31 |
84 |
sxt4 r40 = r35 |
sxt4 r40 = r35 |
85 |
;; |
;; |
86 |
sub r39 = r15, r39 |
sub r39 = r15, r39 |
87 |
;; |
;; |
88 |
.L38: |
.L21: |
89 |
sxt4 r39 = r39 |
sxt4 r39 = r39 |
90 |
br.call.sptk.many b0 = __divdi3# |
br.call.sptk.many b0 = __divdi3# |
91 |
;; |
;; |
92 |
addl r16 = 2, r0 |
addl r14 = 62, r0 |
93 |
st2 [r32] = r8 |
st2 [r32] = r8 |
94 |
addl r17 = 1, r0 |
addl r19 = 1, r0 |
|
;; |
|
|
add r14 = r33, r16 |
|
|
;; |
|
|
ld2 r15 = [r14] |
|
|
;; |
|
|
sxt2 r15 = r15 |
|
|
;; |
|
|
mov r14 = r15 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r14 |
|
|
(p6) br.cond.dptk .L21 |
|
|
sub r14 = r0, r14 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r36, r14 |
|
|
;; |
|
|
(p7) add r14 = r32, r16 |
|
|
(p6) add r15 = r32, r16 |
|
|
(p6) setf.sig f6 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f6 = f6, f2, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f6 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) sub r14 = r0, r14 |
|
|
br .L39 |
|
|
;; |
|
|
.L21: |
|
|
cmp4.le p6, p7 = r36, r14 |
|
|
;; |
|
|
(p7) add r14 = r32, r16 |
|
|
(p6) setf.sig f6 = r15 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f6 = f6, f2, f0 |
|
|
(p6) add r15 = r32, r16 |
|
|
;; |
|
|
(p6) getf.sig r14 = f6 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
.L39: |
|
|
//.pred.rel.mutex p6, p7 |
|
|
;; |
|
|
(p6) st2 [r15] = r14 |
|
|
adds r17 = 1, r17 |
|
|
;; |
|
|
cmp4.geu p6, p7 = 63, r17 |
|
|
(p7) br.cond.dptk .L16 |
|
|
addl r14 = 30, r0 |
|
95 |
;; |
;; |
96 |
mov ar.lc = r14 |
mov ar.lc = r14 |
97 |
;; |
;; |
98 |
.L37: |
.L20: |
99 |
dep.z r16 = r17, 1, 32 |
dep.z r17 = r19, 1, 32 |
|
;; |
|
|
add r14 = r16, r33 |
|
|
;; |
|
|
ld2 r15 = [r14] |
|
|
;; |
|
|
sxt2 r15 = r15 |
|
|
;; |
|
|
mov r14 = r15 |
|
100 |
;; |
;; |
101 |
cmp4.le p6, p7 = r0, r14 |
add r15 = r17, r33 |
102 |
(p6) br.cond.dptk .L27 |
adds r19 = 1, r19 |
|
sub r14 = r0, r14 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r36, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) add r15 = r16, r32 |
|
|
(p6) setf.sig f6 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f6 = f6, f2, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f6 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) sub r14 = r0, r14 |
|
|
br .L40 |
|
|
;; |
|
|
.L27: |
|
|
cmp4.le p6, p7 = r36, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) setf.sig f6 = r15 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f6 = f6, f2, f0 |
|
|
(p6) add r15 = r16, r32 |
|
|
;; |
|
|
(p6) getf.sig r14 = f6 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
.L40: |
|
|
//.pred.rel.mutex p6, p7 |
|
|
;; |
|
|
(p6) st2 [r15] = r14 |
|
|
adds r14 = 1, r17 |
|
|
;; |
|
|
dep.z r16 = r14, 1, 32 |
|
|
;; |
|
|
add r15 = r16, r33 |
|
103 |
;; |
;; |
104 |
ld2 r14 = [r15] |
ld2 r14 = [r15] |
105 |
;; |
;; |
106 |
sxt2 r14 = r14 |
sxt2 r14 = r14 |
107 |
;; |
;; |
108 |
mov r15 = r14 |
mov r16 = r14 |
109 |
;; |
mov r18 = r14 |
|
cmp4.le p6, p7 = r0, r15 |
|
|
(p6) br.cond.dptk .L33 |
|
|
sub r14 = r0, r15 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
mov r15 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r36, r15 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) setf.sig f6 = r15 |
|
110 |
;; |
;; |
111 |
(p7) st2 [r14] = r0 |
sub r15 = r0, r16 |
112 |
(p6) xma.l f6 = f6, f2, f0 |
cmp4.le p8, p9 = r36, r16 |
113 |
(p6) add r15 = r16, r32 |
cmp4.le p6, p7 = r0, r16 |
114 |
;; |
;; |
115 |
(p6) getf.sig r14 = f6 |
sxt2 r14 = r15 |
116 |
|
(p6) br.cond.dptk .L14 |
117 |
;; |
;; |
118 |
(p6) extr r14 = r14, 16, 16 |
mov r16 = r14 |
119 |
|
add r18 = r17, r32 |
120 |
;; |
;; |
121 |
(p6) sub r14 = r0, r14 |
setf.sig f6 = r16 |
122 |
br .L41 |
cmp4.le p6, p7 = r36, r16 |
123 |
.L33: |
mov r15 = r18 |
|
cmp4.le p6, p7 = r36, r15 |
|
124 |
;; |
;; |
125 |
(p7) add r14 = r16, r32 |
xma.l f6 = f6, f2, f0 |
126 |
(p6) add r15 = r16, r32 |
(p7) st2 [r18] = r0 |
|
(p6) setf.sig f6 = r14 |
|
127 |
;; |
;; |
128 |
(p7) st2 [r14] = r0 |
getf.sig r14 = f6 |
|
(p6) xma.l f6 = f6, f2, f0 |
|
129 |
;; |
;; |
130 |
(p6) getf.sig r14 = f6 |
extr r14 = r14, 16, 16 |
131 |
;; |
;; |
132 |
(p6) extr r14 = r14, 16, 16 |
sub r14 = r0, r14 |
|
.L41: |
|
|
//.pred.rel.mutex p6, p7 |
|
133 |
;; |
;; |
134 |
(p6) st2 [r15] = r14 |
(p6) st2 [r15] = r14 |
135 |
adds r17 = 2, r17 |
br .L12 |
136 |
br.cloop.sptk.few .L37 |
.L14: |
137 |
.L16: |
.pred.rel.mutex p8, p9 |
138 |
|
setf.sig f6 = r18 |
139 |
|
add r16 = r17, r32 |
140 |
|
;; |
141 |
|
xma.l f6 = f6, f2, f0 |
142 |
|
mov r15 = r16 |
143 |
|
(p9) st2 [r16] = r0 |
144 |
|
;; |
145 |
|
getf.sig r14 = f6 |
146 |
|
;; |
147 |
|
extr r14 = r14, 16, 16 |
148 |
|
;; |
149 |
|
(p8) st2 [r15] = r14 |
150 |
|
.L12: |
151 |
|
br.cloop.sptk.few .L20 |
152 |
adds r18 = 24, r12 |
adds r18 = 24, r12 |
153 |
;; |
;; |
154 |
ld8 r19 = [r18], 8 |
ld8 r19 = [r18], 8 |
161 |
adds r12 = 32, r12 |
adds r12 = 32, r12 |
162 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
163 |
.endp quant_intra_ia64# |
.endp quant_intra_ia64# |
|
.align 16 |
|
|
.global quant_inter_ia64# |
|
|
.proc quant_inter_ia64# |
|
|
quant_inter_ia64: |
|
|
.prologue |
|
|
addl r14 = @ltoff(multipliers#), gp |
|
|
dep.z r15 = r34, 2, 32 |
|
|
.save ar.lc, r2 |
|
|
mov r2 = ar.lc |
|
|
;; |
|
|
.body |
|
|
ld8 r14 = [r14] |
|
|
extr.u r16 = r34, 1, 16 |
|
|
dep.z r17 = r34, 1, 15 |
|
|
;; |
|
|
add r15 = r15, r14 |
|
|
mov r18 = r16 |
|
|
mov r8 = r0 |
|
|
;; |
|
|
ld4 r15 = [r15] |
|
|
addl r14 = 31, r0 |
|
|
mov r19 = r0 |
|
|
;; |
|
|
setf.sig f6 = r15 |
|
|
mov ar.lc = r14 |
|
|
;; |
|
|
.L65: |
|
|
dep.z r16 = r19, 1, 32 |
|
|
;; |
|
|
add r14 = r16, r33 |
|
|
;; |
|
|
ld2 r15 = [r14] |
|
|
;; |
|
|
sxt2 r15 = r15 |
|
|
;; |
|
|
mov r14 = r15 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r14 |
|
|
(p6) br.cond.dptk .L55 |
|
|
sub r14 = r0, r14 |
|
|
;; |
|
|
sub r14 = r14, r18 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r17, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) setf.sig f7 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) add r16 = r16, r32 |
|
|
(p6) xma.l f7 = f7, f6, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f7 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) sub r15 = r0, r14 |
|
|
(p6) add r8 = r8, r14 |
|
|
;; |
|
|
(p6) st2 [r16] = r15 |
|
|
br .L53 |
|
|
.L55: |
|
|
sub r14 = r14, r18 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r17, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) add r15 = r16, r32 |
|
|
(p6) setf.sig f7 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f7 = f7, f6, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f7 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) st2 [r15] = r14 |
|
|
(p6) add r8 = r8, r14 |
|
|
.L53: |
|
|
adds r14 = 1, r19 |
|
|
;; |
|
|
dep.z r16 = r14, 1, 32 |
|
|
;; |
|
|
add r15 = r16, r33 |
|
|
;; |
|
|
ld2 r14 = [r15] |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r14 |
|
|
(p6) br.cond.dptk .L61 |
|
|
sub r14 = r0, r14 |
|
|
;; |
|
|
sub r14 = r14, r18 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r17, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) setf.sig f7 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) add r16 = r16, r32 |
|
|
(p6) xma.l f7 = f7, f6, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f7 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) sub r15 = r0, r14 |
|
|
(p6) add r8 = r8, r14 |
|
|
;; |
|
|
(p6) st2 [r16] = r15 |
|
|
br .L59 |
|
|
.L61: |
|
|
sub r14 = r14, r18 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r17, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) add r15 = r16, r32 |
|
|
(p6) setf.sig f7 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f7 = f7, f6, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f7 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) st2 [r15] = r14 |
|
|
(p6) add r8 = r8, r14 |
|
|
.L59: |
|
|
adds r19 = 2, r19 |
|
|
br.cloop.sptk.few .L65 |
|
|
;; |
|
|
mov ar.lc = r2 |
|
|
br.ret.sptk.many b0 |
|
|
.endp quant_inter_ia64# |
|
164 |
.common quant_intra#,8,8 |
.common quant_intra#,8,8 |
165 |
.common dequant_intra#,8,8 |
.common dequant_intra#,8,8 |
166 |
.align 16 |
.align 16 |
194 |
cmp4.le p6, p7 = r16, r15 |
cmp4.le p6, p7 = r16, r15 |
195 |
;; |
;; |
196 |
(p7) st2 [r32] = r16 |
(p7) st2 [r32] = r16 |
197 |
(p7) br.cond.dptk .L68 |
(p7) br.cond.dptk .L32 |
198 |
addl r14 = 2047, r0 |
addl r14 = 2047, r0 |
199 |
;; |
;; |
200 |
cmp4.ge p6, p7 = r14, r15 |
cmp4.ge p6, p7 = r14, r15 |
201 |
;; |
;; |
202 |
(p7) st2 [r32] = r14 |
(p7) st2 [r32] = r14 |
203 |
.L68: |
.L32: |
204 |
addl r14 = 20, r0 |
addl r14 = 62, r0 |
205 |
addl r19 = 1, r0 |
addl r19 = 1, r0 |
206 |
addl r21 = 2048, r0 |
addl r22 = 2048, r0 |
207 |
addl r20 = -2048, r0 |
addl r21 = -2048, r0 |
208 |
addl r18 = 2047, r0 |
addl r20 = 2047, r0 |
209 |
;; |
;; |
210 |
mov ar.lc = r14 |
mov ar.lc = r14 |
211 |
;; |
;; |
212 |
.L110: |
.L56: |
213 |
dep.z r16 = r19, 1, 32 |
dep.z r16 = r19, 1, 32 |
214 |
;; |
;; |
215 |
add r14 = r16, r33 |
add r14 = r16, r33 |
216 |
|
add r17 = r16, r32 |
217 |
|
adds r19 = 1, r19 |
218 |
;; |
;; |
219 |
ld2 r15 = [r14] |
ld2 r15 = [r14] |
220 |
;; |
;; |
221 |
sxt2 r15 = r15 |
sxt2 r15 = r15 |
222 |
;; |
;; |
223 |
cmp4.ne p6, p7 = 0, r15 |
cmp4.ne p6, p7 = 0, r15 |
224 |
|
cmp4.le p8, p9 = r0, r15 |
225 |
;; |
;; |
226 |
(p7) add r14 = r16, r32 |
(p7) st2 [r17] = r0 |
227 |
;; |
(p7) br.cond.dpnt .L36 |
228 |
(p7) st2 [r14] = r0 |
add r18 = r16, r32 |
229 |
(p7) br.cond.dpnt .L92 |
sub r17 = r0, r15 |
230 |
cmp4.le p6, p7 = r0, r15 |
;; |
231 |
(p6) br.cond.dptk .L95 |
mov r14 = r18 |
232 |
sub r14 = r0, r15 |
(p8) br.cond.dptk .L40 |
233 |
add r17 = r16, r32 |
setf.sig f8 = r17 |
|
;; |
|
|
setf.sig f8 = r14 |
|
234 |
;; |
;; |
235 |
xma.l f8 = f6, f8, f7 |
xma.l f8 = f6, f8, f7 |
236 |
;; |
;; |
237 |
getf.sig r15 = f8 |
getf.sig r15 = f8 |
238 |
;; |
;; |
239 |
cmp4.lt p6, p7 = r21, r15 |
cmp4.lt p6, p7 = r22, r15 |
240 |
;; |
sub r16 = r0, r15 |
|
(p7) sub r14 = r0, r15 |
|
241 |
;; |
;; |
242 |
(p7) st2 [r17] = r14 |
(p7) st2 [r14] = r16 |
243 |
(p6) st2 [r17] = r20 |
(p6) st2 [r14] = r21 |
244 |
br .L92 |
br .L36 |
245 |
.L95: |
.L40: |
246 |
setf.sig f8 = r15 |
setf.sig f8 = r15 |
|
add r14 = r16, r32 |
|
247 |
;; |
;; |
248 |
xma.l f8 = f6, f8, f7 |
xma.l f8 = f6, f8, f7 |
249 |
;; |
;; |
250 |
getf.sig r15 = f8 |
getf.sig r15 = f8 |
251 |
;; |
;; |
252 |
cmp4.le p6, p7 = r18, r15 |
cmp4.le p6, p7 = r20, r15 |
|
;; |
|
|
(p6) mov r15 = r18 |
|
|
;; |
|
|
st2 [r14] = r15 |
|
|
.L92: |
|
|
adds r14 = 1, r19 |
|
|
;; |
|
|
dep.z r17 = r14, 1, 32 |
|
|
;; |
|
|
add r15 = r17, r33 |
|
|
;; |
|
|
ld2 r14 = [r15] |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
mov r16 = r14 |
|
|
;; |
|
|
cmp4.ne p6, p7 = 0, r16 |
|
|
;; |
|
|
(p7) add r14 = r17, r32 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p7) br.cond.dpnt .L98 |
|
|
cmp4.le p6, p7 = r0, r16 |
|
|
(p6) br.cond.dptk .L101 |
|
|
sub r14 = r0, r16 |
|
|
add r17 = r17, r32 |
|
|
;; |
|
|
setf.sig f8 = r14 |
|
|
;; |
|
|
xma.l f8 = f6, f8, f7 |
|
|
;; |
|
|
getf.sig r16 = f8 |
|
|
;; |
|
|
cmp4.lt p6, p7 = r21, r16 |
|
253 |
;; |
;; |
254 |
(p7) sub r14 = r0, r16 |
(p6) mov r14 = r20 |
255 |
|
(p7) mov r14 = r15 |
256 |
;; |
;; |
257 |
(p7) st2 [r17] = r14 |
st2 [r18] = r14 |
258 |
(p6) st2 [r17] = r20 |
.L36: |
259 |
br .L98 |
br.cloop.sptk.few .L56 |
|
.L101: |
|
|
setf.sig f8 = r16 |
|
|
add r14 = r17, r32 |
|
260 |
;; |
;; |
261 |
xma.l f8 = f6, f8, f7 |
mov ar.lc = r2 |
262 |
;; |
br.ret.sptk.many b0 |
263 |
getf.sig r16 = f8 |
.endp dequant_intra_ia64# |
264 |
;; |
|
265 |
cmp4.le p6, p7 = r18, r16 |
|
266 |
;; |
|
267 |
(p6) mov r15 = r18 |
//uint32_t quant_inter_ia64(int16_t *coeff, const int16_t *data, const uint32_t quant) |
268 |
(p7) mov r15 = r16 |
|
269 |
;; |
|
270 |
st2 [r14] = r15 |
|
271 |
.L98: |
.common quant_inter#,8,8 |
272 |
adds r14 = 2, r19 |
.align 16 |
273 |
;; |
.global quant_inter_ia64# |
274 |
dep.z r17 = r14, 1, 32 |
.proc quant_inter_ia64# |
275 |
;; |
quant_inter_ia64: |
276 |
add r15 = r17, r33 |
|
277 |
;; |
|
278 |
ld2 r14 = [r15] |
/******************************************************** |
279 |
;; |
* * |
280 |
sxt2 r14 = r14 |
* const uint32_t mult = multipliers[quant]; * |
281 |
;; |
* const uint16_t quant_m_2 = quant << 1; * |
282 |
mov r16 = r14 |
* const uint16_t quant_d_2 = quant >> 1; * |
283 |
;; |
* int sum = 0; * |
284 |
cmp4.ne p6, p7 = 0, r16 |
* uint32_t i; * |
285 |
;; |
* int16_t acLevel,acL; * |
286 |
(p7) add r14 = r17, r32 |
* * |
287 |
;; |
********************************************************/ |
288 |
(p7) st2 [r14] = r0 |
|
289 |
(p7) br.cond.dpnt .L104 |
|
290 |
cmp4.le p6, p7 = r0, r16 |
|
291 |
(p6) br.cond.dptk .L107 |
LL=3 // LL = load latency |
292 |
sub r14 = r0, r16 |
|
293 |
add r17 = r17, r32 |
.prologue |
294 |
;; |
addl r14 = @ltoff(multipliers#), gp |
295 |
setf.sig f8 = r14 |
dep.z r15 = r34, 2, 32 |
296 |
;; |
.save ar.lc, r2 |
297 |
xma.l f8 = f6, f8, f7 |
mov r2 = ar.lc |
|
;; |
|
|
getf.sig r16 = f8 |
|
|
;; |
|
|
cmp4.lt p6, p7 = r21, r16 |
|
|
;; |
|
|
(p7) sub r14 = r0, r16 |
|
298 |
;; |
;; |
299 |
(p7) st2 [r17] = r14 |
.body |
300 |
(p6) st2 [r17] = r20 |
alloc r9=ar.pfs,0,24,0,24 |
301 |
br .L104 |
mov r17 = ar.ec |
302 |
.L107: |
mov r10 = pr |
303 |
setf.sig f8 = r16 |
ld8 r14 = [r14] |
304 |
add r14 = r17, r32 |
extr.u r16 = r34, 1, 16 //r16 = quant_d_2 |
305 |
|
dep.z r20 = r34, 1, 15 //r20 = quant_m_2 |
306 |
;; |
;; |
307 |
xma.l f8 = f6, f8, f7 |
add r15 = r15, r14 |
308 |
|
mov r21 = r16 //r21 = quant_d_2 |
309 |
|
mov r8 = r0 //r8 = sum = 0 |
310 |
|
mov pr.rot = 0 //p16-p63 = 0 |
311 |
;; |
;; |
312 |
getf.sig r16 = f8 |
ld4 r15 = [r15] |
313 |
|
addl r14 = 63, r0 |
314 |
|
mov pr.rot = 1 << 16 //p16=1 |
315 |
;; |
;; |
316 |
cmp4.le p6, p7 = r18, r16 |
mov ar.lc = r14 |
317 |
|
mov ar.ec = LL+9 |
318 |
|
mov r29 = r15 |
319 |
;; |
;; |
320 |
(p6) mov r15 = r18 |
mov r15 = r33 //r15 = data |
321 |
(p7) mov r15 = r16 |
mov r18 = r32 //r18 = coeff |
322 |
;; |
;; |
323 |
st2 [r14] = r15 |
|
324 |
.L104: |
|
325 |
adds r19 = 3, r19 |
.rotr ac1[LL+3], ac2[8], ac3[2] |
326 |
br.cloop.sptk.few .L110 |
.rotp p[LL+9], cmp1[8], cmp1neg[8],cmp2[5], cmp2neg[2] |
327 |
|
|
328 |
|
|
329 |
|
|
330 |
|
/******************************************************************************** |
331 |
|
* * |
332 |
|
* for (i = 0; i < 64; i++) { * |
333 |
|
* acL=acLevel = data[i]; * |
334 |
|
* acLevel = ((acLevel < 0)?-acLevel:acLevel) - quant_d_2; * |
335 |
|
* if (acLevel < quant_m_2){ * |
336 |
|
* acLevel = 0; * |
337 |
|
* } * |
338 |
|
* acLevel = (acLevel * mult) >> SCALEBITS; * |
339 |
|
* sum += acLevel; * |
340 |
|
* coeff[i] = ((acL < 0)?-acLevel:acLevel); * |
341 |
|
* } * |
342 |
|
* * |
343 |
|
********************************************************************************/ |
344 |
|
|
345 |
|
|
346 |
|
|
347 |
|
.explicit |
348 |
|
.L58: |
349 |
|
//pipeline stage |
350 |
|
{.mmi |
351 |
|
(p[0]) ld2 ac1[0] = [r15],2 // 0 acL=acLevel = data[i]; |
352 |
|
(p[LL+1]) sub ac2[0] = r0, ac1[LL+1] // LL+1 ac2=-acLevel |
353 |
|
(p[LL]) sxt2 ac1[LL] = ac1[LL] // LL |
354 |
|
} |
355 |
|
{.mmi |
356 |
|
(p[LL+1]) cmp4.le cmp1[0], cmp1neg[0] = r0, ac1[LL+1] // LL+1 cmp1 = (0<=acLevel) ; cmp1neg = !(0<=acLevel) |
357 |
|
(p[LL+4]) cmp4.le cmp2[0], cmp2neg[0] = r20, ac2[3] // LL+4 cmp2 = (quant_m_2 < acLevel) ; cmp2neg = !(quant_m_2 < acLevel) |
358 |
|
(cmp1[1]) sub ac2[1] = ac1[LL+2], r21 // LL+2 acLevel = acLevel - quant_d_2; |
359 |
|
} |
360 |
|
{.mmi |
361 |
|
(cmp2neg[1]) mov ac2[4] = r0 // LL+5 if (acLevel < quant_m_2) acLevel=0; |
362 |
|
(cmp1neg[1]) sub ac2[1] = ac2[1], r21 // LL+2 acLevel = ac2 - quant_d_2; |
363 |
|
(p[LL+3]) sxt2 ac2[2] = ac2[2] // LL+3 |
364 |
|
} |
365 |
|
{.mmi |
366 |
|
(cmp1[6]) mov ac3[0] = ac2[6] // LL+7 ac3 = acLevel; |
367 |
|
(cmp1neg[6]) sub ac3[0] = r0, ac2[6] // LL+7 ac3 = -acLevel; |
368 |
|
(p[LL+6]) pmpyshr2.u ac2[5] = r29, ac2[5], 16 // LL+6 acLevel = (acLevel * mult) >> SCALEBITS; |
369 |
|
} |
370 |
|
{.mib |
371 |
|
(p[LL+8]) st2 [r18] = ac3[1] , 2 // LL+8 coeff[i] = ac3; |
372 |
|
(cmp2[4]) add r8 = r8, ac2[7] // LL+8 sum += acLevel; |
373 |
|
br.ctop.sptk.few .L58 |
374 |
|
;; |
375 |
|
} |
376 |
|
.default |
377 |
|
mov ar.ec = r17 |
378 |
;; |
;; |
379 |
mov ar.lc = r2 |
mov ar.lc = r2 |
380 |
|
mov pr = r10, -1 |
381 |
|
mov ar.pfs = r9 |
382 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
383 |
.endp dequant_intra_ia64# |
.endp quant_inter_ia64# |
384 |
.common quant_inter#,8,8 |
|
385 |
|
|
386 |
|
|
387 |
|
|
388 |
|
|
389 |
|
|
390 |
|
|
391 |
|
// void dequant_inter_ia64(int16_t *data, const int16_t *coeff, const uint32_t quant) |
392 |
|
|
393 |
.common dequant_inter#,8,8 |
.common dequant_inter#,8,8 |
394 |
.align 16 |
.align 16 |
395 |
.global dequant_inter_ia64# |
.global dequant_inter_ia64# |
396 |
.proc dequant_inter_ia64# |
.proc dequant_inter_ia64# |
397 |
dequant_inter_ia64: |
dequant_inter_ia64: |
398 |
|
|
399 |
|
//*********************************************************************** |
400 |
|
// * |
401 |
|
// const uint16_t quant_m_2 = quant << 1; * |
402 |
|
// const uint16_t quant_add = (quant & 1 ? quant : quant - 1); * |
403 |
|
// uint32_t i; * |
404 |
|
// * |
405 |
|
//*********************************************************************** * |
406 |
|
|
407 |
|
|
408 |
|
|
409 |
|
|
410 |
.prologue |
.prologue |
411 |
andcm r14 = 1, r34 |
andcm r14 = 1, r34 |
412 |
dep.z r15 = r34, 1, 15 |
dep.z r29 = r34, 1, 15 |
413 |
|
alloc r9=ar.pfs,0,32,0,32 |
414 |
.save ar.lc, r2 |
.save ar.lc, r2 |
415 |
mov r2 = ar.lc |
mov r2 = ar.lc |
416 |
;; |
;; |
417 |
.body |
.body |
418 |
sub r34 = r34, r14 |
sub r15 = r34, r14 // r15 = quant |
419 |
setf.sig f6 = r15 |
addl r14 = 63, r0 |
420 |
mov r19 = r0 |
addl r21 = -2048, r0 |
421 |
addl r14 = 31, r0 |
addl r20 = 2047, r0 |
422 |
addl r18 = -2048, r0 |
mov r16 = ar.ec |
423 |
addl r17 = 2047, r0 |
mov r17 = pr |
424 |
;; |
;; |
425 |
zxt2 r34 = r34 |
zxt2 r15 = r15 |
426 |
mov ar.lc = r14 |
mov ar.lc = r14 |
427 |
|
mov pr.rot = 0 |
428 |
;; |
;; |
429 |
.L122: |
adds r14 = 0, r33 // r14 = coeff |
430 |
dep.z r16 = r19, 1, 32 |
mov r18 = r32 // r18 = data |
431 |
;; |
mov ar.ec = LL+10 |
432 |
add r14 = r16, r33 |
mov pr.rot = 1 << 16 |
433 |
;; |
;; |
434 |
ld2 r15 = [r14] |
|
435 |
;; |
/******************************************************************************** |
436 |
sxt2 r15 = r15 |
* * |
437 |
;; |
*for (i = 0; i < 64; i++) { * |
438 |
mov r14 = r15 |
* int16_t acLevel = coeff[i]; * |
439 |
;; |
* * |
440 |
cmp4.ne p6, p7 = 0, r14 |
* if (acLevel == 0) * |
441 |
;; |
* { * |
442 |
(p7) add r14 = r16, r32 |
* data[i] = 0; * |
443 |
;; |
* } * |
444 |
(p7) st2 [r14] = r0 |
* else if (acLevel < 0) * |
445 |
(p7) br.cond.dpnt .L112 |
* { * |
446 |
cmp4.le p6, p7 = r0, r14 |
* acLevel = acLevel * quant_m_2 - quant_add; * |
447 |
(p6) br.cond.dptk .L115 |
* data[i] = (acLevel >= -2048 ? acLevel : -2048); * |
448 |
setf.sig f7 = r14 |
* } * |
449 |
add r15 = r16, r32 |
* else // if (acLevel > 0) * |
450 |
;; |
* { * |
451 |
xma.l f7 = f7, f6, f0 |
* acLevel = acLevel * quant_m_2 + quant_add; * |
452 |
;; |
* data[i] = (acLevel <= 2047 ? acLevel : 2047); * |
453 |
getf.sig r14 = f7 |
* } * |
454 |
;; |
* } * |
455 |
sub r14 = r14, r34 |
* * |
456 |
;; |
********************************************************************************/ |
457 |
sxt2 r14 = r14 |
|
458 |
;; |
|
459 |
cmp4.le p6, p7 = r18, r14 |
|
460 |
;; |
LL=2 // LL := load latency |
461 |
(p7) mov r14 = r18 |
|
462 |
br .L123 |
|
463 |
.L115: |
.rotr ac1[LL+10], x[5], y1[3], y2[3] |
464 |
setf.sig f8 = r15 |
.rotp p[LL+10] , cmp1neg[8], cmp2[5], cmp2neg[5],cmp3[2], cmp3neg[2] |
465 |
setf.sig f7 = r34 |
|
466 |
;; |
.explicit |
467 |
xma.l f8 = f8, f6, f7 |
//pipeline stage |
468 |
add r15 = r16, r32 |
|
469 |
;; |
.L60: |
470 |
getf.sig r14 = f8 |
{.mmi |
471 |
|
(p[0])ld2 ac1[0] = [r14] ,2 // 0 acLevel = coeff[i]; |
472 |
|
(p[LL+1])cmp4.ne p6, cmp1neg[0] = 0, ac1[LL+1] // LL+1 |
473 |
|
(p[LL])sxt2 ac1[LL] = ac1[LL] // LL |
474 |
|
|
475 |
|
} |
476 |
|
{.mmi |
477 |
|
(p[LL+1])cmp4.le cmp2[0], cmp2neg[0] = r0, ac1[LL+1] // LL+1 |
478 |
|
(cmp2[1]) mov x[0] = r20 // LL+2 |
479 |
|
(p[LL+2])pmpyshr2.u ac1[LL+2] = r29, ac1[LL+2], 0 // LL+2 |
480 |
|
} |
481 |
|
{.mmi |
482 |
|
(cmp2neg[1]) mov x[0] = r21 // LL+2 |
483 |
|
(cmp2[2]) add ac1[LL+3] = ac1[LL+3], r15 // LL+3 |
484 |
|
(cmp2neg[2]) sub ac1[LL+3] = ac1[LL+3], r15 // LL+3 |
485 |
|
|
486 |
|
} |
487 |
|
{.mmi |
488 |
|
(cmp2neg[4]) mov y1[0] = ac1[LL+5] // LL+5 |
489 |
|
(cmp2neg[4]) mov y2[0] = x[3] // LL+5 |
490 |
|
(p[LL+4])sxt2 ac1[LL+4] = ac1[LL+4] // LL+4 |
491 |
|
} |
492 |
|
{.mmi |
493 |
|
(cmp2[4]) mov y1[0] = x[3] // LL+4 |
494 |
|
(cmp2[4]) mov y2[0] = ac1[LL+5] // LL+4 |
495 |
|
(p[LL+6])cmp4.le cmp3[0], cmp3neg[0] = x[4], ac1[LL+6] // LL+6 |
496 |
|
} |
497 |
|
{.mmi |
498 |
|
(cmp3[1]) mov ac1[LL+7] = y1[2] // LL+7 |
499 |
|
(cmp3neg[1]) mov ac1[LL+7] = y2[2] // LL+7 |
500 |
|
(cmp1neg[7]) mov ac1[LL+8] = r0 // LL+8 |
501 |
|
} |
502 |
|
{.mbb |
503 |
|
(p[LL+9])st2 [r18] = ac1[LL+9] ,2 // LL+9 |
504 |
|
nop.b 0x0 |
505 |
|
br.ctop.sptk.few .L60 |
506 |
;; |
;; |
507 |
sxt2 r14 = r14 |
} |
508 |
;; |
.default |
509 |
cmp4.le p6, p7 = r17, r14 |
mov ar.lc = r2 |
510 |
;; |
mov ar.pfs = r9 |
511 |
(p6) mov r14 = r17 |
mov ar.ec = r16 |
512 |
;; |
mov pr = r17, -1 |
|
.L123: |
|
|
st2 [r15] = r14 |
|
|
.L112: |
|
|
adds r14 = 1, r19 |
|
|
;; |
|
|
dep.z r16 = r14, 1, 32 |
|
|
;; |
|
|
add r15 = r16, r33 |
|
|
;; |
|
|
ld2 r14 = [r15] |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
mov r15 = r14 |
|
|
;; |
|
|
cmp4.ne p6, p7 = 0, r15 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p7) br.cond.dpnt .L117 |
|
|
cmp4.le p6, p7 = r0, r15 |
|
|
(p6) br.cond.dptk .L120 |
|
|
setf.sig f8 = r15 |
|
|
;; |
|
|
xma.l f8 = f8, f6, f0 |
|
|
add r15 = r16, r32 |
|
|
;; |
|
|
getf.sig r14 = f8 |
|
|
;; |
|
|
sub r14 = r14, r34 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r18, r14 |
|
|
;; |
|
|
(p7) mov r14 = r18 |
|
|
br .L124 |
|
|
;; |
|
|
.L120: |
|
|
setf.sig f7 = r14 |
|
|
setf.sig f8 = r34 |
|
|
add r15 = r16, r32 |
|
|
;; |
|
|
xma.l f7 = f7, f6, f8 |
|
|
;; |
|
|
getf.sig r14 = f7 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r17, r14 |
|
|
;; |
|
|
(p6) mov r14 = r17 |
|
|
;; |
|
|
.L124: |
|
|
st2 [r15] = r14 |
|
|
.L117: |
|
|
adds r19 = 2, r19 |
|
|
br.cloop.sptk.few .L122 |
|
513 |
;; |
;; |
514 |
mov ar.lc = r2 |
mov ar.lc = r2 |
515 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |