1 |
.file "quant_h263.c" |
//******************************************************************************* |
2 |
|
//* * |
3 |
|
//* functions quant_inter and dequant_inter have been software-pipelined * |
4 |
|
//* use was made of the pmpyshr2 instruction * |
5 |
|
//* * |
6 |
|
//* by Christian Engel and Hans-Joachim Daniels * |
7 |
|
//* christian.engel@ira.uka.de hans-joachim.daniels@ira.uka.de * |
8 |
|
//* * |
9 |
|
//* This was made for the ia64 DivX laboratory (yes, it was really called * |
10 |
|
//* this way, originally OpenDivX was intended, but died shortly before our * |
11 |
|
//* work started (you will probably already know ...)) * |
12 |
|
//* at the Universität Karlsruhe (TH) held between April and July 2002 * |
13 |
|
//* http://www.info.uni-karlsruhe.de/~rubino/ia64p/ * |
14 |
|
//* * |
15 |
|
//******************************************************************************* |
16 |
|
.file "quant_h263_ia64.s" |
17 |
.pred.safe_across_calls p1-p5,p16-p63 |
.pred.safe_across_calls p1-p5,p16-p63 |
18 |
.section .rodata |
.section .rodata |
19 |
.align 4 |
.align 4 |
58 |
.global quant_intra_ia64# |
.global quant_intra_ia64# |
59 |
.proc quant_intra_ia64# |
.proc quant_intra_ia64# |
60 |
quant_intra_ia64: |
quant_intra_ia64: |
61 |
.prologue //12, 37 |
.prologue |
62 |
.save ar.pfs, r38 |
.save ar.pfs, r38 |
63 |
alloc r38 = ar.pfs, 4, 3, 2, 0 |
alloc r38 = ar.pfs, 4, 3, 2, 0 |
64 |
adds r16 = -8, r12 |
adds r16 = -8, r12 |
87 |
ld4 r16 = [r16] |
ld4 r16 = [r16] |
88 |
;; |
;; |
89 |
setf.sig f2 = r16 |
setf.sig f2 = r16 |
90 |
(p6) br.cond.dptk .L4 |
(p6) br.cond.dptk .L8 |
91 |
extr r39 = r35, 1, 31 |
extr r39 = r35, 1, 31 |
92 |
sxt4 r40 = r35 |
sxt4 r40 = r35 |
93 |
;; |
;; |
94 |
add r39 = r39, r15 |
add r39 = r39, r15 |
95 |
br .L38 |
br .L21 |
96 |
;; |
;; |
97 |
.L4: |
.L8: |
98 |
extr r39 = r35, 1, 31 |
extr r39 = r35, 1, 31 |
99 |
sxt4 r40 = r35 |
sxt4 r40 = r35 |
100 |
;; |
;; |
101 |
sub r39 = r15, r39 |
sub r39 = r15, r39 |
102 |
;; |
;; |
103 |
.L38: |
.L21: |
104 |
sxt4 r39 = r39 |
sxt4 r39 = r39 |
105 |
br.call.sptk.many b0 = __divdi3# |
br.call.sptk.many b0 = __divdi3# |
106 |
;; |
;; |
107 |
addl r16 = 2, r0 |
addl r14 = 62, r0 |
108 |
st2 [r32] = r8 |
st2 [r32] = r8 |
109 |
addl r17 = 1, r0 |
addl r19 = 1, r0 |
|
;; |
|
|
add r14 = r33, r16 |
|
|
;; |
|
|
ld2 r15 = [r14] |
|
|
;; |
|
|
sxt2 r15 = r15 |
|
|
;; |
|
|
mov r14 = r15 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r14 |
|
|
(p6) br.cond.dptk .L21 |
|
|
sub r14 = r0, r14 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r36, r14 |
|
|
;; |
|
|
(p7) add r14 = r32, r16 |
|
|
(p6) add r15 = r32, r16 |
|
|
(p6) setf.sig f6 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f6 = f6, f2, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f6 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) sub r14 = r0, r14 |
|
|
br .L39 |
|
|
;; |
|
|
.L21: |
|
|
cmp4.le p6, p7 = r36, r14 |
|
|
;; |
|
|
(p7) add r14 = r32, r16 |
|
|
(p6) setf.sig f6 = r15 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f6 = f6, f2, f0 |
|
|
(p6) add r15 = r32, r16 |
|
|
;; |
|
|
(p6) getf.sig r14 = f6 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
.L39: |
|
|
//.pred.rel.mutex p6, p7 |
|
|
;; |
|
|
(p6) st2 [r15] = r14 |
|
|
adds r17 = 1, r17 |
|
|
;; |
|
|
cmp4.geu p6, p7 = 63, r17 |
|
|
(p7) br.cond.dptk .L16 |
|
|
addl r14 = 30, r0 |
|
110 |
;; |
;; |
111 |
mov ar.lc = r14 |
mov ar.lc = r14 |
112 |
;; |
;; |
113 |
.L37: |
.L20: |
114 |
dep.z r16 = r17, 1, 32 |
dep.z r17 = r19, 1, 32 |
|
;; |
|
|
add r14 = r16, r33 |
|
|
;; |
|
|
ld2 r15 = [r14] |
|
|
;; |
|
|
sxt2 r15 = r15 |
|
|
;; |
|
|
mov r14 = r15 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r14 |
|
|
(p6) br.cond.dptk .L27 |
|
|
sub r14 = r0, r14 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r36, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) add r15 = r16, r32 |
|
|
(p6) setf.sig f6 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f6 = f6, f2, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f6 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) sub r14 = r0, r14 |
|
|
br .L40 |
|
|
;; |
|
|
.L27: |
|
|
cmp4.le p6, p7 = r36, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) setf.sig f6 = r15 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f6 = f6, f2, f0 |
|
|
(p6) add r15 = r16, r32 |
|
|
;; |
|
|
(p6) getf.sig r14 = f6 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
.L40: |
|
|
//.pred.rel.mutex p6, p7 |
|
|
;; |
|
|
(p6) st2 [r15] = r14 |
|
|
adds r14 = 1, r17 |
|
|
;; |
|
|
dep.z r16 = r14, 1, 32 |
|
115 |
;; |
;; |
116 |
add r15 = r16, r33 |
add r15 = r17, r33 |
117 |
|
adds r19 = 1, r19 |
118 |
;; |
;; |
119 |
ld2 r14 = [r15] |
ld2 r14 = [r15] |
120 |
;; |
;; |
121 |
sxt2 r14 = r14 |
sxt2 r14 = r14 |
122 |
;; |
;; |
123 |
mov r15 = r14 |
mov r16 = r14 |
124 |
;; |
mov r18 = r14 |
|
cmp4.le p6, p7 = r0, r15 |
|
|
(p6) br.cond.dptk .L33 |
|
|
sub r14 = r0, r15 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
mov r15 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r36, r15 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) setf.sig f6 = r15 |
|
125 |
;; |
;; |
126 |
(p7) st2 [r14] = r0 |
sub r15 = r0, r16 |
127 |
(p6) xma.l f6 = f6, f2, f0 |
cmp4.le p8, p9 = r36, r16 |
128 |
(p6) add r15 = r16, r32 |
cmp4.le p6, p7 = r0, r16 |
129 |
;; |
;; |
130 |
(p6) getf.sig r14 = f6 |
sxt2 r14 = r15 |
131 |
|
(p6) br.cond.dptk .L14 |
132 |
;; |
;; |
133 |
(p6) extr r14 = r14, 16, 16 |
mov r16 = r14 |
134 |
|
add r18 = r17, r32 |
135 |
;; |
;; |
136 |
(p6) sub r14 = r0, r14 |
setf.sig f6 = r16 |
137 |
br .L41 |
cmp4.le p6, p7 = r36, r16 |
138 |
.L33: |
mov r15 = r18 |
|
cmp4.le p6, p7 = r36, r15 |
|
139 |
;; |
;; |
140 |
(p7) add r14 = r16, r32 |
xma.l f6 = f6, f2, f0 |
141 |
(p6) add r15 = r16, r32 |
(p7) st2 [r18] = r0 |
|
(p6) setf.sig f6 = r14 |
|
142 |
;; |
;; |
143 |
(p7) st2 [r14] = r0 |
getf.sig r14 = f6 |
|
(p6) xma.l f6 = f6, f2, f0 |
|
144 |
;; |
;; |
145 |
(p6) getf.sig r14 = f6 |
extr r14 = r14, 16, 16 |
146 |
;; |
;; |
147 |
(p6) extr r14 = r14, 16, 16 |
sub r14 = r0, r14 |
|
.L41: |
|
|
//.pred.rel.mutex p6, p7 |
|
148 |
;; |
;; |
149 |
(p6) st2 [r15] = r14 |
(p6) st2 [r15] = r14 |
150 |
adds r17 = 2, r17 |
br .L12 |
151 |
br.cloop.sptk.few .L37 |
.L14: |
152 |
.L16: |
.pred.rel "mutex", p8, p9 |
153 |
|
setf.sig f6 = r18 |
154 |
|
add r16 = r17, r32 |
155 |
|
;; |
156 |
|
xma.l f6 = f6, f2, f0 |
157 |
|
mov r15 = r16 |
158 |
|
(p9) st2 [r16] = r0 |
159 |
|
;; |
160 |
|
getf.sig r14 = f6 |
161 |
|
;; |
162 |
|
extr r14 = r14, 16, 16 |
163 |
|
;; |
164 |
|
(p8) st2 [r15] = r14 |
165 |
|
.L12: |
166 |
|
br.cloop.sptk.few .L20 |
167 |
adds r18 = 24, r12 |
adds r18 = 24, r12 |
168 |
;; |
;; |
169 |
ld8 r19 = [r18], 8 |
ld8 r19 = [r18], 8 |
176 |
adds r12 = 32, r12 |
adds r12 = 32, r12 |
177 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
178 |
.endp quant_intra_ia64# |
.endp quant_intra_ia64# |
|
.align 16 |
|
|
.global quant_inter_ia64# |
|
|
.proc quant_inter_ia64# |
|
|
quant_inter_ia64: |
|
|
.prologue |
|
|
addl r14 = @ltoff(multipliers#), gp |
|
|
dep.z r15 = r34, 2, 32 |
|
|
.save ar.lc, r2 |
|
|
mov r2 = ar.lc |
|
|
;; |
|
|
.body |
|
|
ld8 r14 = [r14] |
|
|
extr.u r16 = r34, 1, 16 |
|
|
dep.z r17 = r34, 1, 15 |
|
|
;; |
|
|
add r15 = r15, r14 |
|
|
mov r18 = r16 |
|
|
mov r8 = r0 |
|
|
;; |
|
|
ld4 r15 = [r15] |
|
|
addl r14 = 31, r0 |
|
|
mov r19 = r0 |
|
|
;; |
|
|
setf.sig f6 = r15 |
|
|
mov ar.lc = r14 |
|
|
;; |
|
|
.L65: |
|
|
dep.z r16 = r19, 1, 32 |
|
|
;; |
|
|
add r14 = r16, r33 |
|
|
;; |
|
|
ld2 r15 = [r14] |
|
|
;; |
|
|
sxt2 r15 = r15 |
|
|
;; |
|
|
mov r14 = r15 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r14 |
|
|
(p6) br.cond.dptk .L55 |
|
|
sub r14 = r0, r14 |
|
|
;; |
|
|
sub r14 = r14, r18 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r17, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) setf.sig f7 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) add r16 = r16, r32 |
|
|
(p6) xma.l f7 = f7, f6, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f7 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) sub r15 = r0, r14 |
|
|
(p6) add r8 = r8, r14 |
|
|
;; |
|
|
(p6) st2 [r16] = r15 |
|
|
br .L53 |
|
|
.L55: |
|
|
sub r14 = r14, r18 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r17, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) add r15 = r16, r32 |
|
|
(p6) setf.sig f7 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f7 = f7, f6, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f7 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) st2 [r15] = r14 |
|
|
(p6) add r8 = r8, r14 |
|
|
.L53: |
|
|
adds r14 = 1, r19 |
|
|
;; |
|
|
dep.z r16 = r14, 1, 32 |
|
|
;; |
|
|
add r15 = r16, r33 |
|
|
;; |
|
|
ld2 r14 = [r15] |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r14 |
|
|
(p6) br.cond.dptk .L61 |
|
|
sub r14 = r0, r14 |
|
|
;; |
|
|
sub r14 = r14, r18 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r17, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) setf.sig f7 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) add r16 = r16, r32 |
|
|
(p6) xma.l f7 = f7, f6, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f7 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) sub r15 = r0, r14 |
|
|
(p6) add r8 = r8, r14 |
|
|
;; |
|
|
(p6) st2 [r16] = r15 |
|
|
br .L59 |
|
|
.L61: |
|
|
sub r14 = r14, r18 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r17, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) add r15 = r16, r32 |
|
|
(p6) setf.sig f7 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f7 = f7, f6, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f7 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) st2 [r15] = r14 |
|
|
(p6) add r8 = r8, r14 |
|
|
.L59: |
|
|
adds r19 = 2, r19 |
|
|
br.cloop.sptk.few .L65 |
|
|
;; |
|
|
mov ar.lc = r2 |
|
|
br.ret.sptk.many b0 |
|
|
.endp quant_inter_ia64# |
|
179 |
.common quant_intra#,8,8 |
.common quant_intra#,8,8 |
180 |
.common dequant_intra#,8,8 |
.common dequant_intra#,8,8 |
181 |
.align 16 |
.align 16 |
209 |
cmp4.le p6, p7 = r16, r15 |
cmp4.le p6, p7 = r16, r15 |
210 |
;; |
;; |
211 |
(p7) st2 [r32] = r16 |
(p7) st2 [r32] = r16 |
212 |
(p7) br.cond.dptk .L68 |
(p7) br.cond.dptk .L32 |
213 |
addl r14 = 2047, r0 |
addl r14 = 2047, r0 |
214 |
;; |
;; |
215 |
cmp4.ge p6, p7 = r14, r15 |
cmp4.ge p6, p7 = r14, r15 |
216 |
;; |
;; |
217 |
(p7) st2 [r32] = r14 |
(p7) st2 [r32] = r14 |
218 |
.L68: |
.L32: |
219 |
addl r14 = 20, r0 |
addl r14 = 62, r0 |
220 |
addl r19 = 1, r0 |
addl r19 = 1, r0 |
221 |
addl r21 = 2048, r0 |
addl r22 = 2048, r0 |
222 |
addl r20 = -2048, r0 |
addl r21 = -2048, r0 |
223 |
addl r18 = 2047, r0 |
addl r20 = 2047, r0 |
224 |
;; |
;; |
225 |
mov ar.lc = r14 |
mov ar.lc = r14 |
226 |
;; |
;; |
227 |
.L110: |
.L56: |
228 |
dep.z r16 = r19, 1, 32 |
dep.z r16 = r19, 1, 32 |
229 |
;; |
;; |
230 |
add r14 = r16, r33 |
add r14 = r16, r33 |
231 |
|
add r17 = r16, r32 |
232 |
|
adds r19 = 1, r19 |
233 |
;; |
;; |
234 |
ld2 r15 = [r14] |
ld2 r15 = [r14] |
235 |
;; |
;; |
236 |
sxt2 r15 = r15 |
sxt2 r15 = r15 |
237 |
;; |
;; |
238 |
cmp4.ne p6, p7 = 0, r15 |
cmp4.ne p6, p7 = 0, r15 |
239 |
|
cmp4.le p8, p9 = r0, r15 |
240 |
;; |
;; |
241 |
(p7) add r14 = r16, r32 |
(p7) st2 [r17] = r0 |
242 |
;; |
(p7) br.cond.dpnt .L36 |
243 |
(p7) st2 [r14] = r0 |
add r18 = r16, r32 |
244 |
(p7) br.cond.dpnt .L92 |
sub r17 = r0, r15 |
245 |
cmp4.le p6, p7 = r0, r15 |
;; |
246 |
(p6) br.cond.dptk .L95 |
mov r14 = r18 |
247 |
sub r14 = r0, r15 |
(p8) br.cond.dptk .L40 |
248 |
add r17 = r16, r32 |
setf.sig f8 = r17 |
|
;; |
|
|
setf.sig f8 = r14 |
|
249 |
;; |
;; |
250 |
xma.l f8 = f6, f8, f7 |
xma.l f8 = f6, f8, f7 |
251 |
;; |
;; |
252 |
getf.sig r15 = f8 |
getf.sig r15 = f8 |
253 |
;; |
;; |
254 |
cmp4.lt p6, p7 = r21, r15 |
cmp4.lt p6, p7 = r22, r15 |
255 |
;; |
sub r16 = r0, r15 |
|
(p7) sub r14 = r0, r15 |
|
256 |
;; |
;; |
257 |
(p7) st2 [r17] = r14 |
(p7) st2 [r14] = r16 |
258 |
(p6) st2 [r17] = r20 |
(p6) st2 [r14] = r21 |
259 |
br .L92 |
br .L36 |
260 |
.L95: |
.L40: |
261 |
setf.sig f8 = r15 |
setf.sig f8 = r15 |
|
add r14 = r16, r32 |
|
262 |
;; |
;; |
263 |
xma.l f8 = f6, f8, f7 |
xma.l f8 = f6, f8, f7 |
264 |
;; |
;; |
265 |
getf.sig r15 = f8 |
getf.sig r15 = f8 |
266 |
;; |
;; |
267 |
cmp4.le p6, p7 = r18, r15 |
cmp4.le p6, p7 = r20, r15 |
|
;; |
|
|
(p6) mov r15 = r18 |
|
|
;; |
|
|
st2 [r14] = r15 |
|
|
.L92: |
|
|
adds r14 = 1, r19 |
|
|
;; |
|
|
dep.z r17 = r14, 1, 32 |
|
|
;; |
|
|
add r15 = r17, r33 |
|
|
;; |
|
|
ld2 r14 = [r15] |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
mov r16 = r14 |
|
|
;; |
|
|
cmp4.ne p6, p7 = 0, r16 |
|
|
;; |
|
|
(p7) add r14 = r17, r32 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p7) br.cond.dpnt .L98 |
|
|
cmp4.le p6, p7 = r0, r16 |
|
|
(p6) br.cond.dptk .L101 |
|
|
sub r14 = r0, r16 |
|
|
add r17 = r17, r32 |
|
|
;; |
|
|
setf.sig f8 = r14 |
|
|
;; |
|
|
xma.l f8 = f6, f8, f7 |
|
|
;; |
|
|
getf.sig r16 = f8 |
|
|
;; |
|
|
cmp4.lt p6, p7 = r21, r16 |
|
268 |
;; |
;; |
269 |
(p7) sub r14 = r0, r16 |
(p6) mov r14 = r20 |
270 |
|
(p7) mov r14 = r15 |
271 |
;; |
;; |
272 |
(p7) st2 [r17] = r14 |
st2 [r18] = r14 |
273 |
(p6) st2 [r17] = r20 |
.L36: |
274 |
br .L98 |
br.cloop.sptk.few .L56 |
|
.L101: |
|
|
setf.sig f8 = r16 |
|
|
add r14 = r17, r32 |
|
275 |
;; |
;; |
276 |
xma.l f8 = f6, f8, f7 |
mov ar.lc = r2 |
277 |
;; |
br.ret.sptk.many b0 |
278 |
getf.sig r16 = f8 |
.endp dequant_intra_ia64# |
279 |
;; |
|
280 |
cmp4.le p6, p7 = r18, r16 |
|
281 |
;; |
|
282 |
(p6) mov r15 = r18 |
//uint32_t quant_inter_ia64(int16_t *coeff, const int16_t *data, const uint32_t quant) |
283 |
(p7) mov r15 = r16 |
|
284 |
;; |
|
285 |
st2 [r14] = r15 |
|
286 |
.L98: |
.common quant_inter#,8,8 |
287 |
adds r14 = 2, r19 |
.align 16 |
288 |
;; |
.global quant_inter_ia64# |
289 |
dep.z r17 = r14, 1, 32 |
.proc quant_inter_ia64# |
290 |
;; |
quant_inter_ia64: |
291 |
add r15 = r17, r33 |
|
292 |
;; |
|
293 |
ld2 r14 = [r15] |
//******************************************************* |
294 |
;; |
//* * |
295 |
sxt2 r14 = r14 |
//* const uint32_t mult = multipliers[quant]; * |
296 |
;; |
//* const uint16_t quant_m_2 = quant << 1; * |
297 |
mov r16 = r14 |
//* const uint16_t quant_d_2 = quant >> 1; * |
298 |
;; |
//* int sum = 0; * |
299 |
cmp4.ne p6, p7 = 0, r16 |
//* uint32_t i; * |
300 |
;; |
//* int16_t acLevel,acL; * |
301 |
(p7) add r14 = r17, r32 |
//* * |
302 |
;; |
//*******************************************************/ |
303 |
(p7) st2 [r14] = r0 |
|
304 |
(p7) br.cond.dpnt .L104 |
|
305 |
cmp4.le p6, p7 = r0, r16 |
|
306 |
(p6) br.cond.dptk .L107 |
LL=3 // LL = load latency |
307 |
sub r14 = r0, r16 |
//if LL is changed, you'll also have to change the .pred.rel... parts below! |
308 |
add r17 = r17, r32 |
.prologue |
309 |
;; |
addl r14 = @ltoff(multipliers#), gp |
310 |
setf.sig f8 = r14 |
dep.z r15 = r34, 2, 32 |
311 |
;; |
.save ar.lc, r2 |
312 |
xma.l f8 = f6, f8, f7 |
mov r2 = ar.lc |
|
;; |
|
|
getf.sig r16 = f8 |
|
|
;; |
|
|
cmp4.lt p6, p7 = r21, r16 |
|
|
;; |
|
|
(p7) sub r14 = r0, r16 |
|
313 |
;; |
;; |
314 |
(p7) st2 [r17] = r14 |
.body |
315 |
(p6) st2 [r17] = r20 |
alloc r9=ar.pfs,0,24,0,24 |
316 |
br .L104 |
mov r17 = ar.ec |
317 |
.L107: |
mov r10 = pr |
318 |
setf.sig f8 = r16 |
ld8 r14 = [r14] |
319 |
add r14 = r17, r32 |
extr.u r16 = r34, 1, 16 //r16 = quant_d_2 |
320 |
|
dep.z r20 = r34, 1, 15 //r20 = quant_m_2 |
321 |
;; |
;; |
322 |
xma.l f8 = f6, f8, f7 |
add r15 = r15, r14 |
323 |
|
mov r21 = r16 //r21 = quant_d_2 |
324 |
|
mov r8 = r0 //r8 = sum = 0 |
325 |
|
mov pr.rot = 0 //p16-p63 = 0 |
326 |
;; |
;; |
327 |
getf.sig r16 = f8 |
ld4 r15 = [r15] |
328 |
|
addl r14 = 63, r0 |
329 |
|
mov pr.rot = 1 << 16 //p16=1 |
330 |
;; |
;; |
331 |
cmp4.le p6, p7 = r18, r16 |
mov ar.lc = r14 |
332 |
|
mov ar.ec = LL+9 |
333 |
|
mov r29 = r15 |
334 |
;; |
;; |
335 |
(p6) mov r15 = r18 |
mov r15 = r33 //r15 = data |
336 |
(p7) mov r15 = r16 |
mov r18 = r32 //r18 = coeff |
337 |
;; |
;; |
338 |
st2 [r14] = r15 |
|
339 |
.L104: |
|
340 |
adds r19 = 3, r19 |
.rotr ac1[LL+3], ac2[8], ac3[2] |
341 |
br.cloop.sptk.few .L110 |
.rotp p[LL+9], cmp1[8], cmp1neg[8],cmp2[5], cmp2neg[2] |
342 |
|
|
343 |
|
|
344 |
|
|
345 |
|
//******************************************************************************* |
346 |
|
//* * |
347 |
|
//* for (i = 0; i < 64; i++) { * |
348 |
|
//* acL=acLevel = data[i]; * |
349 |
|
//* acLevel = ((acLevel < 0)?-acLevel:acLevel) - quant_d_2; * |
350 |
|
//* if (acLevel < quant_m_2){ * |
351 |
|
//* acLevel = 0; * |
352 |
|
//* } * |
353 |
|
//* acLevel = (acLevel * mult) >> SCALEBITS; * |
354 |
|
//* sum += acLevel; * |
355 |
|
//* coeff[i] = ((acL < 0)?-acLevel:acLevel); * |
356 |
|
//* } * |
357 |
|
//* * |
358 |
|
//*******************************************************************************/ |
359 |
|
|
360 |
|
|
361 |
|
|
362 |
|
.explicit |
363 |
|
.L58: |
364 |
|
.pred.rel "clear", p29, p37 |
365 |
|
.pred.rel "mutex", p29, p37 |
366 |
|
|
367 |
|
//pipeline stage |
368 |
|
{.mmi |
369 |
|
(p[0]) ld2 ac1[0] = [r15],2 // 0 acL=acLevel = data[i]; |
370 |
|
(p[LL+1]) sub ac2[0] = r0, ac1[LL+1] // LL+1 ac2=-acLevel |
371 |
|
(p[LL]) sxt2 ac1[LL] = ac1[LL] // LL |
372 |
|
} |
373 |
|
{.mmi |
374 |
|
(p[LL+1]) cmp4.le cmp1[0], cmp1neg[0] = r0, ac1[LL+1] // LL+1 cmp1 = (0<=acLevel) ; cmp1neg = !(0<=acLevel) |
375 |
|
(p[LL+4]) cmp4.le cmp2[0], cmp2neg[0] = r20, ac2[3] // LL+4 cmp2 = (quant_m_2 <= acLevel) ; cmp2neg = !(quant_m_2 <= acLevel) |
376 |
|
(cmp1[1]) sub ac2[1] = ac1[LL+2], r21 // LL+2 acLevel = acLevel - quant_d_2; |
377 |
|
} |
378 |
|
{.mmi |
379 |
|
(cmp2neg[1]) mov ac2[4] = r0 // LL+5 if (acLevel < quant_m_2) acLevel=0; |
380 |
|
(cmp1neg[1]) sub ac2[1] = ac2[1], r21 // LL+2 acLevel = ac2 - quant_d_2; |
381 |
|
(p[LL+3]) sxt2 ac2[2] = ac2[2] // LL+3 |
382 |
|
} |
383 |
|
{.mmi |
384 |
|
.pred.rel "mutex", p34, p42 |
385 |
|
(cmp1[6]) mov ac3[0] = ac2[6] // LL+7 ac3 = acLevel; |
386 |
|
(cmp1neg[6]) sub ac3[0] = r0, ac2[6] // LL+7 ac3 = -acLevel; |
387 |
|
(p[LL+6]) pmpyshr2.u ac2[5] = r29, ac2[5], 16 // LL+6 acLevel = (acLevel * mult) >> SCALEBITS; |
388 |
|
} |
389 |
|
{.mib |
390 |
|
(p[LL+8]) st2 [r18] = ac3[1] , 2 // LL+8 coeff[i] = ac3; |
391 |
|
(cmp2[4]) add r8 = r8, ac2[7] // LL+8 sum += acLevel; |
392 |
|
br.ctop.sptk.few .L58 |
393 |
|
;; |
394 |
|
} |
395 |
|
|
396 |
|
.pred.rel "clear", p29, p37 |
397 |
|
.default |
398 |
|
mov ar.ec = r17 |
399 |
;; |
;; |
400 |
mov ar.lc = r2 |
mov ar.lc = r2 |
401 |
|
mov pr = r10, -1 |
402 |
|
mov ar.pfs = r9 |
403 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
404 |
.endp dequant_intra_ia64# |
.endp quant_inter_ia64# |
405 |
.common quant_inter#,8,8 |
|
406 |
|
|
407 |
|
|
408 |
|
|
409 |
|
|
410 |
|
|
411 |
|
|
412 |
|
// void dequant_inter_ia64(int16_t *data, const int16_t *coeff, const uint32_t quant) |
413 |
|
|
414 |
.common dequant_inter#,8,8 |
.common dequant_inter#,8,8 |
415 |
.align 16 |
.align 16 |
416 |
.global dequant_inter_ia64# |
.global dequant_inter_ia64# |
417 |
.proc dequant_inter_ia64# |
.proc dequant_inter_ia64# |
418 |
dequant_inter_ia64: |
dequant_inter_ia64: |
419 |
|
|
420 |
|
//*********************************************************************** |
421 |
|
//* * |
422 |
|
//* const uint16_t quant_m_2 = quant << 1; * |
423 |
|
//* const uint16_t quant_add = (quant & 1 ? quant : quant - 1); * |
424 |
|
//* uint32_t i; * |
425 |
|
//* * |
426 |
|
//*********************************************************************** |
427 |
|
|
428 |
|
|
429 |
|
|
430 |
|
|
431 |
.prologue |
.prologue |
432 |
andcm r14 = 1, r34 |
andcm r14 = 1, r34 |
433 |
dep.z r15 = r34, 1, 15 |
dep.z r29 = r34, 1, 15 |
434 |
|
alloc r9=ar.pfs,0,32,0,32 |
435 |
.save ar.lc, r2 |
.save ar.lc, r2 |
436 |
mov r2 = ar.lc |
mov r2 = ar.lc |
437 |
;; |
;; |
438 |
.body |
.body |
439 |
sub r34 = r34, r14 |
sub r15 = r34, r14 // r15 = quant |
440 |
setf.sig f6 = r15 |
addl r14 = 63, r0 |
441 |
mov r19 = r0 |
addl r21 = -2048, r0 |
442 |
addl r14 = 31, r0 |
addl r20 = 2047, r0 |
443 |
addl r18 = -2048, r0 |
mov r16 = ar.ec |
444 |
addl r17 = 2047, r0 |
mov r17 = pr |
445 |
;; |
;; |
446 |
zxt2 r34 = r34 |
zxt2 r15 = r15 |
447 |
mov ar.lc = r14 |
mov ar.lc = r14 |
448 |
|
mov pr.rot = 0 |
449 |
;; |
;; |
450 |
.L122: |
adds r14 = 0, r33 // r14 = coeff |
451 |
dep.z r16 = r19, 1, 32 |
mov r18 = r32 // r18 = data |
452 |
;; |
mov ar.ec = LL+10 |
453 |
add r14 = r16, r33 |
mov pr.rot = 1 << 16 |
454 |
;; |
;; |
455 |
ld2 r15 = [r14] |
|
456 |
;; |
//******************************************************************************* |
457 |
sxt2 r15 = r15 |
//* * |
458 |
;; |
//*for (i = 0; i < 64; i++) { * |
459 |
mov r14 = r15 |
//* int16_t acLevel = coeff[i]; * |
460 |
;; |
//* * |
461 |
cmp4.ne p6, p7 = 0, r14 |
//* if (acLevel == 0) * |
462 |
;; |
//* { * |
463 |
(p7) add r14 = r16, r32 |
//* data[i] = 0; * |
464 |
;; |
//* } * |
465 |
(p7) st2 [r14] = r0 |
//* else if (acLevel < 0) * |
466 |
(p7) br.cond.dpnt .L112 |
//* { * |
467 |
cmp4.le p6, p7 = r0, r14 |
//* acLevel = acLevel * quant_m_2 - quant_add; * |
468 |
(p6) br.cond.dptk .L115 |
//* data[i] = (acLevel >= -2048 ? acLevel : -2048); * |
469 |
setf.sig f7 = r14 |
//* } * |
470 |
add r15 = r16, r32 |
//* else // if (acLevel > 0) * |
471 |
;; |
//* { * |
472 |
xma.l f7 = f7, f6, f0 |
//* acLevel = acLevel * quant_m_2 + quant_add; * |
473 |
;; |
//* data[i] = (acLevel <= 2047 ? acLevel : 2047); * |
474 |
getf.sig r14 = f7 |
//* } * |
475 |
;; |
//* } * |
476 |
sub r14 = r14, r34 |
//* * |
477 |
;; |
//*******************************************************************************/ |
478 |
sxt2 r14 = r14 |
|
479 |
;; |
|
480 |
cmp4.le p6, p7 = r18, r14 |
|
481 |
;; |
LL=2 // LL := load latency |
482 |
(p7) mov r14 = r18 |
//if LL is changed, you'll also have to change the .pred.rel... parts below! |
483 |
br .L123 |
|
484 |
.L115: |
|
485 |
setf.sig f8 = r15 |
.rotr ac1[LL+10], x[5], y1[3], y2[3] |
486 |
setf.sig f7 = r34 |
.rotp p[LL+10] , cmp1neg[8], cmp2[5], cmp2neg[5],cmp3[2], cmp3neg[2] |
487 |
;; |
|
488 |
xma.l f8 = f8, f6, f7 |
.explicit |
489 |
add r15 = r16, r32 |
//pipeline stage |
490 |
;; |
|
491 |
getf.sig r14 = f8 |
.L60: |
492 |
;; |
.pred.rel "clear", p36 |
493 |
sxt2 r14 = r14 |
.pred.rel "mutex", p47, p49 |
494 |
;; |
.pred.rel "mutex", p46, p48 |
495 |
cmp4.le p6, p7 = r17, r14 |
.pred.rel "mutex", p40, p45 |
496 |
;; |
.pred.rel "mutex", p39, p44 |
497 |
(p6) mov r14 = r17 |
.pred.rel "mutex", p38, p43 |
498 |
;; |
.pred.rel "mutex", p37, p42 |
499 |
.L123: |
.pred.rel "mutex", p36, p41 |
500 |
st2 [r15] = r14 |
{.mmi |
501 |
.L112: |
(p[0])ld2 ac1[0] = [r14] ,2 // 0 acLevel = coeff[i]; |
502 |
adds r14 = 1, r19 |
(p[LL+1])cmp4.ne p6, cmp1neg[0] = 0, ac1[LL+1] // LL+1 |
503 |
;; |
(p[LL])sxt2 ac1[LL] = ac1[LL] // LL |
504 |
dep.z r16 = r14, 1, 32 |
|
505 |
;; |
} |
506 |
add r15 = r16, r33 |
{.mmi |
507 |
;; |
(p[LL+1])cmp4.le cmp2[0], cmp2neg[0] = r0, ac1[LL+1] // LL+1 |
508 |
ld2 r14 = [r15] |
(cmp2[1]) mov x[0] = r20 // LL+2 |
509 |
;; |
(p[LL+2])pmpyshr2.u ac1[LL+2] = r29, ac1[LL+2], 0 // LL+2 |
510 |
sxt2 r14 = r14 |
} |
511 |
;; |
{.mmi |
512 |
mov r15 = r14 |
(cmp2neg[1]) mov x[0] = r21 // LL+2 |
513 |
;; |
(cmp2[2]) add ac1[LL+3] = ac1[LL+3], r15 // LL+3 |
514 |
cmp4.ne p6, p7 = 0, r15 |
(cmp2neg[2]) sub ac1[LL+3] = ac1[LL+3], r15 // LL+3 |
515 |
;; |
|
516 |
(p7) add r14 = r16, r32 |
} |
517 |
;; |
{.mmi |
518 |
(p7) st2 [r14] = r0 |
(cmp2neg[4]) mov y1[0] = ac1[LL+5] // LL+5 |
519 |
(p7) br.cond.dpnt .L117 |
(cmp2neg[4]) mov y2[0] = x[3] // LL+5 |
520 |
cmp4.le p6, p7 = r0, r15 |
(p[LL+4])sxt2 ac1[LL+4] = ac1[LL+4] // LL+4 |
521 |
(p6) br.cond.dptk .L120 |
} |
522 |
setf.sig f8 = r15 |
{.mmi |
523 |
;; |
(cmp2[4]) mov y1[0] = x[3] // LL+5 |
524 |
xma.l f8 = f8, f6, f0 |
(cmp2[4]) mov y2[0] = ac1[LL+5] // LL+5 |
525 |
add r15 = r16, r32 |
(p[LL+6])cmp4.le cmp3[0], cmp3neg[0] = x[4], ac1[LL+6] // LL+6 |
526 |
;; |
} |
527 |
getf.sig r14 = f8 |
{.mmi |
528 |
;; |
(cmp3[1]) mov ac1[LL+7] = y1[2] // LL+7 |
529 |
sub r14 = r14, r34 |
(cmp3neg[1]) mov ac1[LL+7] = y2[2] // LL+7 |
530 |
;; |
(cmp1neg[7]) mov ac1[LL+8] = r0 // LL+8 |
531 |
sxt2 r14 = r14 |
} |
532 |
;; |
{.mbb |
533 |
cmp4.le p6, p7 = r18, r14 |
(p[LL+9])st2 [r18] = ac1[LL+9] ,2 // LL+9 |
534 |
;; |
nop.b 0x0 |
535 |
(p7) mov r14 = r18 |
br.ctop.sptk.few .L60 |
536 |
br .L124 |
;; |
537 |
;; |
} |
538 |
.L120: |
.pred.rel "clear", p36 |
539 |
setf.sig f7 = r14 |
.default |
540 |
setf.sig f8 = r34 |
mov ar.lc = r2 |
541 |
add r15 = r16, r32 |
mov ar.pfs = r9 |
542 |
;; |
mov ar.ec = r16 |
543 |
xma.l f7 = f7, f6, f8 |
mov pr = r17, -1 |
|
;; |
|
|
getf.sig r14 = f7 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r17, r14 |
|
|
;; |
|
|
(p6) mov r14 = r17 |
|
|
;; |
|
|
.L124: |
|
|
st2 [r15] = r14 |
|
|
.L117: |
|
|
adds r19 = 2, r19 |
|
|
br.cloop.sptk.few .L122 |
|
544 |
;; |
;; |
545 |
mov ar.lc = r2 |
mov ar.lc = r2 |
546 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |