Parent Directory | Revision Log
Revision 243 - (view) (download)
1 : | ia64p | 243 | .file "quant_h263.1.c" |
2 : | .pred.safe_across_calls p1-p5,p16-p63 | ||
3 : | .section .rodata | ||
4 : | .align 4 | ||
// multipliers: read-only table of 32 four-byte words (128 bytes in total),
// indexed by the quantiser value.
//   entry 0      = 0
//   entry q >= 1 = floor(65536 / (2*q)) + 1
// Multiplying a value by entry q and shifting the product right by 16 bits
// therefore approximates a division by 2*q. The quantisation routines in this
// file load multipliers[quant] and use it in exactly this multiply/shift form.
5 : | .type multipliers#,@object | ||
6 : | .size multipliers#,128 | ||
7 : | multipliers: | ||
8 : | data4 0 | ||
9 : | data4 32769 | ||
10 : | data4 16385 | ||
11 : | data4 10923 | ||
12 : | data4 8193 | ||
13 : | data4 6554 | ||
14 : | data4 5462 | ||
15 : | data4 4682 | ||
16 : | data4 4097 | ||
17 : | data4 3641 | ||
18 : | data4 3277 | ||
19 : | data4 2979 | ||
20 : | data4 2731 | ||
21 : | data4 2521 | ||
22 : | data4 2341 | ||
23 : | data4 2185 | ||
24 : | data4 2049 | ||
25 : | data4 1928 | ||
26 : | data4 1821 | ||
27 : | data4 1725 | ||
28 : | data4 1639 | ||
29 : | data4 1561 | ||
30 : | data4 1490 | ||
31 : | data4 1425 | ||
32 : | data4 1366 | ||
33 : | data4 1311 | ||
34 : | data4 1261 | ||
35 : | data4 1214 | ||
36 : | data4 1171 | ||
37 : | data4 1130 | ||
38 : | data4 1093 | ||
39 : | data4 1058 | ||
40 : | .global __divdi3# | ||
41 : | .text | ||
42 : | .align 16 | ||
// ---------------------------------------------------------------------------
// quant_intra_ia64 -- compiler-generated code (see the .ident line at the end
// of the file).
//
// Register roles as they appear in the code below:
//   r32 (in0)  base of the 16-bit output array (written with st2)
//   r33 (in1)  base of the 16-bit input array  (read with ld2)
//   r34 (in2)  quantiser: index into multipliers[], and doubled to form the
//              threshold kept in r36
//   r35 (in3)  divisor used for element 0 only
//              (presumably the DC scaler -- confirm against the C source)
//
// Element 0 :  out[0] = (in[0] + (r35 >> 1)) / r35   when in[0] >  0
//              out[0] = (in[0] - (r35 >> 1)) / r35   when in[0] <= 0
//              The division is done by the external helper __divdi3
//              (presumably libgcc's signed 64-bit divide -- confirm).
// Elements 1..63, with a = |in[i]| and mult = multipliers[r34]:
//              out[i] = 0                                   when a <  r36
//              out[i] = sign(in[i]) * ((a * mult) >> 16)    when a >= r36
//
// Preserved state: ar.pfs in r38, b0 in r37, ar.lc and f2 in the 32-byte
// stack frame / caller scratch area (f2 has to survive the call).
// ---------------------------------------------------------------------------
43 : | .global quant_intra_ia64# | ||
44 : | .proc quant_intra_ia64# | ||
45 : | quant_intra_ia64: | ||
46 : | .prologue 12, 37 | ||
47 : | .save ar.pfs, r38 | ||
// Frame: 4 inputs (r32-r35), 3 locals (r36-r38), 2 outputs (r39-r40).
48 : | alloc r38 = ar.pfs, 4, 3, 2, 0 | ||
// r16 = entry sp - 8: slot that will hold the saved ar.lc.
49 : | adds r16 = -8, r12 | ||
50 : | .fframe 32 | ||
51 : | adds r12 = -32, r12 | ||
52 : | mov r17 = ar.lc | ||
// r14 = address of the linkage-table entry for multipliers (resolved by ld8 below).
53 : | addl r14 = @ltoff(multipliers#), gp | ||
// r15 = in[0] (sign-extended to 16 bits below).
54 : | ld2 r15 = [r33] | ||
55 : | ;; | ||
56 : | .savesp ar.lc, 24 | ||
// Save ar.lc at new sp + 24; the post-increment leaves r16 = entry sp.
57 : | st8 [r16] = r17, 8 | ||
58 : | ld8 r14 = [r14] | ||
59 : | sxt2 r15 = r15 | ||
60 : | ;; | ||
61 : | .save.f 0x1 | ||
// Spill f2 at the entry sp; f2 carries the multiplier across the call below.
62 : | stf.spill [r16] = f2 | ||
63 : | .save rp, r37 | ||
64 : | mov r37 = b0 | ||
65 : | .body | ||
// r36 = (r34 & 0x7fff) << 1 : the threshold 2*quant.
66 : | dep.z r36 = r34, 1, 15 | ||
// r16 = (low 32 bits of r34) << 2 : byte offset of multipliers[quant].
67 : | dep.z r16 = r34, 2, 32 | ||
// p6 = (in[0] <= 0), p7 = (in[0] > 0).
68 : | cmp4.ge p6, p7 = 0, r15 | ||
69 : | ;; | ||
70 : | add r16 = r16, r14 | ||
71 : | ;; | ||
72 : | ld4 r16 = [r16] | ||
73 : | ;; | ||
// f2 = multipliers[quant], kept for the whole element loop.
74 : | setf.sig f2 = r16 | ||
75 : | (p6) br.cond.dptk .L8 | ||
// in[0] > 0: dividend = in[0] + (r35 >> 1), divisor = sign-extended r35.
76 : | extr r39 = r35, 1, 31 | ||
77 : | sxt4 r40 = r35 | ||
78 : | ;; | ||
79 : | add r39 = r39, r15 | ||
80 : | br .L21 | ||
81 : | ;; | ||
82 : | .L8: | ||
// in[0] <= 0: dividend = in[0] - (r35 >> 1), divisor = sign-extended r35.
83 : | extr r39 = r35, 1, 31 | ||
84 : | sxt4 r40 = r35 | ||
85 : | ;; | ||
86 : | sub r39 = r15, r39 | ||
87 : | ;; | ||
88 : | .L21: | ||
// Call __divdi3(out0 = r39, out1 = r40); the result comes back in r8.
89 : | sxt4 r39 = r39 | ||
90 : | br.call.sptk.many b0 = __divdi3# | ||
91 : | ;; | ||
// ar.lc = 62 -> the loop body at .L20 executes 63 times (elements 1..63).
92 : | addl r14 = 62, r0 | ||
// out[0] = low 16 bits of the quotient.
93 : | st2 [r32] = r8 | ||
// r19 = element index i, starting at 1.
94 : | addl r19 = 1, r0 | ||
95 : | ;; | ||
96 : | mov ar.lc = r14 | ||
97 : | ;; | ||
98 : | .L20: | ||
// r17 = i * 2 : byte offset of element i in both arrays.
99 : | dep.z r17 = r19, 1, 32 | ||
100 : | ;; | ||
101 : | add r15 = r17, r33 | ||
102 : | adds r19 = 1, r19 | ||
103 : | ;; | ||
104 : | ld2 r14 = [r15] | ||
105 : | ;; | ||
106 : | sxt2 r14 = r14 | ||
107 : | ;; | ||
// r16 = r18 = v = in[i]
108 : | mov r16 = r14 | ||
109 : | mov r18 = r14 | ||
110 : | ;; | ||
// r15 = -v ; p8 = (threshold <= v) ; p6 = (v >= 0)
111 : | sub r15 = r0, r16 | ||
112 : | cmp4.le p8, p9 = r36, r16 | ||
113 : | cmp4.le p6, p7 = r0, r16 | ||
114 : | ;; | ||
115 : | sxt2 r14 = r15 | ||
// Non-negative values are handled at .L14; the fall-through handles v < 0.
116 : | (p6) br.cond.dptk .L14 | ||
117 : | ;; | ||
// --- v < 0: work on a = -v ---
118 : | mov r16 = r14 | ||
119 : | add r18 = r17, r32 | ||
120 : | ;; | ||
121 : | setf.sig f6 = r16 | ||
// p6 = (threshold <= a), p7 = (a < threshold)
122 : | cmp4.le p6, p7 = r36, r16 | ||
123 : | mov r15 = r18 | ||
124 : | ;; | ||
// f6 = a * mult (integer multiply, f0 is the zero addend)
125 : | xma.l f6 = f6, f2, f0 | ||
// Below the threshold: out[i] = 0.
126 : | (p7) st2 [r18] = r0 | ||
127 : | ;; | ||
128 : | getf.sig r14 = f6 | ||
129 : | ;; | ||
// r14 = bits 16..31 of the product, sign-extended (the ">> 16" step).
130 : | extr r14 = r14, 16, 16 | ||
131 : | ;; | ||
// Restore the sign of the original value.
132 : | sub r14 = r0, r14 | ||
133 : | ;; | ||
134 : | (p6) st2 [r15] = r14 | ||
135 : | br .L12 | ||
136 : | .L14: | ||
// --- v >= 0: p8/p9 still hold (threshold <= v) / (v < threshold) ---
137 : | .pred.rel.mutex p8, p9 | ||
138 : | setf.sig f6 = r18 | ||
139 : | add r16 = r17, r32 | ||
140 : | ;; | ||
141 : | xma.l f6 = f6, f2, f0 | ||
142 : | mov r15 = r16 | ||
// Below the threshold: out[i] = 0.
143 : | (p9) st2 [r16] = r0 | ||
144 : | ;; | ||
145 : | getf.sig r14 = f6 | ||
146 : | ;; | ||
147 : | extr r14 = r14, 16, 16 | ||
148 : | ;; | ||
149 : | (p8) st2 [r15] = r14 | ||
150 : | .L12: | ||
// Counted loop: repeats while ar.lc is non-zero, decrementing it each time.
151 : | br.cloop.sptk.few .L20 | ||
// Epilogue: r18 -> saved ar.lc at sp + 24, then (post-increment) -> saved f2.
152 : | adds r18 = 24, r12 | ||
153 : | ;; | ||
154 : | ld8 r19 = [r18], 8 | ||
155 : | mov ar.pfs = r38 | ||
156 : | mov b0 = r37 | ||
157 : | ;; | ||
158 : | mov ar.lc = r19 | ||
159 : | ldf.fill f2 = [r18] | ||
160 : | .restore sp | ||
161 : | adds r12 = 32, r12 | ||
162 : | br.ret.sptk.many b0 | ||
163 : | .endp quant_intra_ia64# | ||
164 : | .common quant_intra#,8,8 | ||
165 : | .common dequant_intra#,8,8 | ||
166 : | .align 16 | ||
// ---------------------------------------------------------------------------
// dequant_intra_ia64 -- compiler-generated leaf routine (no calls, no stack
// frame; ar.lc is preserved in r2).
//
// Register roles as they appear in the code below:
//   r32 (in0)  base of the 16-bit output array (written with st2)
//   r33 (in1)  base of the 16-bit input array  (read with ld2)
//   r34 (in2)  quantiser; doubled in place once the additive term is derived
//   r35 (in3)  multiplier applied to element 0 only
//              (presumably the DC scaler -- confirm against the C source)
//
// Element 0 :  out[0] = low 16 bits of in[0] * r35, then clamped to
//              [-2048, 2047] based on that 16-bit value.
// Elements 1..63, with add = quant - (1 & ~quant)  (quant if odd, else quant-1):
//              in[i] == 0 : out[i] = 0
//              in[i] <  0 : t = 2*quant * (-in[i]) + add
//                           out[i] = (t > 2048) ? -2048 : -t
//              in[i] >  0 : t = 2*quant * in[i] + add
//                           out[i] = (t >= 2047) ? 2047 : t
// ---------------------------------------------------------------------------
167 : | .global dequant_intra_ia64# | ||
168 : | .proc dequant_intra_ia64# | ||
169 : | dequant_intra_ia64: | ||
170 : | .prologue | ||
171 : | ld2 r14 = [r33] | ||
// r15 = 1 & ~quant : 1 when quant is even, 0 when it is odd.
172 : | andcm r15 = 1, r34 | ||
173 : | setf.sig f8 = r35 | ||
174 : | ;; | ||
175 : | sxt2 r14 = r14 | ||
// r15 = additive term (quant if odd, quant - 1 if even).
176 : | sub r15 = r34, r15 | ||
177 : | addl r16 = -2048, r0 | ||
178 : | ;; | ||
179 : | setf.sig f6 = r14 | ||
// f7 keeps the additive term for the whole loop.
180 : | setf.sig f7 = r15 | ||
// r34 = 2 * quant.
181 : | shladd r34 = r34, 1, r0 | ||
182 : | ;; | ||
// f8 = in[0] * r35.
183 : | xma.l f8 = f6, f8, f0 | ||
184 : | .save ar.lc, r2 | ||
185 : | mov r2 = ar.lc | ||
186 : | ;; | ||
187 : | .body | ||
188 : | getf.sig r14 = f8 | ||
// From here on f6 = 2 * quant.
189 : | setf.sig f6 = r34 | ||
190 : | ;; | ||
// Store the product first, then patch the slot if it is out of range.
191 : | sxt2 r15 = r14 | ||
192 : | st2 [r32] = r14 | ||
193 : | ;; | ||
// p7 = (value < -2048)
194 : | cmp4.le p6, p7 = r16, r15 | ||
195 : | ;; | ||
196 : | (p7) st2 [r32] = r16 | ||
197 : | (p7) br.cond.dptk .L32 | ||
198 : | addl r14 = 2047, r0 | ||
199 : | ;; | ||
// p7 = (value > 2047)
200 : | cmp4.ge p6, p7 = r14, r15 | ||
201 : | ;; | ||
202 : | (p7) st2 [r32] = r14 | ||
203 : | .L32: | ||
// Loop set-up: ar.lc = 62 (63 passes), r19 = index 1,
// r22 = 2048, r21 = -2048, r20 = 2047.
204 : | addl r14 = 62, r0 | ||
205 : | addl r19 = 1, r0 | ||
206 : | addl r22 = 2048, r0 | ||
207 : | addl r21 = -2048, r0 | ||
208 : | addl r20 = 2047, r0 | ||
209 : | ;; | ||
210 : | mov ar.lc = r14 | ||
211 : | ;; | ||
212 : | .L56: | ||
// r16 = i * 2 : byte offset; r14 = &in[i]; r17 = &out[i].
213 : | dep.z r16 = r19, 1, 32 | ||
214 : | ;; | ||
215 : | add r14 = r16, r33 | ||
216 : | add r17 = r16, r32 | ||
217 : | adds r19 = 1, r19 | ||
218 : | ;; | ||
219 : | ld2 r15 = [r14] | ||
220 : | ;; | ||
221 : | sxt2 r15 = r15 | ||
222 : | ;; | ||
// p7 = (in[i] == 0) ; p8 = (in[i] >= 0)
223 : | cmp4.ne p6, p7 = 0, r15 | ||
224 : | cmp4.le p8, p9 = r0, r15 | ||
225 : | ;; | ||
// Zero input: store 0 and go straight to the loop branch.
226 : | (p7) st2 [r17] = r0 | ||
227 : | (p7) br.cond.dpnt .L36 | ||
228 : | add r18 = r16, r32 | ||
// r17 = -in[i], used only on the negative path.
229 : | sub r17 = r0, r15 | ||
230 : | ;; | ||
231 : | mov r14 = r18 | ||
232 : | (p8) br.cond.dptk .L40 | ||
// --- in[i] < 0 ---
233 : | setf.sig f8 = r17 | ||
234 : | ;; | ||
// f8 = 2*quant * (-in[i]) + add
235 : | xma.l f8 = f6, f8, f7 | ||
236 : | ;; | ||
237 : | getf.sig r15 = f8 | ||
238 : | ;; | ||
// p6 = (2048 < t): saturate to -2048, otherwise store -t.
239 : | cmp4.lt p6, p7 = r22, r15 | ||
240 : | sub r16 = r0, r15 | ||
241 : | ;; | ||
242 : | (p7) st2 [r14] = r16 | ||
243 : | (p6) st2 [r14] = r21 | ||
244 : | br .L36 | ||
245 : | .L40: | ||
// --- in[i] > 0 ---
246 : | setf.sig f8 = r15 | ||
247 : | ;; | ||
// f8 = 2*quant * in[i] + add
248 : | xma.l f8 = f6, f8, f7 | ||
249 : | ;; | ||
250 : | getf.sig r15 = f8 | ||
251 : | ;; | ||
// p6 = (2047 <= t): saturate to 2047, otherwise store t.
252 : | cmp4.le p6, p7 = r20, r15 | ||
253 : | ;; | ||
254 : | (p6) mov r14 = r20 | ||
255 : | (p7) mov r14 = r15 | ||
256 : | ;; | ||
257 : | st2 [r18] = r14 | ||
258 : | .L36: | ||
259 : | br.cloop.sptk.few .L56 | ||
260 : | ;; | ||
// Restore the caller's loop counter before returning.
261 : | mov ar.lc = r2 | ||
262 : | br.ret.sptk.many b0 | ||
263 : | .endp dequant_intra_ia64# | ||
264 : | |||
265 : | |||
266 : | |||
267 : | //uint32_t quant_inter_ia64(int16_t *coeff, const int16_t *data, const uint32_t quant) | ||
// ---------------------------------------------------------------------------
// Hand-scheduled, software-pipelined version of the C loop quoted in the
// comments below.
//   In : r32 = coeff (output array), r33 = data (input array), r34 = quant
//   Out: r8  = sum of the quantised magnitudes
// Working registers:
//   r29 = mult = multipliers[quant]     r20 = quant_m_2 = (quant & 0x7fff) << 1
//   r21 = quant_d_2 = quant >> 1        r15 = read pointer, r18 = write pointer
// Saved and restored around the loop: ar.lc (r2), ar.pfs (r9), pr (r10),
// ar.ec (r17).
// The alloc makes r32-r55 a rotating region, so the incoming arguments are
// copied out of r32/r33/r34 before the loop starts overwriting them.
// ---------------------------------------------------------------------------
268 : | |||
269 : | |||
270 : | |||
271 : | .common quant_inter#,8,8 | ||
272 : | .align 16 | ||
273 : | .global quant_inter_ia64# | ||
274 : | .proc quant_inter_ia64# | ||
275 : | quant_inter_ia64: | ||
276 : | |||
277 : | |||
278 : | /******************************************************** | ||
279 : | * * | ||
280 : | * const uint32_t mult = multipliers[quant]; * | ||
281 : | * const uint16_t quant_m_2 = quant << 1; * | ||
282 : | * const uint16_t quant_d_2 = quant >> 1; * | ||
283 : | * int sum = 0; * | ||
284 : | * uint32_t i; * | ||
285 : | * int16_t acLevel,acL; * | ||
286 : | * * | ||
287 : | ********************************************************/ | ||
288 : | |||
289 : | |||
290 : | |||
// LL is an assembler symbol; every stage index and the epilog count below is
// expressed relative to it.
291 : | LL=3 // LL = load latency | ||
292 : | |||
293 : | .prologue | ||
294 : | addl r14 = @ltoff(multipliers#), gp | ||
// r15 = quant * 4 : byte offset into multipliers[].
295 : | dep.z r15 = r34, 2, 32 | ||
296 : | .save ar.lc, r2 | ||
297 : | mov r2 = ar.lc | ||
298 : | ;; | ||
299 : | .body | ||
// 24 local registers, all of them rotating (r32-r55).
300 : | alloc r9=ar.pfs,0,24,0,24 | ||
301 : | mov r17 = ar.ec | ||
302 : | mov r10 = pr | ||
303 : | ld8 r14 = [r14] | ||
304 : | extr.u r16 = r34, 1, 16 //r16 = quant_d_2 | ||
305 : | dep.z r20 = r34, 1, 15 //r20 = quant_m_2 | ||
306 : | ;; | ||
307 : | add r15 = r15, r14 | ||
308 : | mov r21 = r16 //r21 = quant_d_2 | ||
309 : | mov r8 = r0 //r8 = sum = 0 | ||
310 : | mov pr.rot = 0 //p16-p63 = 0 | ||
311 : | ;; | ||
312 : | ld4 r15 = [r15] | ||
// ar.lc = 63 -> 64 elements enter the pipeline.
313 : | addl r14 = 63, r0 | ||
314 : | mov pr.rot = 1 << 16 //p16=1 | ||
315 : | ;; | ||
316 : | mov ar.lc = r14 | ||
// Epilog count = number of pipeline stages (stage 0 .. stage LL+8).
317 : | mov ar.ec = LL+9 | ||
// r29 = mult
318 : | mov r29 = r15 | ||
319 : | ;; | ||
320 : | mov r15 = r33 //r15 = data | ||
321 : | mov r18 = r32 //r18 = coeff | ||
322 : | ;; | ||
323 : | |||
324 : | |||
// Rotating register / predicate names. p[k] is true while an element is in
// stage k; ac1/ac2/ac3 carry the value as it moves through the stages.
325 : | .rotr ac1[LL+3], ac2[8], ac3[2] | ||
326 : | .rotp p[LL+9], cmp1[8], cmp1neg[8],cmp2[5], cmp2neg[2] | ||
327 : | |||
328 : | |||
329 : | |||
330 : | /******************************************************************************** | ||
331 : | * * | ||
332 : | * for (i = 0; i < 64; i++) { * | ||
333 : | * acL=acLevel = data[i]; * | ||
334 : | * acLevel = ((acLevel < 0)?-acLevel:acLevel) - quant_d_2; * | ||
335 : | * if (acLevel < quant_m_2){ * | ||
336 : | * acLevel = 0; * | ||
337 : | * } * | ||
338 : | * acLevel = (acLevel * mult) >> SCALEBITS; * | ||
339 : | * sum += acLevel; * | ||
340 : | * coeff[i] = ((acL < 0)?-acLevel:acLevel); * | ||
341 : | * } * | ||
342 : | * * | ||
343 : | ********************************************************************************/ | ||
344 : | |||
345 : | |||
346 : | |||
// Stage map (the number after "//" on each instruction is its stage):
//   0     load data[i]                    LL    sign-extend to 16 bits
//   LL+1  negate; test sign               LL+2  |acLevel| - quant_d_2
//   LL+3  sign-extend                     LL+4  compare with quant_m_2
//   LL+5  zero if below threshold         LL+6  (acLevel * mult) >> 16
//   LL+7  re-apply sign                   LL+8  store coeff[i]; add to sum
// The loop body is in .explicit mode: bundles and stops are written by hand,
// so instruction order inside the bundles must not be changed casually.
347 : | .explicit | ||
348 : | .L58: | ||
349 : | //pipeline stage | ||
350 : | {.mmi | ||
351 : | (p[0]) ld2 ac1[0] = [r15],2 // 0 acL=acLevel = data[i]; | ||
352 : | (p[LL+1]) sub ac2[0] = r0, ac1[LL+1] // LL+1 ac2=-acLevel | ||
353 : | (p[LL]) sxt2 ac1[LL] = ac1[LL] // LL | ||
354 : | } | ||
355 : | {.mmi | ||
356 : | (p[LL+1]) cmp4.le cmp1[0], cmp1neg[0] = r0, ac1[LL+1] // LL+1 cmp1 = (0<=acLevel) ; cmp1neg = !(0<=acLevel) | ||
357 : | (p[LL+4]) cmp4.le cmp2[0], cmp2neg[0] = r20, ac2[3] // LL+4 cmp2 = (quant_m_2 <= acLevel) ; cmp2neg = !(quant_m_2 <= acLevel) | ||
358 : | (cmp1[1]) sub ac2[1] = ac1[LL+2], r21 // LL+2 acLevel = acLevel - quant_d_2; | ||
359 : | } | ||
360 : | {.mmi | ||
361 : | (cmp2neg[1]) mov ac2[4] = r0 // LL+5 if (acLevel < quant_m_2) acLevel=0; | ||
362 : | (cmp1neg[1]) sub ac2[1] = ac2[1], r21 // LL+2 acLevel = ac2 - quant_d_2; | ||
363 : | (p[LL+3]) sxt2 ac2[2] = ac2[2] // LL+3 | ||
364 : | } | ||
365 : | {.mmi | ||
366 : | (cmp1[6]) mov ac3[0] = ac2[6] // LL+7 ac3 = acLevel; | ||
367 : | (cmp1neg[6]) sub ac3[0] = r0, ac2[6] // LL+7 ac3 = -acLevel; | ||
368 : | (p[LL+6]) pmpyshr2.u ac2[5] = r29, ac2[5], 16 // LL+6 acLevel = (acLevel * mult) >> SCALEBITS; | ||
369 : | } | ||
370 : | {.mib | ||
371 : | (p[LL+8]) st2 [r18] = ac3[1] , 2 // LL+8 coeff[i] = ac3; | ||
// The add is skipped for elements below the threshold (their acLevel was forced to 0 at LL+5).
372 : | (cmp2[4]) add r8 = r8, ac2[7] // LL+8 sum += acLevel; | ||
373 : | br.ctop.sptk.few .L58 | ||
374 : | ;; | ||
375 : | } | ||
376 : | .default | ||
// Epilogue: put back ar.ec, ar.lc, the predicates and ar.pfs; r8 holds the sum.
377 : | mov ar.ec = r17 | ||
378 : | ;; | ||
379 : | mov ar.lc = r2 | ||
380 : | mov pr = r10, -1 | ||
381 : | mov ar.pfs = r9 | ||
382 : | br.ret.sptk.many b0 | ||
383 : | .endp quant_inter_ia64# | ||
384 : | |||
385 : | |||
386 : | |||
387 : | |||
388 : | |||
389 : | |||
390 : | |||
391 : | // void dequant_inter_ia64(int16_t *data, const int16_t *coeff, const uint32_t quant) | ||
// ---------------------------------------------------------------------------
// Hand-scheduled, software-pipelined version of the C loop quoted in the
// comments below.
//   In : r32 = data (output array), r33 = coeff (input array), r34 = quant
// Working registers:
//   r29 = quant_m_2 = (quant & 0x7fff) << 1
//   r15 = quant_add = quant - (1 & ~quant)   (quant if odd, quant - 1 if even)
//   r20 = 2047, r21 = -2048 (saturation bounds)
//   r14 = read pointer, r18 = write pointer
// Saved and restored around the loop: ar.lc (r2), ar.pfs (r9), ar.ec (r16),
// pr (r17).
// The alloc makes r32-r63 a rotating region, so the incoming arguments are
// copied out of r32/r33/r34 before the loop starts overwriting them.
// ---------------------------------------------------------------------------
392 : | |||
393 : | .common dequant_inter#,8,8 | ||
394 : | .align 16 | ||
395 : | .global dequant_inter_ia64# | ||
396 : | .proc dequant_inter_ia64# | ||
397 : | dequant_inter_ia64: | ||
398 : | |||
399 : | //*********************************************************************** | ||
400 : | // * | ||
401 : | // const uint16_t quant_m_2 = quant << 1; * | ||
402 : | // const uint16_t quant_add = (quant & 1 ? quant : quant - 1); * | ||
403 : | // uint32_t i; * | ||
404 : | // * | ||
405 : | //*********************************************************************** * | ||
406 : | |||
407 : | |||
408 : | |||
// LL must be assigned before "mov ar.ec = LL+10": an assembler symbol is
// evaluated with the value it has at the point of use, and an earlier routine
// in this file leaves LL at 3. With LL = 2 the epilog count is 12, matching
// the 12 stages (0 .. LL+9) declared by .rotp/.rotr and used in the loop.
409 : | LL=2 // LL := load latency | ||
410 : | .prologue | ||
411 : | andcm r14 = 1, r34 // r14 = 1 & ~quant (1 if quant is even) | ||
412 : | dep.z r29 = r34, 1, 15 // r29 = quant_m_2 | ||
413 : | alloc r9=ar.pfs,0,32,0,32 | ||
414 : | .save ar.lc, r2 | ||
415 : | mov r2 = ar.lc | ||
416 : | ;; | ||
417 : | .body | ||
418 : | sub r15 = r34, r14 // r15 = quant_add | ||
419 : | addl r14 = 63, r0 | ||
420 : | addl r21 = -2048, r0 | ||
421 : | addl r20 = 2047, r0 | ||
422 : | mov r16 = ar.ec | ||
423 : | mov r17 = pr | ||
424 : | ;; | ||
425 : | zxt2 r15 = r15 | ||
426 : | mov ar.lc = r14 | ||
427 : | mov pr.rot = 0 | ||
428 : | ;; | ||
429 : | adds r14 = 0, r33 // r14 = coeff | ||
430 : | mov r18 = r32 // r18 = data | ||
// Epilog count = number of pipeline stages (stage 0 .. stage LL+9).
431 : | mov ar.ec = LL+10 | ||
432 : | mov pr.rot = 1 << 16 | ||
433 : | ;; | ||
434 : | |||
435 : | /******************************************************************************** | ||
436 : | * * | ||
437 : | *for (i = 0; i < 64; i++) { * | ||
438 : | * int16_t acLevel = coeff[i]; * | ||
439 : | * * | ||
440 : | * if (acLevel == 0) * | ||
441 : | * { * | ||
442 : | * data[i] = 0; * | ||
443 : | * } * | ||
444 : | * else if (acLevel < 0) * | ||
445 : | * { * | ||
446 : | * acLevel = acLevel * quant_m_2 - quant_add; * | ||
447 : | * data[i] = (acLevel >= -2048 ? acLevel : -2048); * | ||
448 : | * } * | ||
449 : | * else // if (acLevel > 0) * | ||
450 : | * { * | ||
451 : | * acLevel = acLevel * quant_m_2 + quant_add; * | ||
452 : | * data[i] = (acLevel <= 2047 ? acLevel : 2047); * | ||
453 : | * } * | ||
454 : | * } * | ||
455 : | * * | ||
456 : | ********************************************************************************/ | ||
457 : | |||
458 : | |||
459 : | |||
460 : | // LL (load latency, = 2) is assigned ahead of the prologue so that ar.ec and the rotating-register sizes agree | ||
461 : | |||
462 : | |||
// Rotating register / predicate names. p[k] is true while an element is in
// stage k. x holds the saturation bound chosen for the element's sign; y1/y2
// hold the two candidate results selected between at stage LL+7.
463 : | .rotr ac1[LL+10], x[5], y1[3], y2[3] | ||
464 : | .rotp p[LL+10] , cmp1neg[8], cmp2[5], cmp2neg[5],cmp3[2], cmp3neg[2] | ||
465 : | |||
// Stage map (the number after "//" on each instruction is its stage):
//   0     load coeff[i]                     LL    sign-extend to 16 bits
//   LL+1  cmp1neg = (acLevel == 0), cmp2 = (acLevel >= 0)
//   LL+2  x = bound (2047 / -2048); acLevel *= quant_m_2 (16-bit lanes)
//   LL+3  acLevel +/- quant_add             LL+4  sign-extend
//   LL+5  set up y1 / y2                    LL+6  cmp3 = (bound <= acLevel)
//   LL+7  pick y1 or y2                     LL+8  force 0 when the input was 0
//   LL+9  store data[i]
// The loop body is in .explicit mode: bundles and stops are written by hand.
466 : | .explicit | ||
467 : | //pipeline stage | ||
468 : | |||
469 : | .L60: | ||
470 : | {.mmi | ||
471 : | (p[0])ld2 ac1[0] = [r14] ,2 // 0 acLevel = coeff[i]; | ||
472 : | (p[LL+1])cmp4.ne p6, cmp1neg[0] = 0, ac1[LL+1] // LL+1 cmp1neg = (acLevel == 0); p6 is not read inside the loop | ||
473 : | (p[LL])sxt2 ac1[LL] = ac1[LL] // LL | ||
474 : | |||
475 : | } | ||
476 : | {.mmi | ||
477 : | (p[LL+1])cmp4.le cmp2[0], cmp2neg[0] = r0, ac1[LL+1] // LL+1 cmp2 = (0 <= acLevel) | ||
478 : | (cmp2[1]) mov x[0] = r20 // LL+2 bound = 2047 | ||
479 : | (p[LL+2])pmpyshr2.u ac1[LL+2] = r29, ac1[LL+2], 0 // LL+2 acLevel * quant_m_2 | ||
480 : | } | ||
481 : | {.mmi | ||
482 : | (cmp2neg[1]) mov x[0] = r21 // LL+2 bound = -2048 | ||
483 : | (cmp2[2]) add ac1[LL+3] = ac1[LL+3], r15 // LL+3 + quant_add | ||
484 : | (cmp2neg[2]) sub ac1[LL+3] = ac1[LL+3], r15 // LL+3 - quant_add | ||
485 : | |||
486 : | } | ||
487 : | {.mmi | ||
488 : | (cmp2neg[4]) mov y1[0] = ac1[LL+5] // LL+5 | ||
489 : | (cmp2neg[4]) mov y2[0] = x[3] // LL+5 | ||
490 : | (p[LL+4])sxt2 ac1[LL+4] = ac1[LL+4] // LL+4 | ||
491 : | } | ||
492 : | {.mmi | ||
493 : | (cmp2[4]) mov y1[0] = x[3] // LL+5 | ||
494 : | (cmp2[4]) mov y2[0] = ac1[LL+5] // LL+5 | ||
495 : | (p[LL+6])cmp4.le cmp3[0], cmp3neg[0] = x[4], ac1[LL+6] // LL+6 cmp3 = (bound <= acLevel) | ||
496 : | } | ||
497 : | {.mmi | ||
498 : | (cmp3[1]) mov ac1[LL+7] = y1[2] // LL+7 | ||
499 : | (cmp3neg[1]) mov ac1[LL+7] = y2[2] // LL+7 | ||
500 : | (cmp1neg[7]) mov ac1[LL+8] = r0 // LL+8 | ||
501 : | } | ||
502 : | {.mbb | ||
503 : | (p[LL+9])st2 [r18] = ac1[LL+9] ,2 // LL+9 | ||
504 : | nop.b 0x0 | ||
505 : | br.ctop.sptk.few .L60 | ||
506 : | ;; | ||
507 : | } | ||
508 : | .default | ||
// Epilogue: put back ar.lc, ar.pfs, ar.ec and the predicates, then return.
509 : | mov ar.lc = r2 | ||
510 : | mov ar.pfs = r9 | ||
511 : | mov ar.ec = r16 | ||
512 : | mov pr = r17, -1 | ||
513 : | ;; | ||
514 : | |||
515 : | br.ret.sptk.many b0 | ||
516 : | .endp dequant_inter_ia64# | ||
517 : | .ident "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.1 2.96-85)" |
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |