1 |
.file "quant_h263.1.c" |
//******************************************************************************* |
2 |
|
//* * |
3 |
|
//* functions quant_inter and dequant_inter have been software-pipelined * |
4 |
|
//* use was made of the pmpyshr2 instruction * |
5 |
|
//* * |
6 |
|
//* by Christian Engel and Hans-Joachim Daniels * |
7 |
|
//* christian.engel@ira.uka.de hans-joachim.daniels@ira.uka.de * |
8 |
|
//* * |
9 |
|
//* This was made for the ia64 DivX laboratory (yes, it was really called * |
10 |
|
//* this way, originally OpenDivX was intended, but died shortly before our * |
11 |
|
//* work started (you will probably already know ...)) * |
12 |
|
//* at the Universitat Karlsruhe (TH) held between April and July 2002 * |
13 |
|
//* http://www.info.uni-karlsruhe.de/~rubino/ia64p/ * |
14 |
|
//* * |
15 |
|
//******************************************************************************* |
16 |
|
.file "quant_h263_ia64.s" |
17 |
.pred.safe_across_calls p1-p5,p16-p63 |
.pred.safe_across_calls p1-p5,p16-p63 |
18 |
.section .rodata |
.section .rodata |
19 |
.align 4 |
.align 4 |
58 |
.global quant_intra_ia64# |
.global quant_intra_ia64# |
59 |
.proc quant_intra_ia64# |
.proc quant_intra_ia64# |
60 |
quant_intra_ia64: |
quant_intra_ia64: |
61 |
.prologue 12, 37 |
.prologue |
62 |
.save ar.pfs, r38 |
.save ar.pfs, r38 |
63 |
alloc r38 = ar.pfs, 4, 3, 2, 0 |
alloc r38 = ar.pfs, 4, 3, 2, 0 |
64 |
adds r16 = -8, r12 |
adds r16 = -8, r12 |
149 |
(p6) st2 [r15] = r14 |
(p6) st2 [r15] = r14 |
150 |
br .L12 |
br .L12 |
151 |
.L14: |
.L14: |
152 |
.pred.rel.mutex p8, p9 |
.pred.rel "mutex", p8, p9 |
153 |
setf.sig f6 = r18 |
setf.sig f6 = r18 |
154 |
add r16 = r17, r32 |
add r16 = r17, r32 |
155 |
;; |
;; |
290 |
quant_inter_ia64: |
quant_inter_ia64: |
291 |
|
|
292 |
|
|
293 |
/******************************************************** |
//******************************************************* |
294 |
* * |
//* * |
295 |
* const uint32_t mult = multipliers[quant]; * |
//* const uint32_t mult = multipliers[quant]; * |
296 |
* const uint16_t quant_m_2 = quant << 1; * |
//* const uint16_t quant_m_2 = quant << 1; * |
297 |
* const uint16_t quant_d_2 = quant >> 1; * |
//* const uint16_t quant_d_2 = quant >> 1; * |
298 |
* int sum = 0; * |
//* int sum = 0; * |
299 |
* uint32_t i; * |
//* uint32_t i; * |
300 |
* int16_t acLevel,acL; * |
//* int16_t acLevel,acL; * |
301 |
* * |
//* * |
302 |
********************************************************/ |
//*******************************************************/ |
303 |
|
|
304 |
|
|
305 |
|
|
306 |
LL=3 // LL = load latency |
LL=3 // LL = load latency |
307 |
|
//if LL is changed, you'll also have to change the .pred.rel... parts below! |
308 |
.prologue |
.prologue |
309 |
addl r14 = @ltoff(multipliers#), gp |
addl r14 = @ltoff(multipliers#), gp |
310 |
dep.z r15 = r34, 2, 32 |
dep.z r15 = r34, 2, 32 |
342 |
|
|
343 |
|
|
344 |
|
|
345 |
/******************************************************************************** |
//******************************************************************************* |
346 |
* * |
//* * |
347 |
* for (i = 0; i < 64; i++) { * |
//* for (i = 0; i < 64; i++) { * |
348 |
* acL=acLevel = data[i]; * |
//* acL=acLevel = data[i]; * |
349 |
* acLevel = ((acLevel < 0)?-acLevel:acLevel) - quant_d_2; * |
//* acLevel = ((acLevel < 0)?-acLevel:acLevel) - quant_d_2; * |
350 |
* if (acLevel < quant_m_2){ * |
//* if (acLevel < quant_m_2){ * |
351 |
* acLevel = 0; * |
//* acLevel = 0; * |
352 |
* } * |
//* } * |
353 |
* acLevel = (acLevel * mult) >> SCALEBITS; * |
//* acLevel = (acLevel * mult) >> SCALEBITS; * |
354 |
* sum += acLevel; * |
//* sum += acLevel; * |
355 |
* coeff[i] = ((acL < 0)?-acLevel:acLevel); * |
//* coeff[i] = ((acL < 0)?-acLevel:acLevel); * |
356 |
* } * |
//* } * |
357 |
* * |
//* * |
358 |
********************************************************************************/ |
//*******************************************************************************/ |
359 |
|
|
360 |
|
|
361 |
|
|
362 |
.explicit |
.explicit |
363 |
.L58: |
.L58: |
364 |
|
.pred.rel "clear", p29, p37 |
365 |
|
.pred.rel "mutex", p29, p37 |
366 |
|
|
367 |
//pipeline stage |
//pipeline stage |
368 |
{.mmi |
{.mmi |
369 |
(p[0]) ld2 ac1[0] = [r15],2 // 0 acL=acLevel = data[i]; |
(p[0]) ld2 ac1[0] = [r15],2 // 0 acL=acLevel = data[i]; |
381 |
(p[LL+3]) sxt2 ac2[2] = ac2[2] // LL+3 |
(p[LL+3]) sxt2 ac2[2] = ac2[2] // LL+3 |
382 |
} |
} |
383 |
{.mmi |
{.mmi |
384 |
|
.pred.rel "mutex", p34, p42 |
385 |
(cmp1[6]) mov ac3[0] = ac2[6] // LL+7 ac3 = acLevel; |
(cmp1[6]) mov ac3[0] = ac2[6] // LL+7 ac3 = acLevel; |
386 |
(cmp1neg[6]) sub ac3[0] = r0, ac2[6] // LL+7 ac3 = -acLevel; |
(cmp1neg[6]) sub ac3[0] = r0, ac2[6] // LL+7 ac3 = -acLevel; |
387 |
(p[LL+6]) pmpyshr2.u ac2[5] = r29, ac2[5], 16 // LL+6 acLevel = (acLevel * mult) >> SCALEBITS; |
(p[LL+6]) pmpyshr2.u ac2[5] = r29, ac2[5], 16 // LL+6 acLevel = (acLevel * mult) >> SCALEBITS; |
392 |
br.ctop.sptk.few .L58 |
br.ctop.sptk.few .L58 |
393 |
;; |
;; |
394 |
} |
} |
395 |
|
|
396 |
|
.pred.rel "clear", p29, p37 |
397 |
.default |
.default |
398 |
mov ar.ec = r17 |
mov ar.ec = r17 |
399 |
;; |
;; |
418 |
dequant_inter_ia64: |
dequant_inter_ia64: |
419 |
|
|
420 |
//*********************************************************************** |
//*********************************************************************** |
421 |
// * |
//* * |
422 |
// const uint16_t quant_m_2 = quant << 1; * |
//* const uint16_t quant_m_2 = quant << 1; * |
423 |
// const uint16_t quant_add = (quant & 1 ? quant : quant - 1); * |
//* const uint16_t quant_add = (quant & 1 ? quant : quant - 1); * |
424 |
// uint32_t i; * |
//* uint32_t i; * |
425 |
// * |
//* * |
426 |
//*********************************************************************** * |
//*********************************************************************** |
427 |
|
|
428 |
|
|
429 |
|
|
453 |
mov pr.rot = 1 << 16 |
mov pr.rot = 1 << 16 |
454 |
;; |
;; |
455 |
|
|
456 |
/******************************************************************************** |
//******************************************************************************* |
457 |
* * |
//* * |
458 |
*for (i = 0; i < 64; i++) { * |
//*for (i = 0; i < 64; i++) { * |
459 |
* int16_t acLevel = coeff[i]; * |
//* int16_t acLevel = coeff[i]; * |
460 |
* * |
//* * |
461 |
* if (acLevel == 0) * |
//* if (acLevel == 0) * |
462 |
* { * |
//* { * |
463 |
* data[i] = 0; * |
//* data[i] = 0; * |
464 |
* } * |
//* } * |
465 |
* else if (acLevel < 0) * |
//* else if (acLevel < 0) * |
466 |
* { * |
//* { * |
467 |
* acLevel = acLevel * quant_m_2 - quant_add; * |
//* acLevel = acLevel * quant_m_2 - quant_add; * |
468 |
* data[i] = (acLevel >= -2048 ? acLevel : -2048); * |
//* data[i] = (acLevel >= -2048 ? acLevel : -2048); * |
469 |
* } * |
//* } * |
470 |
* else // if (acLevel > 0) * |
//* else // if (acLevel > 0) * |
471 |
* { * |
//* { * |
472 |
* acLevel = acLevel * quant_m_2 + quant_add; * |
//* acLevel = acLevel * quant_m_2 + quant_add; * |
473 |
* data[i] = (acLevel <= 2047 ? acLevel : 2047); * |
//* data[i] = (acLevel <= 2047 ? acLevel : 2047); * |
474 |
* } * |
//* } * |
475 |
* } * |
//* } * |
476 |
* * |
//* * |
477 |
********************************************************************************/ |
//*******************************************************************************/ |
478 |
|
|
479 |
|
|
480 |
|
|
481 |
LL=2 // LL := load latency |
LL=2 // LL := load latency |
482 |
|
//if LL is changed, you'll also have to change the .pred.rel... parts below! |
483 |
|
|
484 |
|
|
485 |
.rotr ac1[LL+10], x[5], y1[3], y2[3] |
.rotr ac1[LL+10], x[5], y1[3], y2[3] |
489 |
//pipeline stage |
//pipeline stage |
490 |
|
|
491 |
.L60: |
.L60: |
492 |
|
.pred.rel "clear", p36 |
493 |
|
.pred.rel "mutex", p47, p49 |
494 |
|
.pred.rel "mutex", p46, p48 |
495 |
|
.pred.rel "mutex", p40, p45 |
496 |
|
.pred.rel "mutex", p39, p44 |
497 |
|
.pred.rel "mutex", p38, p43 |
498 |
|
.pred.rel "mutex", p37, p42 |
499 |
|
.pred.rel "mutex", p36, p41 |
500 |
{.mmi |
{.mmi |
501 |
(p[0])ld2 ac1[0] = [r14] ,2 // 0 acLevel = coeff[i]; |
(p[0])ld2 ac1[0] = [r14] ,2 // 0 acLevel = coeff[i]; |
502 |
(p[LL+1])cmp4.ne p6, cmp1neg[0] = 0, ac1[LL+1] // LL+1 |
(p[LL+1])cmp4.ne p6, cmp1neg[0] = 0, ac1[LL+1] // LL+1 |
520 |
(p[LL+4])sxt2 ac1[LL+4] = ac1[LL+4] // LL+4 |
(p[LL+4])sxt2 ac1[LL+4] = ac1[LL+4] // LL+4 |
521 |
} |
} |
522 |
{.mmi |
{.mmi |
523 |
(cmp2[4]) mov y1[0] = x[3] // LL+4 |
(cmp2[4]) mov y1[0] = x[3] // LL+5 |
524 |
(cmp2[4]) mov y2[0] = ac1[LL+5] // LL+4 |
(cmp2[4]) mov y2[0] = ac1[LL+5] // LL+5 |
525 |
(p[LL+6])cmp4.le cmp3[0], cmp3neg[0] = x[4], ac1[LL+6] // LL+6 |
(p[LL+6])cmp4.le cmp3[0], cmp3neg[0] = x[4], ac1[LL+6] // LL+6 |
526 |
} |
} |
527 |
{.mmi |
{.mmi |
535 |
br.ctop.sptk.few .L60 |
br.ctop.sptk.few .L60 |
536 |
;; |
;; |
537 |
} |
} |
538 |
|
.pred.rel "clear", p36 |
539 |
.default |
.default |
540 |
mov ar.lc = r2 |
mov ar.lc = r2 |
541 |
mov ar.pfs = r9 |
mov ar.pfs = r9 |