--- trunk/xvidcore/src/utils/ia64_asm/mem_transfer_ia64.s	2002/07/04 13:41:53	255
+++ trunk/xvidcore/src/utils/ia64_asm/mem_transfer_ia64.s	2002/07/05 14:01:18	256
@@ -1,590 +1,680 @@
-	.file	"mem_transfer.c"
-	.pred.safe_across_calls p1-p5,p16-p63
-	.common	transfer_8to16copy#,8,8
-.text
+/****************************************************************************
+*
+* mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel,
+* University of Karlsruhe, Germany, 03.06.2002, during the laboratory
+* "IA-64 Video Codec Assembler Praktikum" at IPD Goos.
+*
+* Annotations:
+* ============
+*
+* - All functions work on 8x8 matrices. While the C code functions treat each
+* element separately, the functions in this assembler code treat a whole line
+* simultaneously, so one loop is saved. The remaining loop is realized by
+* using software pipelining with rotating registers.
+* - Register renaming is used for better readability.
+* - To load 8 bytes of misaligned data, two aligned 8-byte blocks are loaded,
+* both parts are shifted and joined together with an "OR" instruction.
+* - The first parameter is passed in GR 32, the next in GR 33, and so on. They
+* must be saved, as these GRs are used for register rotation.
+* - Some of the original, German comments used during development are left
+* in the code. They shouldn't bother anyone.
+*
+* Anmerkungen:
+* ============
+*
+* - Alle Funktionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code
+* jedes Element einzeln bearbeiten, bearbeiten die Funktionen dieses Assembler-
+* Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden.
+* Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit
+* rotierenden Registern realisiert.
+* - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet.
+* - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke
+* geladen, beide Teile mit "shift"-Operationen zurechtgerückt und mit einem
+* logischen Oder zusammenkopiert.
+* - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge-
+* sichert werden, da die Register für die Register-Rotation benötigt werden.
+* - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase
+* sind im Code verblieben. Sie sollten niemanden stören.
+*
+****************************************************************************/
+
+
+// *** define Latencies for software pipelines ***
+
+	LL = 3		// Load
+	SL = 3		// Store
+	PL = 1		// Pack
+	SHL = 1		// Shift
+	OL = 1		// Or
+	UL = 1		// Unpack
+	PAL = 1		// Parallel Add
+	PSL = 1		// Parallel Subtract
+	PAVGL = 1	// Parallel Average
+
+	.text
+
+
+/****************************************************************************
+*
+* transfer8x8_copy_ia64
+*
+* SRC is misaligned. To align the source, load two 8-byte words, shift both
+* parts, join them and store the aligned source at the destination address.
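+*
+* For reference, the intended behaviour corresponds to the following C
+* sketch (a sketch only; the function and parameter names mirror the
+* register renaming used below and are not copied from the original C
+* source, and <stdint.h> types are assumed):
+*
+*   void transfer8x8_copy(uint8_t *dst, const uint8_t *src, uint32_t stride)
+*   {
+*       int i, j;
+*       for (j = 0; j < 8; j++)        // one iteration per 8-pixel line
+*           for (i = 0; i < 8; i++)    // handled as a single 8-byte word below
+*               dst[j * stride + i] = src[j * stride + i];
+*   }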
+*
+****************************************************************************/
+
+	.align 16
+	.global transfer8x8_copy_ia64#
+	.proc transfer8x8_copy_ia64#
+
+transfer8x8_copy_ia64:
+	.prologue
+
+// *** register renaming ***
+	zero = r0
+
+	oldLC = r2
+	oldPR = r3
+
+	src_1 = r14		// left aligned address of src
+	src_2 = r15		// right aligned address of src
+	dst = r16		// destination address
+	stride = r17
+
+	offset = r18		// shift right offset
+	aoffset = r19		// shift left offset
+
+
+	.body
+
+// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
+	.save ar.lc, oldLC
+	mov oldLC = ar.lc
+	mov oldPR = pr
+
+// *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR ***
+	alloc r9 = ar.pfs, 3, 29, 0, 32
+
+// *** Saving Parameters ***
+	mov dst = r32
+	mov stride = r34
+
+// *** Misalignment-Treatment ***
+	and src_1 = -8, r33		// Computing address of first aligned block containing src values
+	dep offset = r33, zero, 3, 3	// Extracting offset for shr from src address
+	;;
+	sub aoffset = 64, offset	// Computing counterpart of offset ("anti-offset"), used for shl
+	add src_2 = 8, src_1		// Computing address of second aligned block containing src values
+
+// *** init loop: set loop counter, epilog counter, predicates ***
+	mov ar.lc = 7
+	mov ar.ec = LL + SHL + OL + 1
+	mov pr.rot = 1 << 16
+	;;
+
+// *** define register arrays and predicate array for software pipeline ***
+	// src_v1 = source value 1, shd_r = shifted right, shd_l = shifted left
+	.rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1]
+	.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1]
+
+/* Software pipelined loop:
+* Stage 1: Load two 8-byte words from SRC_1, SRC_2 into SRC_V1 and SRC_V2
+* Stage 2: Shift both values of source to SHD_R and SHD_L
+* Stage 3: Join both parts together with OR
+* Stage 4: Store aligned data to destination and add stride to destination address */
+.Loop_8x8copy:
+	{.mii
+		(ld_stage[0]) ld8 src_v1[0] = [src_1], stride
+		(sh_stage[0]) shr.u shd_r[0] = src_v1[LL], offset
+	}
+	{.mii
+		(ld_stage[0]) ld8 src_v2[0] = [src_2], stride
+		(sh_stage[0]) shl shd_l[0] = src_v2[LL], aoffset
+		(or_stage[0]) or value[0] = shd_l[SHL], shd_r[SHL]
+	}
+	{.mib
+		(st_stage[0]) st8 [dst] = value[OL]
+		(st_stage[0]) add dst = dst, stride
+		br.ctop.sptk.few .Loop_8x8copy
+		;;
+	}
+
+// *** Restore old LC and PRs ***
+	mov ar.lc = oldLC
+	mov pr = oldPR, -1
+
+	br.ret.sptk.many b0
+
+	.endp transfer8x8_copy_ia64#
+
+
+
+
+/*****************************************************************************
+*
+* transfer_8to16copy_ia64
+*
+* SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values,
+* UNPACK is used. So 8 bytes are loaded from the source, unpacked to two
+* 4 x 16 bit values and stored to the destination. The destination is a
+* continuous array of 64 x 16 bit signed data. To store the next line, only
+* 16 must be added to the destination address.
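+*
+* A C sketch of this behaviour (illustrative only; names follow the register
+* renaming below rather than the original C source, <stdint.h> types
+* assumed):
+*
+*   void transfer_8to16copy(int16_t *dst, const uint8_t *src, uint32_t stride)
+*   {
+*       int i, j;
+*       for (j = 0; j < 8; j++)
+*           for (i = 0; i < 8; i++)
+*               dst[j * 8 + i] = (int16_t) src[j * stride + i];  // dst is continuous
+*   }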
+*****************************************************************************/ + .align 16 .global transfer_8to16copy_ia64# .proc transfer_8to16copy_ia64# + + transfer_8to16copy_ia64: .prologue - .save ar.lc, r2 - mov r2 = ar.lc + +// *** register renaming *** + oldLC = r2 + oldPR = r3 + + zero = r0 // damit ist die Zahl "zero" = 0 gemeint + + dst_1 = r14 // destination address for first 4 x 16 bit values + dst_2 = r15 // destination address for second 4 x 16 bit values + src = r16 + stride = r17 + + .body - addl r14 = 7, r0 - mov r21 = r0 - mov r20 = r0 - ;; - mov ar.lc = r14 - ;; -.L101: - addl r19 = 1, r0 - zxt4 r14 = r21 - dep.z r15 = r20, 1, 32 - ;; - add r16 = r21, r19 - add r14 = r33, r14 - add r17 = r20, r19 - ;; - ld1 r18 = [r14] - add r15 = r15, r32 - zxt4 r16 = r16 - ;; - st2 [r15] = r18 - addl r19 = 2, r0 - add r16 = r33, r16 - dep.z r17 = r17, 1, 32 - ;; - ld1 r15 = [r16] - add r14 = r21, r19 - add r18 = r20, r19 - add r17 = r17, r32 - ;; - zxt4 r14 = r14 - st2 [r17] = r15 - addl r19 = 3, r0 - ;; - add r14 = r33, r14 - add r15 = r21, r19 - dep.z r18 = r18, 1, 32 - ;; - ld1 r17 = [r14] - add r16 = r20, r19 - add r18 = r18, r32 - zxt4 r15 = r15 - ;; - st2 [r18] = r17 - addl r19 = 4, r0 - add r15 = r33, r15 - dep.z r16 = r16, 1, 32 - ;; - ld1 r18 = [r15] - add r14 = r21, r19 - add r17 = r20, r19 - add r16 = r16, r32 - ;; - zxt4 r14 = r14 - st2 [r16] = r18 - addl r19 = 5, r0 - ;; - add r14 = r33, r14 - add r15 = r21, r19 - add r16 = r20, r19 - dep.z r17 = r17, 1, 32 - ;; - ld1 r18 = [r14] - addl r19 = 6, r0 - add r17 = r17, r32 - zxt4 r15 = r15 - ;; - st2 [r17] = r18 - add r14 = r21, r19 - add r15 = r33, r15 - dep.z r16 = r16, 1, 32 - add r17 = r20, r19 - ;; - ld1 r18 = [r15] - add r16 = r16, r32 - zxt4 r14 = r14 - ;; - st2 [r16] = r18 - addl r19 = 7, r0 - add r14 = r33, r14 - ;; - ld1 r15 = [r14] - add r16 = r21, r19 - dep.z r17 = r17, 1, 32 - add r14 = r20, r19 - ;; - add r17 = r17, r32 - zxt4 r16 = r16 - ;; - st2 [r17] = r15 - dep.z r14 = r14, 1, 32 - add r16 = r33, r16 - ;; - add r14 = r14, r32 - ld1 r15 = [r16] - add r21 = r21, r34 - ;; - st2 [r14] = r15 - adds r20 = 8, r20 - br.cloop.sptk.few .L101 - ;; - mov ar.lc = r2 + +// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** + .save ar.lc, oldLC + mov oldLC = ar.lc + mov oldPR = pr + +// *** Allocating new stackframe, define rotating registers *** + alloc r9 = ar.pfs, 4, 92, 0, 96 + +// *** Saving Paramters *** + mov dst_1 = r32 // fist 4 x 16 bit values + add dst_2 = 8, r32 // second 4 x 16 bit values + mov src = r33 + mov stride = r34 + +// *** init loop: set loop counter, epilog counter, predicates *** + mov ar.lc = 7 + mov ar.ec = LL + UL + 1 + mov pr.rot = 1 << 16 + ;; + +// *** define register arrays and predicate array for software pipeline *** + // src_v = source value, dst_v1 = destination value 1 + .rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1] + .rotp ld_stage[LL], upack_stage[UL], st_stage[1] + +/* Software pipelined loop: +* Stage 1: Load value of SRC +* Stage 2: Unpack the SRC_V to two 4 x 16 bit signed data +* Stage 3: Store both 8 byte of 16 bit data */ +.Loop_8to16copy: + {.mii + (ld_stage[0]) ld8 src_v[0] = [src], stride + (upack_stage[0]) unpack1.l dst_v1[0] = zero, src_v[LL] + (upack_stage[0]) unpack1.h dst_v2[0] = zero, src_v[LL] + } + {.mmb + (st_stage[0]) st8 [dst_1] = dst_v1[UL], 16 + (st_stage[0]) st8 [dst_2] = dst_v2[UL], 16 + br.ctop.sptk.few .Loop_8to16copy + ;; + } + +// *** Restore old LC and PRs *** + mov ar.lc = oldLC + mov pr = oldPR, -1 + br.ret.sptk.many b0 .endp 
transfer_8to16copy_ia64# - .common transfer_16to8copy#,8,8 + + + + +/***************************************************************************** +* +* transfer_16to8copy_ia64 +* +* src is a 64 x 16 bit signed continuous array. To convert the 16 bit +* values to 8 bit unsigned data, PACK is used. So two 8-bytes-words of +* 4 x 16 bit signed data are loaded, packed together and stored a 8-byte-word +* of 8 x 8 unsigned data to the destination. +****************************************************************************/ + .align 16 .global transfer_16to8copy_ia64# .proc transfer_16to8copy_ia64# transfer_16to8copy_ia64: .prologue + +// *** register renaming *** + dst = r14 + src_1 = r15 + src_2 = r17 + stride = r16 + + .body - mov r22 = r0 - addl r21 = 255, r0 - mov r20 = r0 - mov r19 = r0 -.L25: - mov r18 = r0 - ;; -.L29: - add r14 = r19, r18 - ;; - dep.z r14 = r14, 1, 32 - ;; - add r14 = r14, r33 - ;; - ld2 r15 = [r14] - ;; - sxt2 r15 = r15 - ;; - mov r16 = r15 - ;; - cmp4.le p6, p7 = r0, r16 - ;; - (p7) mov r16 = r0 - (p7) br.cond.dpnt .L106 - ;; - cmp4.ge p6, p7 = r21, r16 - ;; - (p7) addl r16 = 255, r0 -.L106: - add r14 = r20, r18 - adds r17 = 1, r18 - ;; - zxt4 r14 = r14 - add r15 = r19, r17 - ;; - add r14 = r32, r14 - dep.z r15 = r15, 1, 32 - ;; - st1 [r14] = r16 - add r15 = r15, r33 - ;; - ld2 r14 = [r15] - ;; - sxt2 r14 = r14 - ;; - mov r16 = r14 - ;; - cmp4.le p6, p7 = r0, r16 - ;; - (p7) mov r16 = r0 - (p7) br.cond.dpnt .L110 - ;; - cmp4.ge p6, p7 = r21, r16 - ;; - (p7) addl r16 = 255, r0 -.L110: - add r14 = r20, r17 - adds r17 = 2, r18 - ;; - zxt4 r14 = r14 - add r15 = r19, r17 - ;; - add r14 = r32, r14 - dep.z r15 = r15, 1, 32 - ;; - st1 [r14] = r16 - add r15 = r15, r33 - ;; - ld2 r14 = [r15] - ;; - sxt2 r14 = r14 - ;; - mov r16 = r14 - ;; - cmp4.le p6, p7 = r0, r16 - ;; - (p7) mov r16 = r0 - (p7) br.cond.dpnt .L114 - ;; - cmp4.ge p6, p7 = r21, r16 - ;; - (p7) addl r16 = 255, r0 -.L114: - add r14 = r20, r17 - adds r17 = 3, r18 - ;; - zxt4 r14 = r14 - add r15 = r19, r17 - ;; - add r14 = r32, r14 - dep.z r15 = r15, 1, 32 - ;; - st1 [r14] = r16 - add r15 = r15, r33 - ;; - ld2 r14 = [r15] - ;; - sxt2 r14 = r14 - ;; - mov r15 = r14 - ;; - cmp4.le p6, p7 = r0, r15 - ;; - (p7) mov r15 = r0 - (p7) br.cond.dpnt .L118 - ;; - cmp4.ge p6, p7 = r21, r15 - ;; - (p7) addl r15 = 255, r0 -.L118: - add r14 = r20, r17 - adds r18 = 4, r18 - ;; - zxt4 r14 = r14 - cmp4.geu p6, p7 = 7, r18 - ;; - add r14 = r32, r14 - ;; - st1 [r14] = r15 - (p6) br.cond.dptk .L29 - adds r22 = 1, r22 - add r20 = r20, r34 - adds r19 = 8, r19 - ;; - cmp4.geu p6, p7 = 7, r22 - (p6) br.cond.dptk .L25 + +// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** + .save ar.lc, oldLC + mov oldLC = ar.lc + mov oldPR = pr + +// *** Allocating new stackframe, define rotating registers *** + alloc r9 = ar.pfs, 4, 92, 0, 96 + +// *** Saving Paramters *** + mov dst = r32 + mov src_1 = r33 + add src_2 = 8, r33 + mov stride = r34 + +// *** init loop: set loop counter, epilog counter, predicates *** + mov ar.lc = 7 + mov ar.ec = LL + PL + 1 + mov pr.rot = 1 << 16 + ;; + +// *** define register arrays and predicate array for software pipeline *** + // src_v1 = source value 1, dst_v = destination value + .rotr src_v1[LL+1], src_v2[LL+1], dst_v[PL+1] + .rotp ld_stage[LL], pack_stage[PL], st_stage[1] + + +/* Software pipelined loop: +* Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data +* Stage 2: Pack them together to one 8 byte 8 x 8 bit unsigned data +* Stage 3: Store the 8 byte to the destination address and 
add stride to +* destination address (to get the next 8 byte line of destination)*/ +.Loop_16to8copy: + {.mmi + (ld_stage[0]) ld8 src_v1[0] = [src_1], 16 + (ld_stage[0]) ld8 src_v2[0] = [src_2], 16 + (pack_stage[0]) pack2.uss dst_v[0] = src_v1[LL], src_v2[LL] + } + {.mib + (st_stage[0]) st8 [dst] = dst_v[PL] + (st_stage[0]) add dst = dst, stride + br.ctop.sptk.few .Loop_16to8copy + ;; + } + +// *** Restore old LC and PRs *** + mov ar.lc = oldLC + mov pr = oldPR, -1 + br.ret.sptk.many b0 .endp transfer_16to8copy_ia64# - .common transfer_8to16sub#,8,8 + + + +/***************************************************************************** +* +* transfer_16to8add_ia64 +* +* The 8-Bit-values of dst are "unpacked" into two 8-byte-blocks containing 16- +* bit-values. These are "parallel-added" to the values of src. The result is +* converted into 8-bit-values using "PACK" and stored at the adress of dst. +* We assume that there is no misalignment. +* +*****************************************************************************/ + + .align 16 + .global transfer_16to8add_ia64# + .proc transfer_16to8add_ia64# + +transfer_16to8add_ia64: + .prologue + +// *** register renaming *** + dst = r14 + src = r15 + stride = r16 + + _src = r17 + + + .body + +// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** + .save ar.lc, r2 + mov oldLC = ar.lc + mov oldPR = pr + +// *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR *** + alloc r9 = ar.pfs, 4, 92, 0, 96 + +// *** Saving Paramters *** + mov dst = r32 + mov src = r33 + mov stride = r34 + add _src = 8, r33 + +// *** init loop: set loop counter, epilog counter, predicates *** + mov ar.lc = 7 + mov ar.ec = LL + UL + PAL + PL + 1 + mov pr.rot = 1 << 16 + ;; + +// *** define register arrays and predicate array for software pipeline *** + .rotr _dst[LL+UL+PAL+PL+1], dst8[PL+1], pixel_1[PAL+1], pixel_2[PAL+1], w_dst16_1[UL+1], w_src_1[LL+UL+1], w_dst16_2[UL+1], w_src_2[LL+UL+1], w_dst8[LL+1] + .rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1] + + +/* Software pipelined loop: + * s1_p: The values of src and dst are loaded + * s2_p: The dst-values are converted to 16-bit-values + * s3_p: The values of src and dst are added + * s4_p: The Results are packed into 8-bit-values + * s5_p: The 8-bit-values are stored at the dst-adresses + */ + +.Loop_16to8add: + {.mii + (s1_p[0]) ld8 w_src_1[0] = [src], 16 // läd die 1. Hälfte der j. Zeile von src (i = 0..3) + (s1_p[0]) mov _dst[0] = dst // erhöht die Adresse von dst um stride + (s3_p[0]) padd2.sss pixel_1[0] = w_dst16_1[UL], w_src_1[LL+UL] // parallele Addition von scr und dst + } + {.mii + (s1_p[0]) ld8 w_dst8[0] = [dst], stride // läd die j. Zeile von dst + (s2_p[0]) unpack1.l w_dst16_1[0] = r0, w_dst8[LL]; // dst wird für i = 0..3 in 16-Bit umgewandelt + (s2_p[0]) unpack1.h w_dst16_2[0] = r0, w_dst8[LL]; // dst wird für i = 4..7 in 16-Bit umgewandelt + } + {.mii + (s1_p[0]) ld8 w_src_2[0] = [_src], 16 // läd die 2. Hälfte der j. Zeile von src (i = 4..7) + (s3_p[0]) padd2.sss pixel_2[0] = w_dst16_2[UL], w_src_2[LL+UL] // parallele Addition von scr und dst + (s4_p[0]) pack2.uss dst8[0] = pixel_1[PAL], pixel_2[PAL] // wandelt die Summen (pixel) in 8-Bit Werte um. 
Die Überprüfung der Wertebereiche erfolgt automatisch + } + {.mmb + (s5_p[0]) st8 [_dst[LL+UL+PAL+PL]] = dst8[PL] // speichert dst ab + (s1_p[0]) nop.m 0 + br.ctop.sptk.few .Loop_16to8add + ;; + } + +// *** Restore old LC and PRs *** + mov ar.lc = oldLC + mov pr = oldPR, -1 + + br.ret.sptk.many b0 + .endp transfer_16to8add_ia64# + + + +/***************************************************************************** +* +* transfer_8to16sub_ia64 +* +* The 8-bit-values of ref and cur are loaded. cur is converted to 16-bit. The +* Difference of cur and ref ist stored at the dct-adresses and cur is copied +* into the ref-array. +* +* You must assume, that the data adressed by 'ref' are misaligned in memory. +* But you can assume, that the other data are aligned (at least I hope so). +* +****************************************************************************/ + .align 16 .global transfer_8to16sub_ia64# .proc transfer_8to16sub_ia64# + + transfer_8to16sub_ia64: .prologue + +// *** register renaming *** + oldLC = r2 + oldPR = r3 + + zero = r0 // damit ist die Zahl "zero" = 0 gemeint + + //Die folgenden Register erhalten die gleichen Namen, wie die Variablen in der C-Vorlage + dct = r14 + cur = r15 + ref = r34 // muss nicht extra gesichert werden, deswegen bleibt das ÜbergabeRegister in dieser Liste + stride = r16 + + offset = r17 // Offset der falsch ausgerichteten Daten zum zurechtrücken + aoffset = r18 // Gegenstück zum Offset, + ref_a1 = r19 // Adresse des ersten 64-Bit Blocks von ref + ref_a2 = r20 // Adresse des zweiten 64-Bit Blocks von ref + + _dct = r21 // Register für die Zieladressen des 2. dct-Blocks + + .body - mov r25 = r0 - mov r24 = r0 - mov r23 = r0 -.L39: - mov r22 = r0 - ;; -.L43: - add r15 = r23, r22 - adds r20 = 1, r22 - add r16 = r24, r22 - ;; - zxt4 r15 = r15 - add r18 = r23, r20 - dep.z r16 = r16, 1, 32 - ;; - add r19 = r34, r15 - zxt4 r18 = r18 - add r16 = r16, r32 - add r15 = r33, r15 - ;; - ld1 r14 = [r19] - add r21 = r34, r18 - ld1 r17 = [r15] - adds r19 = 2, r22 - add r18 = r33, r18 - ;; - st1 [r15] = r14 - sub r17 = r17, r14 - add r20 = r24, r20 - ;; - st2 [r16] = r17 - dep.z r20 = r20, 1, 32 - ld1 r14 = [r21] - ld1 r15 = [r18] - add r16 = r23, r19 - ;; - st1 [r18] = r14 - sub r15 = r15, r14 - zxt4 r16 = r16 - add r20 = r20, r32 - ;; - add r18 = r34, r16 - adds r17 = 3, r22 - st2 [r20] = r15 - add r16 = r33, r16 - add r19 = r24, r19 - ;; - ld1 r14 = [r18] - add r15 = r23, r17 - dep.z r19 = r19, 1, 32 - ld1 r18 = [r16] - ;; - zxt4 r15 = r15 - add r19 = r19, r32 - st1 [r16] = r14 - sub r18 = r18, r14 - ;; - add r20 = r34, r15 - st2 [r19] = r18 - add r15 = r33, r15 - add r17 = r24, r17 - ;; - ld1 r14 = [r20] - ld1 r16 = [r15] - dep.z r17 = r17, 1, 32 - ;; - add r17 = r17, r32 - adds r22 = 4, r22 - st1 [r15] = r14 - sub r16 = r16, r14 - ;; - cmp4.geu p6, p7 = 7, r22 - st2 [r17] = r16 - (p6) br.cond.dptk .L43 - adds r25 = 1, r25 - adds r24 = 8, r24 - add r23 = r23, r35 - ;; - cmp4.geu p6, p7 = 7, r25 - (p6) br.cond.dptk .L39 + +// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** + .save ar.lc, r2 + mov oldLC = ar.lc + mov oldPR = pr + +// *** Allocating new stackframe, define rotating registers *** + alloc r9 = ar.pfs, 4, 92, 0, 96 + +// *** Saving Paramters *** + mov dct = r32 + mov cur = r33 + // mov ref = r34: ref is unaligned, get aligned ref below... 
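+
+// A C sketch of the misaligned-load scheme computed below and used in the
+// loop (the helper name and the aligned-case check are illustrative
+// additions, not part of the original C source; it assumes the little-endian
+// 64-bit loads of the IA-64 code and <stdint.h> types):
+//
+//   uint64_t load_misaligned_8(const uint8_t *p)
+//   {
+//       const uint64_t *a  = (const uint64_t *)((uintptr_t)p & ~(uintptr_t)7);
+//       unsigned       off = ((uintptr_t)p & 7) * 8;    // bit offset, 0..56
+//       if (off == 0)                                   // already aligned;
+//           return a[0];                                // a shift by 64 would be UB in C
+//       return (a[0] >> off) | (a[1] << (64 - off));    // join both halves
+//   }
+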
+ mov stride = r35 + + and ref_a1 = -8, ref // Die Adresse des ersten 64-Bit Blocks, in dem ref liegt, wird berechnet (entspricht mod 8) + dep offset = ref, zero, 3, 3 + ;; + add ref_a2 = 8, ref_a1 + sub aoffset = 64, offset // Gegenstück zum Offset wird berechnet + add _dct = 8, dct // Die Adresse für den 2. dct-Block wird berechnet, um 8 Byte (= 64 Bit) höher als beim 1. Block + +// *** init loop: set loop counter, epilog counter, predicates *** + mov ar.lc = 7 + mov ar.ec = LL + SHL + OL + UL + PSL + 1 + mov pr.rot = 1 << 16 + ;; + +// *** define register arrays and predicate array for software pipeline *** + .rotr c[LL+1], ref_v1[LL+1], ref_v2[LL+1], c16_1[SHL+OL+UL+1], c16_2[SHL+OL+UL+1], ref_shdr[SHL+1], ref_shdl[SHL+1], r[OL+1], r16_1[UL+1], r16_2[UL+1], dct_1[PSL+1], dct_2[PSL+1], _cur[LL+SHL+OL+UL+1] + .rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1] + + +/* Software pipelined loop: + * s1_p: The values of ref and cur ale loaded, a copy of cur is made. + * s2_p: cur is converted to 16-bit and thehe misaligned values of ref are + * shifted... + * s3_p: ... and copied together. + * s4_p: This ref-value is converted to 16-bit. The values of cur are stored + * at the ref-adresses. + * s5_p: the ref- abd cur-values are substracted... + * s6_p: ...and the result is stored at the dct-adresses. + */ + +loop_8to16sub: + {.mii + (s1_p[0]) ld8 ref_v1[0] = [ref_a1], stride // läd den 1. 64-Bit-Block, der einen Teil der ref-Daten enthält + (s1_p[0]) mov _cur[0] = cur // cur wird für spätere Verwendung gesichert + (s2_p[0]) shr.u ref_shdr[0] = ref_v1[LL], offset // Die rechte Hälfte wird zurechtgerückt + } + {.mii + (s1_p[0]) ld8 ref_v2[0] = [ref_a2], stride // läd den 2. 64-Bit-Block + (s2_p[0]) shl ref_shdl[0] = ref_v2[LL], aoffset // Die linke Hälfte wird zurechtgerückt + (s3_p[0]) or r[0] = ref_shdr[SHL], ref_shdl[SHL] // Die zurechtgerückten Daten werden in r zusammenkopiert + } + {.mii + (s1_p[0]) ld8 c[0] = [cur], stride //läd die j. Zeile von cur komplett + (s2_p[0]) unpack1.l c16_1[0] = zero, c[LL]; // c wird für i = 0..3 in 16-Bit umgewandelt + (s2_p[0]) unpack1.h c16_2[0] = zero, c[LL]; // c wird für i = 4..7 in 16-Bit umgewandelt + } + {.mii + (s4_p[0]) st8 [_cur[LL+SHL+OL]] = r[OL] // cur wird auf den Wert von r gesetzt + //Umwandeln der 8-Bit r und c -Werte in 16-bit Werte + (s4_p[0]) unpack1.l r16_1[0] = zero, r[OL]; // r wird für i = 0..3 in 16-Bit umgewandelt + (s4_p[0]) unpack1.h r16_2[0] = zero, r[OL]; // r wird für i = 4..7 in 16-Bit umgewandelt + } + {.mii + (s5_p[0]) psub2.sss dct_1[0] = c16_1[SHL+OL+UL], r16_1[UL] // Subtraktion der 1. Häfte der j. Zeile + (s5_p[0]) psub2.sss dct_2[0] = c16_2[SHL+OL+UL], r16_2[UL] // Subtraktion der 2. Hälfte + } + {.mmb + (s6_p[0]) st8 [dct] = dct_1[PSL], 16 // speichert den 1. 64-Bit-Block an der vorgesehenen Adresse, erhöhen der Adresse um 16 Byte für den nächsten Wert + (s6_p[0]) st8 [_dct] = dct_2[PSL], 16 // speichert den 2. 64-Bit-Block an der vorgesehenen Adresse, erhöhen der Adresse um 16 Byte für den nächsten Wert + br.ctop.sptk.few loop_8to16sub // Und hopp + ;; + } + +// *** Restore old LC and PRs *** + mov ar.lc = oldLC + mov pr = oldPR, -1 + br.ret.sptk.many b0 .endp transfer_8to16sub_ia64# - .common transfer_8to16sub2#,8,8 + + + + + +/***************************************************************************** +* +* transfer_8to16sub2_ia64 +* +* At the time, this function was written, it was not yet in use. +* We assume that the values of ref1/2 are misaligned. 
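+*
+* A C sketch of the intended result (illustrative only; pavg1.raz rounds the
+* average away from zero, i.e. adds 1 before the shift; parameter names
+* follow the C reference, the code itself is not copied from it, and
+* <stdint.h> types are assumed):
+*
+*   void transfer_8to16sub2(int16_t *dct, uint8_t *cur, const uint8_t *ref1,
+*                           const uint8_t *ref2, uint32_t stride)
+*   {
+*       int i, j;
+*       for (j = 0; j < 8; j++)
+*           for (i = 0; i < 8; i++) {
+*               int r = (ref1[j * stride + i] + ref2[j * stride + i] + 1) >> 1;
+*               dct[j * 8 + i] = (int16_t)(cur[j * stride + i] - r);
+*           }
+*   }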
+* +* The values of ref1/2 and cur are loaded, the ref-values need misalignment- +* treatment. The values are converted to 16-bit using unpack. The average of +* ref1 and ref2 is computed with pavg and substacted from cur. The results are +* stored at the dct-adresses. +* pavg1.raz is used to get the same results as the C-code-function. +* +*****************************************************************************/ + + .text .align 16 .global transfer_8to16sub2_ia64# .proc transfer_8to16sub2_ia64# + transfer_8to16sub2_ia64: .prologue + +// *** register renaming *** + // We've tried to keep the C-Code names as often as possible, at least as + // part of register-names + oldLC = r2 + oldPR = r3 + + zero = r0 + + dct_al = r14 // dct: adress of left block in one line + dct_ar = r15 // dct: adress of right block in one line + cur = r16 + ref1_al = r17 // ref1: aligned adress of lower part + ref1_ah = r18 // ref1: aligned adress of higher part + ref2_al = r19 // ref2: aligned adress of lower part + ref2_ah = r20 // ref2: aligned adress of higher part + stride = r21 + + offset_1 = r22 + offset_2 = r23 + aoffset_1 = r24 + aoffset_2 = r25 + + + .body + +// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** .save ar.lc, r2 - mov r2 = ar.lc - .body - mov r28 = r0 - addl r27 = 255, r0 - mov r26 = r0 - mov r25 = r0 -.L50: - addl r14 = 3, r0 - mov r21 = r0 - ;; - mov ar.lc = r14 - ;; -.L138: - add r14 = r26, r21 - add r17 = r25, r21 - adds r19 = 1, r21 - ;; - zxt4 r17 = r17 - dep.z r14 = r14, 1, 32 - add r18 = r25, r19 - ;; - add r15 = r34, r17 - add r23 = r14, r32 - add r20 = r35, r17 - ;; - ld1 r14 = [r15] - ld1 r16 = [r20] - add r17 = r33, r17 - ;; - add r14 = r14, r16 - ld1 r15 = [r17] - zxt4 r18 = r18 - ;; - adds r14 = 1, r14 - add r24 = r35, r18 - add r22 = r34, r18 - ;; - shr.u r14 = r14, 1 - add r19 = r26, r19 - add r16 = r33, r18 - ;; - cmp4.ge p6, p7 = r27, r14 - dep.z r19 = r19, 1, 32 - adds r21 = 2, r21 - ;; - (p7) addl r14 = 255, r0 - add r19 = r19, r32 - ;; - sub r14 = r15, r14 - ;; - st2 [r23] = r14 - ld1 r14 = [r24] - ld1 r15 = [r22] - ld1 r16 = [r16] - ;; - add r15 = r15, r14 - ;; - adds r15 = 1, r15 - ;; - shr.u r14 = r15, 1 - ;; - cmp4.ge p6, p7 = r27, r14 - ;; - (p7) addl r14 = 255, r0 - ;; - sub r14 = r16, r14 - ;; - st2 [r19] = r14 - br.cloop.sptk.few .L138 - adds r28 = 1, r28 - adds r26 = 8, r26 - add r25 = r25, r36 - ;; - cmp4.geu p6, p7 = 7, r28 - (p6) br.cond.dptk .L50 - mov ar.lc = r2 + mov oldLC = ar.lc + mov oldPR = pr + +// *** Saving Paramters *** +// *** (as inputregisters r32 + are needed for register-rotation) *** + mov dct_ar = r32 + add dct_al = 8, r32 + mov cur = r33 + + and ref1_al = -8, r34 + and ref2_al = -8, r35 // ref2 aligned adrress of lower part + + mov stride = r36 + +// *** Calculations for Misaligment-Handling *** + dep offset_1 = r34, zero, 3, 3 + dep offset_2 = r35, zero, 3, 3 + ;; + add ref1_ah = 8, ref1_al + add ref2_ah = 8, ref2_al + sub aoffset_1 = 64, offset_1 + sub aoffset_2 = 64, offset_2 + ;; + +// *** Allocating new stackframe, define rotating registers *** + alloc r9 = ar.pfs, 5, 91, 0, 96 + +// *** init loop: set loop counter, epilog counter, predicates *** + mov ar.lc = 7 + mov ar.ec = LL + SHL + OL + PAVGL + UL +PSL + 1 + mov pr.rot = 1 << 16 + ;; + +// *** define register arrays and predicate array for software pipeline *** + .rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], 
r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1] + .rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1] + +/* software pipelined loop: + * ld_stage: The values of ref1, ref2, cur are loaded + * sh_stage: The misaligned values of ref1/2 are shifted... + * or_stage: ...and copied together. + * pavg_stage: The average of ref1 and ref2 is computed. + * up_stage: The result and the cur-values are converted to 16-bit. + * psub_stage: Those values are substracted... + * st_stage: ...and stored at the dct-adresses. + */ + +.Loop_8to16sub2: + {.mii + (ld_stage[0]) ld8 c[0] = [cur], stride + (sh_stage[0]) shr.u ref1_l[0] = ref1_vl[LL], offset_1 + (sh_stage[0]) shl ref1_h[0] = ref1_vh[LL], aoffset_1 + } + {.mii + (ld_stage[0]) ld8 ref1_vl[0] = [ref1_al], stride + (sh_stage[0]) shr.u ref2_l[0] = ref2_vl[LL], offset_2 + (sh_stage[0]) shl ref2_h[0] = ref2_vh[LL], aoffset_2 + } + {.mii + (ld_stage[0]) ld8 ref1_vh[0] = [ref1_ah], stride + (or_stage[0]) or ref1_aligned[0] = ref1_h[SHL], ref1_l[SHL] + (or_stage[0]) or ref2_aligned[0] = ref2_h[SHL], ref2_l[SHL] + } + {.mii + (ld_stage[0]) ld8 ref2_vl[0] = [ref2_al], stride + (pavg_stage[0]) pavg1.raz r[0] = ref1_aligned[OL], ref2_aligned[OL] + (up_stage[0]) unpack1.l r16_r[0] = zero, r[PAVGL] + } + {.mii + (ld_stage[0]) ld8 ref2_vh[0] = [ref2_ah], stride + (up_stage[0]) unpack1.h r16_l[0] = zero, r[PAVGL] + (up_stage[0]) unpack1.l c16_r[0] = zero, c[LL+SHL+OL+PAVGL] + } + {.mii + (st_stage[0]) st8 [dct_ar] = dct16_r[PSL], 16 + (up_stage[0]) unpack1.h c16_l[0] = zero, c[LL+SHL+OL+PAVGL] + (psub_stage[0]) psub2.sss dct16_l[0] = c16_l[UL], r16_l[UL] + } + {.mib + (st_stage[0]) st8 [dct_al] = dct16_l[PSL], 16 + (psub_stage[0]) psub2.sss dct16_r[0] = c16_r[UL], r16_r[UL] + br.ctop.sptk.few .Loop_8to16sub2 // Und hopp + ;; + } + +// *** Restore old LC and PRs *** + mov ar.lc = oldLC + mov pr = oldPR, -1 + br.ret.sptk.many b0 .endp transfer_8to16sub2_ia64# - .common transfer_16to8add#,8,8 - .align 16 - .global transfer_16to8add_ia64# - .proc transfer_16to8add_ia64# -transfer_16to8add_ia64: - .prologue - .save ar.lc, r2 - mov r2 = ar.lc - .body - mov r26 = r0 - addl r25 = 255, r0 - mov r24 = r0 - mov r21 = r0 -.L62: - addl r14 = 3, r0 - mov r20 = r0 - ;; - mov ar.lc = r14 - ;; -.L149: - adds r17 = 1, r20 - add r14 = r21, r20 - add r15 = r24, r20 - ;; - zxt4 r14 = r14 - add r18 = r21, r17 - dep.z r15 = r15, 1, 32 - ;; - add r23 = r32, r14 - zxt4 r18 = r18 - add r15 = r15, r33 - ;; - mov r16 = r23 - add r22 = r32, r18 - ld2 r14 = [r15] - ;; - ld1 r18 = [r16] - add r19 = r24, r17 - adds r20 = 2, r20 - ;; - add r14 = r14, r18 - dep.z r19 = r19, 1, 32 - mov r16 = r22 - ;; - sxt2 r14 = r14 - add r19 = r19, r33 - ;; - cmp4.le p6, p7 = r0, r14 - cmp4.ge p8, p9 = r25, r14 - ;; - (p7) mov r14 = r0 - (p7) br.cond.dpnt .L143 - ;; - (p9) addl r14 = 255, r0 - ;; -.L143: - st1 [r23] = r14 - ld1 r14 = [r22] - ld2 r15 = [r19] - ;; - add r15 = r15, r14 - ;; - sxt2 r15 = r15 - ;; - cmp4.le p6, p7 = r0, r15 - cmp4.ge p8, p9 = r25, r15 - ;; - (p7) mov r15 = r0 - (p7) br.cond.dpnt .L147 - ;; - (p9) addl r15 = 255, r0 - ;; -.L147: - st1 [r16] = r15 - br.cloop.sptk.few .L149 - adds r26 = 1, r26 - adds r24 = 8, r24 - add r21 = r21, r34 - ;; - cmp4.geu p6, p7 = 7, r26 - (p6) br.cond.dptk .L62 - mov ar.lc = r2 - br.ret.sptk.many b0 - .endp transfer_16to8add_ia64# - .common transfer8x8_copy#,8,8 - .align 16 - .global transfer8x8_copy_ia64# - .proc transfer8x8_copy_ia64# -transfer8x8_copy_ia64: - 
.prologue - .save ar.lc, r2 - mov r2 = ar.lc - .body - addl r14 = 7, r0 - mov r21 = r0 - ;; - mov ar.lc = r14 - ;; -.L168: - zxt4 r14 = r21 - adds r15 = 1, r21 - adds r18 = 2, r21 - ;; - add r16 = r33, r14 - zxt4 r15 = r15 - zxt4 r18 = r18 - ;; - ld1 r17 = [r16] - add r14 = r32, r14 - add r19 = r33, r15 - ;; - st1 [r14] = r17 - add r15 = r32, r15 - add r20 = r33, r18 - ld1 r16 = [r19] - adds r14 = 3, r21 - add r18 = r32, r18 - ;; - st1 [r15] = r16 - zxt4 r14 = r14 - adds r17 = 4, r21 - ld1 r15 = [r20] - ;; - add r19 = r33, r14 - zxt4 r17 = r17 - st1 [r18] = r15 - add r14 = r32, r14 - ;; - add r20 = r33, r17 - ld1 r15 = [r19] - adds r16 = 5, r21 - add r17 = r32, r17 - ;; - st1 [r14] = r15 - zxt4 r16 = r16 - adds r18 = 6, r21 - ld1 r14 = [r20] - ;; - add r19 = r33, r16 - zxt4 r18 = r18 - st1 [r17] = r14 - add r16 = r32, r16 - ;; - add r20 = r33, r18 - ld1 r14 = [r19] - adds r15 = 7, r21 - add r18 = r32, r18 - ;; - st1 [r16] = r14 - zxt4 r15 = r15 - add r21 = r21, r34 - ld1 r16 = [r20] - ;; - add r17 = r33, r15 - st1 [r18] = r16 - add r15 = r32, r15 - ;; - ld1 r14 = [r17] - ;; - st1 [r15] = r14 - br.cloop.sptk.few .L168 - ;; - mov ar.lc = r2 - br.ret.sptk.many b0 - .endp transfer8x8_copy_ia64# - .ident "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.1 2.96-85)"