Diff of /trunk/xvidcore/src/utils/ia64_asm/mem_transfer_ia64.s

-revision 304, Tue Jul 16 17:50:44 2002 UTC
+revision 305, Tue Jul 16 17:55:18 2002 UTC
 Line 1
- /****************************************************************************
+ ///////////////////////////////////////////////////////////////////////////////
- *
+ //
- * mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel,
+ // mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel,
- * University of Karlsruhe, Germany, 03.06.2002, during the laboratory
+ // University of Karlsruhe, Germany, 03.06.2002, during the laboratory
- * "IA-64 Video Codec Assember Parktikum" at IPD Goos.
+ // "IA-64 Video Codec Assembler Praktikum" at IPD Goos.
- *
+ //
- * Annotations:
+ //
- * ===========
+ ///// legal header taken from original C-file ///////////////////////////////////////
- *
+ //
- * - All functions work on 8x8-matrices. While the C-code-functions treat each
+ // XVID MPEG-4 VIDEO CODEC
- *   element seperatly, the functions in this assembler-code treat a whole line
+ // - 8bit<->16bit transfer  -
- *   simultaneously. So one loop is saved.
+ //
- *   The remaining loop is relized by using softwarepipelining with rotating
+ // This program is an implementation of a part of one or more MPEG-4
- *   rregisters.
+ // Video tools as specified in ISO/IEC 14496-2 standard.  Those intending
- * - Register renaming is used for better readability
+ // to use this software module in hardware or software products are
- * - To load 8 bytes of missaligned data, two 8-byte-blocks are loaded, both
+ // advised that its use may infringe existing patents or copyrights, and
- *   parts are shifted and joined together with an "OR"-Instruction.
+ // any such use would be at such party's own risk.  The original
- * - First parameter is stored in GR 32, next in GR 33, and so on. They must be
+ // developer of this software module and his/her company, and subsequent
- *   saved, as these GRs are used for register-rotation.
+ // editors and their companies, will have no liability for use of this
- * - Some of the orininal, German comments used during development are left in
+ // software or modifications or derivatives thereof.
- *   in the code. They shouldn't bother anyone.
+ //
- *
+ // This program is free software ; you can redistribute it and/or modify
- * Anmerkungen:
+ // it under the terms of the GNU General Public License as published by
- * ============
+ // the Free Software Foundation ; either version 2 of the License, or
- *
+ // (at your option) any later version.
- * - Alle Funtionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code
+ //
- *   jedes Element einzeln bearbeiten, bearbeiten die Funtionen dieses Assembler-
+ // This program is distributed in the hope that it will be useful,
- *   Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden.
+ // but WITHOUT ANY WARRANTY ; without even the implied warranty of
- *   Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit
+ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   rotierenden Registern realisiert.
+ // GNU General Public License for more details.
- * - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet.
+ //
- * - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke
+ // You should have received a copy of the GNU General Public License
- *   geladen, beide Teile mit "shift"-Operationen zurechterückt und mit einem
+ // along with this program ; if not, write to the Free Software
- *   logischen Oder zusammenkopiert.
+ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- * - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge-
+ //
- *   sichert werden, da die Register für die register-Rotation benötigt werden.
+ ///// History /////////////////////////////////////////////////////////////////
- * - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase
+ //
- *   sind im Code verblieben. Sie sollten niemanden stören.
+ // - 16.07.2002: several minor changes for ecc-conformity
- *
+ // - 03.06.2002: initial version
- ****************************************************************************/
+ //
+ ///////////////////////////////////////////////////////////////////////////////
+ //
+ // Annotations:
+ // ===========
+ //
+ // - All functions work on 8x8-matrices. While the C-code-functions treat each
+ //   element separately, the functions in this assembler-code treat a whole line
+ //   simultaneously. So one loop is saved.
+ //   The remaining loop is realized by using software pipelining with rotating
+ //   registers.
+ // - Register renaming is used for better readability
+ // - To load 8 bytes of misaligned data, two 8-byte-blocks are loaded, both
+ //   parts are shifted and joined together with an "OR"-Instruction.
+ // - First parameter is stored in GR 32, next in GR 33, and so on. They must be
+ //   saved, as these GRs are used for register-rotation.
+ // - Some of the original, German comments used during development are left in
+ //   the code. They shouldn't bother anyone.
+ //
+ // Anmerkungen:
+ // ============
+ //
+ // - Alle Funktionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code
+ //   jedes Element einzeln bearbeiten, bearbeiten die Funktionen dieses Assembler-
+ //   Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden.
+ //   Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit
+ //   rotierenden Registern realisiert.
+ // - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet.
+ // - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke
+ //   geladen, beide Teile mit "shift"-Operationen zurechtgerückt und mit einem
+ //   logischen Oder zusammenkopiert.
+ // - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge-
+ //   sichert werden, da die Register für die register-Rotation benötigt werden.
+ // - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase
+ //   sind im Code verblieben. Sie sollten niemanden stören.
+ //
+ ///////////////////////////////////////////////////////////////////////////////
  //      ***     define Latencies for software pipelines ***
-Line 55
+Line 91
          .text
- /****************************************************************************
+ ///////////////////////////////////////////////////////////////////////////////
- *
+ //
- * transfer8x8_copy_ia64
+ // transfer8x8_copy_ia64
- *
+ //
- * SRC is missaligned, to align the source load two 8-bytes-words, shift it,
+ // SRC is misaligned, to align the source load two 8-byte-words, shift them,
- * join them and store the aligned source into the destination address.
+ // join them and store the aligned source into the destination address.
- *
+ //
- ****************************************************************************/
+ ///////////////////////////////////////////////////////////////////////////////
          .align 16
          .global transfer8x8_copy_ia64#
-Line 85
+Line 121
          offset = r18 // shift right offset
          aoffset = r19 // shift left offset
-         .body
  //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
          .save ar.lc, oldLC
          mov oldLC = ar.lc
          mov oldPR = pr
+         .body
  //      *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR ***
          alloc r9 = ar.pfs, 3, 29, 0, 32
-Line 118
+Line 153
          .rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1]
          .rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1]
- /* Software pipelined loop:
- * Stage 1: Load two 2 bytes from SRC_1, SRC_2 into SRC_v1 and SRC_v2
+ //      Software pipelined loop:
- * Stage 2: Shift both values of source to SHD_R and SHD_L
+ //      Stage 1: Load two 8-byte words from SRC_1, SRC_2 into SRC_v1 and SRC_v2
- * Stage 3: Join both parts together with OR
+ //      Stage 2: Shift both values of source to SHD_R and SHD_L
- * Stage 4: Store aligned date to destination and add stride to destination address */
+ //      Stage 3: Join both parts together with OR
+ //      Stage 4: Store aligned data to destination and add stride to destination address
  .Loop_8x8copy:
          {.mii
                  (ld_stage[0]) ld8 src_v1[0] = [src_1], stride
-Line 151
+Line 189
- /*****************************************************************************
+ ///////////////////////////////////////////////////////////////////////////////
- *
+ //
- * transfer_8to16copy_ia64
+ // transfer_8to16copy_ia64
- *
+ //
- * SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values,
+ // SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values,
- * UNPACK is used. So 8 bytes are loaded from source, unpacked to two
+ // UNPACK is used. So 8 bytes are loaded from source, unpacked to two
- * 4 x 16 bit values and stored to the destination. Destination is a continuous
+ // 4 x 16 bit values and stored to the destination. Destination is a continuous
- * array of 64 x 16 bit signed data. To store the next line, only 16 must be
+ // array of 64 x 16 bit signed data. To store the next line, only 16 must be
- * added to the destination address.
+ // added to the destination address.
- *****************************************************************************/
+ ///////////////////////////////////////////////////////////////////////////////
          .align 16
          .global transfer_8to16copy_ia64#
-Line 181
+Line 219
          src = r16
          stride = r17
-         .body
  //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
          .save ar.lc, oldLC
          mov oldLC = ar.lc
          mov oldPR = pr
+         .body
  //      *** Allocating new stackframe, define rotating registers ***
          alloc r9 = ar.pfs, 4, 92, 0, 96
-Line 209
+Line 247
          .rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1]
          .rotp ld_stage[LL], upack_stage[UL], st_stage[1]
- /* Software pipelined loop:
- * Stage 1: Load value of SRC
+ //      Software pipelined loop:
- * Stage 2: Unpack the SRC_V to two 4 x 16 bit signed data
+ //      Stage 1: Load value of SRC
- * Stage 3: Store both 8 byte of 16 bit data */
+ //      Stage 2: Unpack the SRC_V to two 4 x 16 bit signed data
+ //      Stage 3: Store both 8 byte of 16 bit data
  .Loop_8to16copy:
          {.mii
                  (ld_stage[0]) ld8 src_v[0] = [src], stride
-Line 236
+Line 277
- /*****************************************************************************
+ ///////////////////////////////////////////////////////////////////////////////
- *
+ //
- * transfer_16to8copy_ia64
+ // transfer_16to8copy_ia64
- *
+ //
- * src is a 64 x 16 bit signed continuous array. To convert the 16 bit
+ // src is a 64 x 16 bit signed continuous array. To convert the 16 bit
- * values to 8 bit unsigned data, PACK is used. So two 8-bytes-words of
+ // values to 8 bit unsigned data, PACK is used. So two 8-bytes-words of
- * 4 x 16 bit signed data are loaded, packed together and stored a 8-byte-word
+ // 4 x 16 bit signed data are loaded, packed together and stored a 8-byte-word
- * of 8 x 8 unsigned data to the destination.
+ // of 8 x 8 unsigned data to the destination.
- ****************************************************************************/
+ ///////////////////////////////////////////////////////////////////////////////
          .align 16
          .global transfer_16to8copy_ia64#
-Line 258
+Line 299
          src_2 = r17
          stride = r16
-         .body
  //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
          .save ar.lc, oldLC
          mov oldLC = ar.lc
          mov oldPR = pr
+         .body
  //      *** Allocating new stackframe, define rotating registers ***
          alloc r9 = ar.pfs, 4, 92, 0, 96
-Line 287
+Line 328
          .rotp ld_stage[LL], pack_stage[PL], st_stage[1]
- /* Software pipelined loop:
+ //      Software pipelined loop:
- * Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data
+ //      Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data
- * Stage 2: Pack them together to one 8 byte 8 x 8 bit unsigned data
+ //      Stage 2: Pack them together to one 8 byte 8 x 8 bit unsigned data
- * Stage 3: Store the 8 byte to the destination address and add stride to
+ //      Stage 3: Store the 8 byte to the destination address and add stride to
- *          destination address (to get the next 8 byte line of destination)*/
+ //               destination address (to get the next 8 byte line of destination)
  .Loop_16to8copy:
          {.mmi
                  (ld_stage[0]) ld8 src_v1[0] = [src_1], 16
-Line 314
+Line 357
- /*****************************************************************************
+ ///////////////////////////////////////////////////////////////////////////////
- *
+ //
- * transfer_16to8add_ia64
+ // transfer_16to8add_ia64
- *
+ //
- * The 8-Bit-values of dst are "unpacked" into two 8-byte-blocks containing 16-
+ // The 8-Bit-values of dst are "unpacked" into two 8-byte-blocks containing 16-
- * bit-values. These are "parallel-added" to the values of src. The result is
+ // bit-values. These are "parallel-added" to the values of src. The result is
- * converted into 8-bit-values using "PACK" and stored at the adress of dst.
+ // converted into 8-bit-values using "PACK" and stored at the address of dst.
- * We assume that there is no misalignment.
+ // We assume that there is no misalignment.
- *
+ //
- *****************************************************************************/
+ ///////////////////////////////////////////////////////////////////////////////
          .align 16
          .global transfer_16to8add_ia64#
-Line 339
+Line 382
          _src = r17
-         .body
  //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
          .save ar.lc, r2
          mov oldLC = ar.lc
          mov oldPR = pr
+         .body
  //      *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR ***
          alloc r9 = ar.pfs, 4, 92, 0, 96
-Line 367
+Line 410
          .rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1]
- /*      Software pipelined loop:
+ //      Software pipelined loop:
-  *      s1_p: The values of src and dst are loaded
+ //      s1_p: The values of src and dst are loaded
-  *      s2_p: The dst-values are converted to 16-bit-values
+ //      s2_p: The dst-values are converted to 16-bit-values
-  *      s3_p: The values of src and dst are added
+ //      s3_p: The values of src and dst are added
-  *      s4_p: The Results are packed into 8-bit-values
+ //      s4_p: The Results are packed into 8-bit-values
-  *      s5_p: The 8-bit-values are stored at the dst-adresses
+ //      s5_p: The 8-bit-values are stored at the dst-addresses
-  */
  .Loop_16to8add:
          {.mii
-Line 407
+Line 450
- /*****************************************************************************
+ ///////////////////////////////////////////////////////////////////////////////
- *
+ //
- * transfer_8to16sub_ia64
+ // transfer_8to16sub_ia64
- *
+ //
- * The 8-bit-values of ref and cur are loaded. cur is converted to 16-bit. The
+ // The 8-bit-values of ref and cur are loaded. cur is converted to 16-bit. The
- * Difference of cur and ref ist stored at the dct-adresses and cur is copied
+ // difference of cur and ref is stored at the dct-addresses and cur is copied
- * into the ref-array.
+ // into the ref-array.
- *
+ //
- * You must assume, that the data adressed by 'ref' are misaligned in memory.
+ // You must assume that the data addressed by 'ref' are misaligned in memory.
- * But you can assume, that the other data are aligned (at least I hope so).
+ // But you can assume, that the other data are aligned (at least I hope so).
- *
+ //
- ****************************************************************************/
+ ///////////////////////////////////////////////////////////////////////////////
          .align 16
          .global transfer_8to16sub_ia64#
-Line 447
+Line 490
          _dct = r21 // Register für die Zieladressen des 2. dct-Blocks
-         .body
  //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
          .save ar.lc, r2
          mov oldLC = ar.lc
          mov oldPR = pr
+         .body
  //      *** Allocating new stackframe, define rotating registers ***
          alloc r9 = ar.pfs, 4, 92, 0, 96
-Line 482
+Line 525
          .rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1]
- /*      Software pipelined loop:
+ //      Software pipelined loop:
-  *      s1_p: The values of ref and cur ale loaded, a copy of cur is made.
+ //      s1_p: The values of ref and cur are loaded, a copy of cur is made.
-  *      s2_p: cur is converted to 16-bit and thehe misaligned values of ref are
+ //      s2_p: cur is converted to 16-bit and the misaligned values of ref are
-  *            shifted...
+ //            shifted...
-  *      s3_p: ... and copied together.
+ //      s3_p: ... and copied together.
-  *      s4_p: This ref-value is converted to 16-bit. The values of cur are stored
+ //      s4_p: This ref-value is converted to 16-bit. The values of cur are stored
-  *            at the ref-adresses.
+ //            at the ref-adresses.
-  *      s5_p: the ref- abd cur-values are substracted...
+ //      s5_p: the ref- and cur-values are subtracted...
-  *      s6_p: ...and the result is stored at the dct-adresses.
+ //      s6_p: ...and the result is stored at the dct-adresses.
-  */
  loop_8to16sub:
          {.mii
-Line 537
+Line 580
- /*****************************************************************************
+ ///////////////////////////////////////////////////////////////////////////////
- *
+ //
- * transfer_8to16sub2_ia64
+ // transfer_8to16sub2_ia64
- *
+ //
- * At the time, this function was written, it was not yet in use.
+ // At the time, this function was written, it was not yet in use.
- * We assume that the values of ref1/2 are misaligned.
+ // We assume that the values of ref1/2 are misaligned.
- *
+ //
- * The values of ref1/2 and cur are loaded, the ref-values need misalignment-
+ // The values of ref1/2 and cur are loaded, the ref-values need misalignment-
- * treatment. The values are converted to 16-bit using unpack. The average of
+ // treatment. The values are converted to 16-bit using unpack. The average of
- * ref1 and ref2 is computed with pavg and substacted from cur. The results are
+ // ref1 and ref2 is computed with pavg and subtracted from cur. The results are
- * stored at the dct-adresses.
+ // stored at the dct-addresses.
- * pavg1.raz is used to get the same results as the C-code-function.
+ // pavg1.raz is used to get the same results as the C-code-function.
- *
+ //
- *****************************************************************************/
+ ///////////////////////////////////////////////////////////////////////////////
          .text
          .align 16
-Line 582
+Line 625
          aoffset_1 = r24
          aoffset_2 = r25
-         .body
  //      *** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
          .save ar.lc, r2
          mov oldLC = ar.lc
          mov oldPR = pr
+         .body
  //      *** Saving Parameters ***
  //      *** (as inputregisters r32 + are needed for register-rotation) ***
          mov dct_ar = r32
-Line 624
+Line 667
          .rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1]
          .rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1]
- /*      software pipelined loop:
-  *      ld_stage:   The values of ref1, ref2, cur are loaded
+ //      software pipelined loop:
-  *      sh_stage:   The misaligned values of ref1/2 are shifted...
+ //      ld_stage:   The values of ref1, ref2, cur are loaded
-  *      or_stage:   ...and copied together.
+ //      sh_stage:   The misaligned values of ref1/2 are shifted...
-  *      pavg_stage: The average of ref1 and ref2 is computed.
+ //      or_stage:   ...and copied together.
-  *      up_stage:   The result and the cur-values are converted to 16-bit.
+ //      pavg_stage: The average of ref1 and ref2 is computed.
-  *      psub_stage: Those values are substracted...
+ //      up_stage:   The result and the cur-values are converted to 16-bit.
-  *      st_stage:   ...and stored at the dct-adresses.
+ //      psub_stage: Those values are subtracted...
-  */
+ //      st_stage:   ...and stored at the dct-addresses.
  .Loop_8to16sub2:
          {.mii

 Legend:



Removed from v.304
 


changed lines


 
Added in v.305
 Legend:



Removed from v.304
 


changed lines


 
Added in v.305
-Removed from v.304
+Added in v.305

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4