1 |
/**************************************************************************** |
// **************************************************************************** |
2 |
* |
// * |
3 |
* mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel, |
// * XVID MPEG-4 VIDEO CODEC |
4 |
* University of Karlsruhe, Germany, 03.06.2002, during the laboratory |
// * - IA64 8bit<->16bit transfer - |
5 |
* "IA-64 Video Codec Assember Parktikum" at IPD Goos. |
// * |
6 |
* |
// * Copyright(C) 2002 Sebastian Felis, Max Stengel |
7 |
* Annotations: |
// * |
8 |
* =========== |
// * This program is free software; you can redistribute it and/or modify it |
9 |
* |
// * under the terms of the GNU General Public License as published by |
10 |
* - All functions work on 8x8-matrices. While the C-code-functions treat each |
// * the Free Software Foundation; either version 2 of the License, or |
11 |
* element separately, the functions in this assembler-code treat a whole line |
// * (at your option) any later version. |
12 |
* simultaneously. So one loop is saved. |
// * |
13 |
* The remaining loop is realized by using software pipelining with rotating |
// * This program is distributed in the hope that it will be useful, |
14 |
* registers. |
// * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
* - Register renaming is used for better readability |
// * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 |
* - To load 8 bytes of misaligned data, two 8-byte-blocks are loaded, both |
// * GNU General Public License for more details. |
17 |
* parts are shifted and joined together with an "OR"-Instruction. |
// * |
18 |
* - First parameter is stored in GR 32, next in GR 33, and so on. They must be |
// * You should have received a copy of the GNU General Public License |
19 |
* saved, as these GRs are used for register-rotation. |
// * along with this program; if not, write to the Free Software |
20 |
* - Some of the original German comments used during development are left |
// * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 |
* in the code. They shouldn't bother anyone. |
// * |
22 |
* |
// * $Id: mem_transfer_ia64.s,v 1.6 2009-02-19 17:07:29 Isibaar Exp $ |
23 |
* Anmerkungen: |
// * |
24 |
* ============ |
// ***************************************************************************/ |
25 |
* |
// |
26 |
* - Alle Funtionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code |
// **************************************************************************** |
27 |
* jedes Element einzeln bearbeiten, bearbeiten die Funtionen dieses Assembler- |
// * |
28 |
* Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden. |
// * mem_transfer_ia64.s, IA-64 8bit<->16bit transfer |
29 |
* Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit |
// * |
30 |
* rotierenden Registern realisiert. |
// * This version was implemented during an IA-64 practical training at |
31 |
* - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet. |
// * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/) |
32 |
* - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke |
// * |
33 |
* geladen, beide Teile mit "shift"-Operationen zurechterückt und mit einem |
// **************************************************************************** |
34 |
* logischen Oder zusammenkopiert. |
|
35 |
* - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge- |
/////////////////////////////////////////////////////////////////////////////// |
36 |
* sichert werden, da die Register für die register-Rotation benötigt werden. |
// |
37 |
* - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase |
// mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel, |
38 |
* sind im Code verblieben. Sie sollten niemanden stören. |
// University of Karlsruhe, Germany, 03.06.2002, during the laboratory |
39 |
* |
// "IA-64 Video Codec Assember Parktikum" at IPD Goos. |
40 |
****************************************************************************/ |
|
41 |
|
///// History ///////////////////////////////////////////////////////////////// |
42 |
|
// |
43 |
|
// - 16.07.2002: several minor changes for ecc-conformity |
44 |
|
// - 03.06.2002: initial version |
45 |
|
// |
46 |
|
|
47 |
|
/////////////////////////////////////////////////////////////////////////////// |
48 |
|
// |
49 |
|
// Annotations: |
50 |
|
// =========== |
51 |
|
// |
52 |
|
// - All functions work on 8x8-matrices. While the C-code-functions treat each |
53 |
|
// element separately, the functions in this assembler-code treat a whole line
54 |
|
// simultaneously. So one loop is saved. |
55 |
|
// The remaining loop is realized by using software pipelining with rotating
56 |
|
// registers.
57 |
|
// - Register renaming is used for better readability |
58 |
|
// - To load 8 bytes of misaligned data, two 8-byte-blocks are loaded, both
59 |
|
// parts are shifted and joined together with an "OR"-Instruction. |
60 |
|
// - First parameter is stored in GR 32, next in GR 33, and so on. They must be |
61 |
|
// saved, as these GRs are used for register-rotation. |
62 |
|
// - Some of the original German comments used during development are left
63 |
|
// in the code. They shouldn't bother anyone. |
64 |
|
// |
65 |
|
// Anmerkungen: |
66 |
|
// ============ |
67 |
|
// |
68 |
|
// - Alle Funtionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code |
69 |
|
// jedes Element einzeln bearbeiten, bearbeiten die Funtionen dieses Assembler- |
70 |
|
// Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden. |
71 |
|
// Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit |
72 |
|
// rotierenden Registern realisiert. |
73 |
|
// - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet. |
74 |
|
// - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke |
75 |
|
// geladen, beide Teile mit "shift"-Operationen zurechterückt und mit einem |
76 |
|
// logischen Oder zusammenkopiert. |
77 |
|
// - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge- |
78 |
|
// sichert werden, da die Register für die register-Rotation benötigt werden. |
79 |
|
// - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase |
80 |
|
// sind im Code verblieben. Sie sollten niemanden stören. |
81 |
|
// |
82 |
|
/////////////////////////////////////////////////////////////////////////////// |
83 |
|
|
84 |
|
|
85 |
// *** define Latencies for software pipelines *** |
// *** define Latencies for software pipelines ***
97 |
.text |
.text |
98 |
|
|
99 |
|
|
100 |
/**************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
101 |
* |
// |
102 |
* transfer8x8_copy_ia64 |
// transfer8x8_copy_ia64 |
103 |
* |
// |
104 |
* SRC is misaligned; to align the source, load two 8-byte words, shift them, |
// SRC is misaligned; to align the source, load two 8-byte words, shift them,
105 |
* join them and store the aligned source into the destination address. |
// join them and store the aligned source into the destination address. |
106 |
* |
// |
107 |
****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
108 |
|
|
109 |
.align 16 |
.align 16 |
110 |
.global transfer8x8_copy_ia64# |
.global transfer8x8_copy_ia64# |
127 |
offset = r18 // shift right offset |
offset = r18 // shift right offset |
128 |
aoffset = r19 // shift left offset |
aoffset = r19 // shift left offset |
129 |
|
|
|
|
|
|
.body |
|
|
|
|
130 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
131 |
.save ar.lc, oldLC |
.save ar.lc, oldLC |
132 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
133 |
mov oldPR = pr |
mov oldPR = pr |
134 |
|
|
135 |
|
.body |
136 |
|
|
137 |
// *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR *** |
// *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR *** |
138 |
alloc r9 = ar.pfs, 3, 29, 0, 32 |
alloc r9 = ar.pfs, 3, 29, 0, 32 |
139 |
|
|
159 |
.rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1] |
.rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1] |
160 |
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1] |
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1] |
161 |
|
|
162 |
/* Software pipelined loop: |
|
163 |
* Stage 1: Load two 8-byte words from SRC_1, SRC_2 into SRC_v1 and SRC_v2 |
// Software pipelined loop: |
164 |
* Stage 2: Shift both values of source to SHD_R and SHD_L |
// Stage 1: Load two 8-byte words from SRC_1, SRC_2 into SRC_v1 and SRC_v2
165 |
* Stage 3: Join both parts together with OR |
// Stage 2: Shift both values of source to SHD_R and SHD_L |
166 |
* Stage 4: Store aligned data to destination and add stride to destination address */ |
// Stage 3: Join both parts together with OR |
167 |
|
// Stage 4: Store aligned data to destination and add stride to destination address
168 |
|
|
169 |
|
|
170 |
.Loop_8x8copy: |
.Loop_8x8copy: |
171 |
{.mii |
{.mii |
172 |
(ld_stage[0]) ld8 src_v1[0] = [src_1], stride |
(ld_stage[0]) ld8 src_v1[0] = [src_1], stride |
195 |
|
|
196 |
|
|
197 |
|
|
198 |
/***************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
199 |
* |
// |
200 |
* transfer_8to16copy_ia64 |
// transfer_8to16copy_ia64 |
201 |
* |
// |
202 |
* SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values, |
// SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values, |
203 |
* UNPACK is used. So 8 bytes are loaded from source, unpacked to two |
// UNPACK is used. So 8 bytes are loaded from source, unpacked to two |
204 |
* 4 x 16 bit values and stored to the destination. Destination is a continuous |
// 4 x 16 bit values and stored to the destination. Destination is a continuous |
205 |
* array of 64 x 16 bit signed data. To store the next line, only 16 must be |
// array of 64 x 16 bit signed data. To store the next line, only 16 must be |
206 |
* added to the destination address. |
// added to the destination address. |
207 |
*****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
208 |
|
|
209 |
.align 16 |
.align 16 |
210 |
.global transfer_8to16copy_ia64# |
.global transfer_8to16copy_ia64# |
225 |
src = r16 |
src = r16 |
226 |
stride = r17 |
stride = r17 |
227 |
|
|
|
|
|
|
.body |
|
|
|
|
228 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
229 |
.save ar.lc, oldLC |
.save ar.lc, oldLC |
230 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
231 |
mov oldPR = pr |
mov oldPR = pr |
232 |
|
|
233 |
|
|
234 |
|
.body |
235 |
|
|
236 |
// *** Allocating new stackframe, define rotating registers *** |
// *** Allocating new stackframe, define rotating registers *** |
237 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
238 |
|
|
253 |
.rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1] |
.rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1] |
254 |
.rotp ld_stage[LL], upack_stage[UL], st_stage[1] |
.rotp ld_stage[LL], upack_stage[UL], st_stage[1] |
255 |
|
|
256 |
/* Software pipelined loop: |
|
257 |
* Stage 1: Load value of SRC |
// Software pipelined loop: |
258 |
* Stage 2: Unpack the SRC_V to two 4 x 16 bit signed data |
// Stage 1: Load value of SRC |
259 |
* Stage 3: Store both 8 byte of 16 bit data */ |
// Stage 2: Unpack the SRC_V to two 4 x 16 bit signed data |
260 |
|
// Stage 3: Store both 8 byte of 16 bit data |
261 |
|
|
262 |
|
|
263 |
.Loop_8to16copy: |
.Loop_8to16copy: |
264 |
{.mii |
{.mii |
265 |
(ld_stage[0]) ld8 src_v[0] = [src], stride |
(ld_stage[0]) ld8 src_v[0] = [src], stride |
283 |
|
|
284 |
|
|
285 |
|
|
286 |
/***************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
287 |
* |
// |
288 |
* transfer_16to8copy_ia64 |
// transfer_16to8copy_ia64 |
289 |
* |
// |
290 |
* src is a 64 x 16 bit signed continuous array. To convert the 16 bit |
// src is a 64 x 16 bit signed continuous array. To convert the 16 bit |
291 |
* values to 8 bit unsigned data, PACK is used. So two 8-bytes-words of |
// values to 8 bit unsigned data, PACK is used. So two 8-bytes-words of |
292 |
* 4 x 16 bit signed data are loaded, packed together and stored a 8-byte-word |
// 4 x 16 bit signed data are loaded, packed together and stored a 8-byte-word |
293 |
* of 8 x 8 unsigned data to the destination. |
// of 8 x 8 unsigned data to the destination. |
294 |
****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
295 |
|
|
296 |
.align 16 |
.align 16 |
297 |
.global transfer_16to8copy_ia64# |
.global transfer_16to8copy_ia64# |
305 |
src_2 = r17 |
src_2 = r17 |
306 |
stride = r16 |
stride = r16 |
307 |
|
|
|
|
|
|
.body |
|
|
|
|
308 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
309 |
.save ar.lc, oldLC |
.save ar.lc, oldLC |
310 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
311 |
mov oldPR = pr |
mov oldPR = pr |
312 |
|
|
313 |
|
|
314 |
|
.body |
315 |
|
|
316 |
// *** Allocating new stackframe, define rotating registers *** |
// *** Allocating new stackframe, define rotating registers *** |
317 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
318 |
|
|
334 |
.rotp ld_stage[LL], pack_stage[PL], st_stage[1] |
.rotp ld_stage[LL], pack_stage[PL], st_stage[1] |
335 |
|
|
336 |
|
|
337 |
/* Software pipelined loop: |
// Software pipelined loop: |
338 |
* Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data |
// Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data |
339 |
* Stage 2: Pack them together to one 8 byte 8 x 8 bit unsigned data |
// Stage 2: Pack them together to one 8 byte 8 x 8 bit unsigned data |
340 |
* Stage 3: Store the 8 byte to the destination address and add stride to |
// Stage 3: Store the 8 byte to the destination address and add stride to |
341 |
* destination address (to get the next 8 byte line of destination)*/ |
// destination address (to get the next 8 byte line of destination) |
342 |
|
|
343 |
|
|
344 |
.Loop_16to8copy: |
.Loop_16to8copy: |
345 |
{.mmi |
{.mmi |
346 |
(ld_stage[0]) ld8 src_v1[0] = [src_1], 16 |
(ld_stage[0]) ld8 src_v1[0] = [src_1], 16 |
363 |
|
|
364 |
|
|
365 |
|
|
366 |
/***************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
367 |
* |
// |
368 |
* transfer_16to8add_ia64 |
// transfer_16to8add_ia64 |
369 |
* |
// |
370 |
* The 8-Bit-values of dst are "unpacked" into two 8-byte-blocks containing 16- |
// The 8-Bit-values of dst are "unpacked" into two 8-byte-blocks containing 16- |
371 |
* bit-values. These are "parallel-added" to the values of src. The result is |
// bit-values. These are "parallel-added" to the values of src. The result is |
372 |
* converted into 8-bit-values using "PACK" and stored at the address of dst. |
// converted into 8-bit-values using "PACK" and stored at the address of dst.
373 |
* We assume that there is no misalignment. |
// We assume that there is no misalignment. |
374 |
* |
// |
375 |
*****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
376 |
|
|
377 |
.align 16 |
.align 16 |
378 |
.global transfer_16to8add_ia64# |
.global transfer_16to8add_ia64# |
388 |
|
|
389 |
_src = r17 |
_src = r17 |
390 |
|
|
|
|
|
|
.body |
|
|
|
|
391 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
392 |
.save ar.lc, r2 |
.save ar.lc, r2 |
393 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
394 |
mov oldPR = pr |
mov oldPR = pr |
395 |
|
|
396 |
|
|
397 |
|
.body |
398 |
|
|
399 |
// *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR *** |
// *** Allocating new stackframe, initialize LC, Epilogue-Counter and PR *** |
400 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
401 |
|
|
416 |
.rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1] |
.rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1] |
417 |
|
|
418 |
|
|
419 |
/* Software pipelined loop: |
// Software pipelined loop: |
420 |
* s1_p: The values of src and dst are loaded |
// s1_p: The values of src and dst are loaded |
421 |
* s2_p: The dst-values are converted to 16-bit-values |
// s2_p: The dst-values are converted to 16-bit-values |
422 |
* s3_p: The values of src and dst are added |
// s3_p: The values of src and dst are added |
423 |
* s4_p: The Results are packed into 8-bit-values |
// s4_p: The Results are packed into 8-bit-values |
424 |
* s5_p: The 8-bit-values are stored at the dst-addresses |
// s5_p: The 8-bit-values are stored at the dst-addresses
425 |
*/ |
|
426 |
|
|
427 |
.Loop_16to8add: |
.Loop_16to8add: |
428 |
{.mii |
{.mii |
456 |
|
|
457 |
|
|
458 |
|
|
459 |
/***************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
460 |
* |
// |
461 |
* transfer_8to16sub_ia64 |
// transfer_8to16sub_ia64 |
462 |
* |
// |
463 |
* The 8-bit-values of ref and cur are loaded. cur is converted to 16-bit. The |
// The 8-bit-values of ref and cur are loaded. cur is converted to 16-bit. The |
464 |
* difference of cur and ref is stored at the dct-addresses and cur is copied |
// difference of cur and ref is stored at the dct-addresses and cur is copied
465 |
* into the ref-array. |
// into the ref-array. |
466 |
* |
// |
467 |
* You must assume that the data addressed by 'ref' is misaligned in memory. |
// You must assume that the data addressed by 'ref' is misaligned in memory.
468 |
* But you can assume, that the other data are aligned (at least I hope so). |
// But you can assume, that the other data are aligned (at least I hope so). |
469 |
* |
// |
470 |
****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
471 |
|
|
472 |
.align 16 |
.align 16 |
473 |
.global transfer_8to16sub_ia64# |
.global transfer_8to16sub_ia64# |
496 |
|
|
497 |
_dct = r21 // Register für die Zieladressen des 2. dct-Blocks |
_dct = r21 // Register für die Zieladressen des 2. dct-Blocks |
498 |
|
|
|
|
|
|
.body |
|
|
|
|
499 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
500 |
.save ar.lc, r2 |
.save ar.lc, r2 |
501 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
502 |
mov oldPR = pr |
mov oldPR = pr |
503 |
|
|
504 |
|
|
505 |
|
.body |
506 |
|
|
507 |
// *** Allocating new stackframe, define rotating registers *** |
// *** Allocating new stackframe, define rotating registers *** |
508 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
509 |
|
|
531 |
.rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1] |
.rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1] |
532 |
|
|
533 |
|
|
534 |
/* Software pipelined loop: |
// Software pipelined loop: |
535 |
* s1_p: The values of ref and cur are loaded, a copy of cur is made. |
// s1_p: The values of ref and cur are loaded, a copy of cur is made.
536 |
* s2_p: cur is converted to 16-bit and the misaligned values of ref are |
// s2_p: cur is converted to 16-bit and the misaligned values of ref are
537 |
* shifted... |
// shifted... |
538 |
* s3_p: ... and copied together. |
// s3_p: ... and copied together. |
539 |
* s4_p: This ref-value is converted to 16-bit. The values of cur are stored |
// s4_p: This ref-value is converted to 16-bit. The values of cur are stored |
540 |
* at the ref-addresses. |
// at the ref-addresses.
541 |
* s5_p: the ref- and cur-values are subtracted... |
// s5_p: the ref- and cur-values are subtracted...
542 |
* s6_p: ...and the result is stored at the dct-addresses. |
// s6_p: ...and the result is stored at the dct-addresses.
543 |
*/ |
|
544 |
|
|
545 |
loop_8to16sub: |
loop_8to16sub: |
546 |
{.mii |
{.mii |
586 |
|
|
587 |
|
|
588 |
|
|
589 |
/***************************************************************************** |
/////////////////////////////////////////////////////////////////////////////// |
590 |
* |
// |
591 |
* transfer_8to16sub2_ia64 |
// transfer_8to16sub2_ia64 |
592 |
* |
// |
593 |
* At the time, this function was written, it was not yet in use. |
// At the time, this function was written, it was not yet in use. |
594 |
* We assume that the values of ref1/2 are misaligned. |
// We assume that the values of ref1/2 are misaligned. |
595 |
* |
// |
596 |
* The values of ref1/2 and cur are loaded, the ref-values need misalignment- |
// The values of ref1/2 and cur are loaded, the ref-values need misalignment- |
597 |
* treatment. The values are converted to 16-bit using unpack. The average of |
// treatment. The values are converted to 16-bit using unpack. The average of |
598 |
* ref1 and ref2 is computed with pavg and subtracted from cur. The results are |
// ref1 and ref2 is computed with pavg and subtracted from cur. The results are
599 |
* stored at the dct-addresses. |
// stored at the dct-addresses.
600 |
* pavg1.raz is used to get the same results as the C-code-function. |
// pavg1.raz is used to get the same results as the C-code-function. |
601 |
* |
// |
602 |
*****************************************************************************/ |
/////////////////////////////////////////////////////////////////////////////// |
603 |
|
|
604 |
.text |
.text |
605 |
.align 16 |
.align 16 |
631 |
aoffset_1 = r24 |
aoffset_1 = r24 |
632 |
aoffset_2 = r25 |
aoffset_2 = r25 |
633 |
|
|
|
|
|
|
.body |
|
|
|
|
634 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
635 |
.save ar.lc, r2 |
.save ar.lc, r2 |
636 |
mov oldLC = ar.lc |
mov oldLC = ar.lc |
637 |
mov oldPR = pr |
mov oldPR = pr |
638 |
|
|
639 |
|
|
640 |
|
.body |
641 |
|
|
642 |
// *** Saving Parameters *** |
// *** Saving Parameters ***
643 |
// *** (as inputregisters r32 + are needed for register-rotation) *** |
// *** (as inputregisters r32 + are needed for register-rotation) *** |
644 |
mov dct_ar = r32 |
mov dct_ar = r32 |
673 |
.rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1] |
.rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1] |
674 |
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1] |
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1] |
675 |
|
|
676 |
/* software pipelined loop: |
|
677 |
* ld_stage: The values of ref1, ref2, cur are loaded |
// software pipelined loop: |
678 |
* sh_stage: The misaligned values of ref1/2 are shifted... |
// ld_stage: The values of ref1, ref2, cur are loaded |
679 |
* or_stage: ...and copied together. |
// sh_stage: The misaligned values of ref1/2 are shifted... |
680 |
* pavg_stage: The average of ref1 and ref2 is computed. |
// or_stage: ...and copied together. |
681 |
* up_stage: The result and the cur-values are converted to 16-bit. |
// pavg_stage: The average of ref1 and ref2 is computed. |
682 |
* psub_stage: Those values are subtracted... |
// up_stage: The result and the cur-values are converted to 16-bit. |
683 |
* st_stage: ...and stored at the dct-addresses. |
// psub_stage: Those values are subtracted...
684 |
*/ |
// st_stage: ...and stored at the dct-addresses.
685 |
|
|
686 |
|
|
687 |
.Loop_8to16sub2: |
.Loop_8to16sub2: |
688 |
{.mii |
{.mii |