1 |
.file "mem_transfer.c" |
/**************************************************************************** |
2 |
.pred.safe_across_calls p1-p5,p16-p63 |
* |
3 |
.common transfer_8to16copy#,8,8 |
* mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel, |
4 |
|
* University of Karlsruhe, Germany, 03.06.2002, during the laboratory |
5 |
|
* "IA-64 Video Codec Assembler Praktikum" at IPD Goos. |
6 |
|
* |
7 |
|
* Annotations: |
8 |
|
* =========== |
9 |
|
* |
10 |
|
* - All functions work on 8x8-matrices. While the C-code-functions treat each |
11 |
|
* element separately, the functions in this assembler code treat a whole line |
12 |
|
* simultaneously. So one loop is saved. |
13 |
|
* The remaining loop is realized by using software pipelining with rotating |
14 |
|
* registers. |
15 |
|
* - Register renaming is used for better readability |
16 |
|
* - To load 8 bytes of misaligned data, two 8-byte-blocks are loaded, both |
17 |
|
* parts are shifted and joined together with an "OR"-Instruction. |
18 |
|
* - First parameter is stored in GR 32, next in GR 33, and so on. They must be |
19 |
|
* saved, as these GRs are used for register-rotation. |
20 |
|
* - Some of the original, German comments used during development are left in |
21 |
|
* the code. They shouldn't bother anyone. |
22 |
|
* |
23 |
|
* Anmerkungen: |
24 |
|
* ============ |
25 |
|
* |
26 |
|
* - Alle Funktionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code |
27 |
|
* jedes Element einzeln bearbeiten, bearbeiten die Funktionen dieses Assembler- |
28 |
|
* Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden. |
29 |
|
* Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit |
30 |
|
* rotierenden Registern realisiert. |
31 |
|
* - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet. |
32 |
|
* - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke |
33 |
|
* geladen, beide Teile mit "shift"-Operationen zurechtgerückt und mit einem |
34 |
|
* logischen Oder zusammenkopiert. |
35 |
|
* - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge- |
36 |
|
* sichert werden, da die Register für die register-Rotation benötigt werden. |
37 |
|
* - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase |
38 |
|
* sind im Code verblieben. Sie sollten niemanden stören. |
39 |
|
* |
40 |
|
****************************************************************************/ |
41 |
|
|
42 |
|
|
43 |
|
// *** Stage latencies for the software pipelines; they size the rotating register/predicate arrays and the epilogue counts (ar.ec) of the loops below *** |
44 |
|
|
45 |
|
LL = 3 // Load latency |
46 |
|
SL = 3 // Store latency |
47 |
|
PL = 1 // Pack latency |
48 |
|
SHL = 1 // Shift latency |
49 |
|
OL = 1 // Or latency |
50 |
|
UL = 1 // Unpack latency |
51 |
|
PAL = 1 // Parallel Add latency |
52 |
|
PSL = 1 // Parallel Subtract latency |
53 |
|
PAVGL = 1 // Parallel Average latency |
54 |
|
|
55 |
.text |
.text |
56 |
|
|
57 |
|
|
58 |
|
/**************************************************************************** |
59 |
|
* |
60 |
|
* transfer8x8_copy_ia64 |
61 |
|
* Copies 8 rows of 8 bytes. As used below: dst = r32, src = r33, stride = r34 (added to both pointers per row). |
62 |
|
* SRC may be misaligned: to align it, two aligned 8-byte words are loaded per row, shifted, |
63 |
|
* joined with OR, and the aligned result is stored to the destination address. |
64 |
|
* |
65 |
|
****************************************************************************/ |
66 |
|
|
67 |
|
.align 16 |
68 |
|
.global transfer8x8_copy_ia64# |
69 |
|
.proc transfer8x8_copy_ia64# |
70 |
|
|
71 |
|
transfer8x8_copy_ia64: |
72 |
|
.prologue |
73 |
|
|
74 |
|
// *** register renaming (symbolic names for the registers used below) *** |
75 |
|
zero = r0 |
76 |
|
|
77 |
|
oldLC = r2 |
78 |
|
oldPR = r3 |
79 |
|
|
80 |
|
src_1 = r14 // src rounded down to an 8-byte boundary (first aligned word) |
81 |
|
src_2 = r15 // src_1 + 8 (second aligned word) |
82 |
|
dst = r16 // destination address |
83 |
|
stride = r17 |
84 |
|
|
85 |
|
offset = r18 // shift-right count in bits: (src & 7) * 8 |
86 |
|
aoffset = r19 // shift-left count: 64 - offset |
87 |
|
|
88 |
|
|
89 |
|
.body |
90 |
|
|
91 |
|
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
92 |
|
.save ar.lc, oldLC |
93 |
|
mov oldLC = ar.lc |
94 |
|
mov oldPR = pr |
95 |
|
|
96 |
|
// *** Allocating new stackframe: 3 inputs, 29 locals, 0 outputs, 32 rotating registers *** |
97 |
|
alloc r9 = ar.pfs, 3, 29, 0, 32 |
98 |
|
|
99 |
|
// *** Saving parameters (r32+ are reused by register rotation) *** |
100 |
|
mov dst = r32 |
101 |
|
mov stride = r34 |
102 |
|
|
103 |
|
// *** Misalignment treatment *** |
104 |
|
and src_1 = -8, r33 // address of the first aligned 8-byte block containing src data |
105 |
|
dep offset = r33, zero, 3, 3 // low 3 address bits moved to bit position 3: misalignment in bits, used as the shr.u count |
106 |
|
;; |
107 |
|
sub aoffset = 64, offset // complementary shift count ("anti-offset"), used for shl |
108 |
|
add src_2 = 8, src_1 // address of the second aligned 8-byte block containing src data |
109 |
|
|
110 |
|
// *** init loop: set loop counter (LC = 7, i.e. 8 rows), epilog counter, predicates *** |
111 |
|
mov ar.lc = 7 |
112 |
|
mov ar.ec = LL + SHL + OL + 1 |
113 |
|
mov pr.rot = 1 << 16 |
114 |
|
;; |
115 |
|
|
116 |
|
// *** define register arrays and predicate array for software pipeline *** |
117 |
|
// src_v1/src_v2 = loaded source words, shd_r = shifted right, shd_l = shifted left, value = joined result |
118 |
|
.rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1] |
119 |
|
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1] |
120 |
|
|
121 |
|
/* Software pipelined loop: |
122 |
|
* Stage 1: Load two aligned 8-byte words from SRC_1, SRC_2 into SRC_v1 and SRC_v2 |
123 |
|
* Stage 2: Shift both source words into SHD_R and SHD_L |
124 |
|
* Stage 3: Join both parts together with OR |
125 |
|
* Stage 4: Store aligned data to destination and add stride to destination address */ |
126 |
|
.Loop_8x8copy: |
127 |
|
{.mii |
128 |
|
(ld_stage[0]) ld8 src_v1[0] = [src_1], stride |
129 |
|
(sh_stage[0]) shr.u shd_r[0] = src_v1[LL], offset |
130 |
|
} |
131 |
|
{.mii |
132 |
|
(ld_stage[0]) ld8 src_v2[0] = [src_2], stride |
133 |
|
(sh_stage[0]) shl shd_l[0] = src_v2[LL], aoffset |
134 |
|
(or_stage[0]) or value[0] = shd_l[SHL], shd_r[SHL] |
135 |
|
} |
136 |
|
{.mib |
137 |
|
(st_stage[0]) st8 [dst] = value[OL] |
138 |
|
(st_stage[0]) add dst = dst, stride |
139 |
|
br.ctop.sptk.few .Loop_8x8copy |
140 |
|
;; |
141 |
|
} |
142 |
|
|
143 |
|
// *** Restore old LC and PRs before returning *** |
144 |
|
mov ar.lc = oldLC |
145 |
|
mov pr = oldPR, -1 |
146 |
|
|
147 |
|
br.ret.sptk.many b0 |
148 |
|
|
149 |
|
.endp transfer8x8_copy_ia64# |
150 |
|
|
151 |
|
|
152 |
|
|
153 |
|
|
154 |
|
/***************************************************************************** |
155 |
|
* |
156 |
|
* transfer_8to16copy_ia64 |
157 |
|
* |
158 |
|
* SRC is assumed to be aligned. To convert 8-bit unsigned values to 16-bit values, |
159 |
|
* UNPACK is used: 8 bytes are loaded from the source, unpacked into two |
160 |
|
* words of 4 x 16 bit and stored to the destination. The destination is a contiguous |
161 |
|
* array of 64 x 16-bit values, so to store the next line only 16 must be |
162 |
|
* added to the destination address. |
163 |
|
*****************************************************************************/ |
164 |
|
|
165 |
.align 16 |
.align 16 |
166 |
.global transfer_8to16copy_ia64# |
.global transfer_8to16copy_ia64# |
167 |
.proc transfer_8to16copy_ia64# |
.proc transfer_8to16copy_ia64# |
168 |
|
|
169 |
|
|
170 |
transfer_8to16copy_ia64: |
transfer_8to16copy_ia64: |
171 |
.prologue |
.prologue |
172 |
.save ar.lc, r2 |
|
173 |
mov r2 = ar.lc |
// *** register renaming (symbolic names for the pipelined version) *** |
174 |
|
oldLC = r2 |
175 |
|
oldPR = r3 |
176 |
|
|
177 |
|
zero = r0 // "zero" denotes the constant 0 |
178 |
|
|
179 |
|
dst_1 = r14 // destination address for first 4 x 16 bit values |
180 |
|
dst_2 = r15 // destination address for second 4 x 16 bit values |
181 |
|
src = r16 |
182 |
|
stride = r17 |
183 |
|
|
184 |
|
|
185 |
.body |
.body |
186 |
addl r14 = 7, r0 |
|
187 |
mov r21 = r0 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
188 |
mov r20 = r0 |
.save ar.lc, oldLC |
189 |
;; |
mov oldLC = ar.lc |
190 |
mov ar.lc = r14 |
mov oldPR = pr |
191 |
;; |
|
192 |
.L101: |
// *** Allocating new stackframe: 4 inputs, 92 locals, 0 outputs, 96 rotating registers *** |
193 |
addl r19 = 1, r0 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
194 |
zxt4 r14 = r21 |
|
195 |
dep.z r15 = r20, 1, 32 |
// *** Saving parameters *** |
196 |
;; |
mov dst_1 = r32 // first 4 x 16 bit values |
197 |
add r16 = r21, r19 |
add dst_2 = 8, r32 // second 4 x 16 bit values |
198 |
add r14 = r33, r14 |
mov src = r33 |
199 |
add r17 = r20, r19 |
mov stride = r34 |
200 |
;; |
|
201 |
ld1 r18 = [r14] |
// *** init loop: set loop counter (LC = 7, i.e. 8 rows), epilog counter, predicates *** |
202 |
add r15 = r15, r32 |
mov ar.lc = 7 |
203 |
zxt4 r16 = r16 |
mov ar.ec = LL + UL + 1 |
204 |
;; |
mov pr.rot = 1 << 16 |
205 |
st2 [r15] = r18 |
;; |
206 |
addl r19 = 2, r0 |
|
207 |
add r16 = r33, r16 |
// *** define register arrays and predicate array for software pipeline *** |
208 |
dep.z r17 = r17, 1, 32 |
// src_v = source value, dst_v1/dst_v2 = destination values 1 and 2 |
209 |
;; |
.rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1] |
210 |
ld1 r15 = [r16] |
.rotp ld_stage[LL], upack_stage[UL], st_stage[1] |
211 |
add r14 = r21, r19 |
|
212 |
add r18 = r20, r19 |
/* Software pipelined loop: |
213 |
add r17 = r17, r32 |
* Stage 1: Load 8 bytes of SRC and advance SRC by stride |
214 |
;; |
* Stage 2: Unpack SRC_V into two words of 4 x 16 bit data |
215 |
zxt4 r14 = r14 |
* Stage 3: Store both 8-byte words of 16 bit data */ |
216 |
st2 [r17] = r15 |
.Loop_8to16copy: |
217 |
addl r19 = 3, r0 |
{.mii |
218 |
;; |
(ld_stage[0]) ld8 src_v[0] = [src], stride |
219 |
add r14 = r33, r14 |
(upack_stage[0]) unpack1.l dst_v1[0] = zero, src_v[LL] |
220 |
add r15 = r21, r19 |
(upack_stage[0]) unpack1.h dst_v2[0] = zero, src_v[LL] |
221 |
dep.z r18 = r18, 1, 32 |
} |
222 |
;; |
{.mmb |
223 |
ld1 r17 = [r14] |
(st_stage[0]) st8 [dst_1] = dst_v1[UL], 16 |
224 |
add r16 = r20, r19 |
(st_stage[0]) st8 [dst_2] = dst_v2[UL], 16 |
225 |
add r18 = r18, r32 |
br.ctop.sptk.few .Loop_8to16copy |
226 |
zxt4 r15 = r15 |
;; |
227 |
;; |
} |
228 |
st2 [r18] = r17 |
|
229 |
addl r19 = 4, r0 |
// *** Restore old LC and PRs before returning *** |
230 |
add r15 = r33, r15 |
mov ar.lc = oldLC |
231 |
dep.z r16 = r16, 1, 32 |
mov pr = oldPR, -1 |
232 |
;; |
|
|
ld1 r18 = [r15] |
|
|
add r14 = r21, r19 |
|
|
add r17 = r20, r19 |
|
|
add r16 = r16, r32 |
|
|
;; |
|
|
zxt4 r14 = r14 |
|
|
st2 [r16] = r18 |
|
|
addl r19 = 5, r0 |
|
|
;; |
|
|
add r14 = r33, r14 |
|
|
add r15 = r21, r19 |
|
|
add r16 = r20, r19 |
|
|
dep.z r17 = r17, 1, 32 |
|
|
;; |
|
|
ld1 r18 = [r14] |
|
|
addl r19 = 6, r0 |
|
|
add r17 = r17, r32 |
|
|
zxt4 r15 = r15 |
|
|
;; |
|
|
st2 [r17] = r18 |
|
|
add r14 = r21, r19 |
|
|
add r15 = r33, r15 |
|
|
dep.z r16 = r16, 1, 32 |
|
|
add r17 = r20, r19 |
|
|
;; |
|
|
ld1 r18 = [r15] |
|
|
add r16 = r16, r32 |
|
|
zxt4 r14 = r14 |
|
|
;; |
|
|
st2 [r16] = r18 |
|
|
addl r19 = 7, r0 |
|
|
add r14 = r33, r14 |
|
|
;; |
|
|
ld1 r15 = [r14] |
|
|
add r16 = r21, r19 |
|
|
dep.z r17 = r17, 1, 32 |
|
|
add r14 = r20, r19 |
|
|
;; |
|
|
add r17 = r17, r32 |
|
|
zxt4 r16 = r16 |
|
|
;; |
|
|
st2 [r17] = r15 |
|
|
dep.z r14 = r14, 1, 32 |
|
|
add r16 = r33, r16 |
|
|
;; |
|
|
add r14 = r14, r32 |
|
|
ld1 r15 = [r16] |
|
|
add r21 = r21, r34 |
|
|
;; |
|
|
st2 [r14] = r15 |
|
|
adds r20 = 8, r20 |
|
|
br.cloop.sptk.few .L101 |
|
|
;; |
|
|
mov ar.lc = r2 |
|
233 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
234 |
.endp transfer_8to16copy_ia64# |
.endp transfer_8to16copy_ia64# |
235 |
.common transfer_16to8copy#,8,8 |
|
236 |
|
|
237 |
|
|
238 |
|
|
239 |
|
/***************************************************************************** |
240 |
|
* |
241 |
|
* transfer_16to8copy_ia64 |
242 |
|
* |
243 |
|
* src is a contiguous array of 64 x 16-bit signed values. To convert the 16-bit |
244 |
|
* values to 8-bit unsigned data, PACK is used: two 8-byte words of |
245 |
|
* 4 x 16-bit signed data are loaded, packed together and stored as one 8-byte word |
246 |
|
* of 8 x 8-bit unsigned data to the destination. |
247 |
|
****************************************************************************/ |
248 |
|
|
249 |
.align 16 |
.align 16 |
250 |
.global transfer_16to8copy_ia64# |
.global transfer_16to8copy_ia64# |
251 |
.proc transfer_16to8copy_ia64# |
.proc transfer_16to8copy_ia64# |
252 |
transfer_16to8copy_ia64: |
transfer_16to8copy_ia64: |
253 |
.prologue |
.prologue |
254 |
|
|
255 |
|
// *** register renaming (NOTE(review): oldLC/oldPR are not re-declared here; presumably still bound from an earlier routine - verify) *** |
256 |
|
dst = r14 |
257 |
|
src_1 = r15 |
258 |
|
src_2 = r17 |
259 |
|
stride = r16 |
260 |
|
|
261 |
|
|
262 |
.body |
.body |
263 |
mov r22 = r0 |
|
264 |
addl r21 = 255, r0 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
265 |
mov r20 = r0 |
.save ar.lc, oldLC |
266 |
mov r19 = r0 |
mov oldLC = ar.lc |
267 |
.L25: |
mov oldPR = pr |
268 |
mov r18 = r0 |
|
269 |
;; |
// *** Allocating new stackframe: 4 inputs, 92 locals, 0 outputs, 96 rotating registers *** |
270 |
.L29: |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
271 |
add r14 = r19, r18 |
|
272 |
;; |
// *** Saving parameters (src_2 addresses the second 8-byte word of each source row) *** |
273 |
dep.z r14 = r14, 1, 32 |
mov dst = r32 |
274 |
;; |
mov src_1 = r33 |
275 |
add r14 = r14, r33 |
add src_2 = 8, r33 |
276 |
;; |
mov stride = r34 |
277 |
ld2 r15 = [r14] |
|
278 |
;; |
// *** init loop: set loop counter (LC = 7, i.e. 8 rows), epilog counter, predicates *** |
279 |
sxt2 r15 = r15 |
mov ar.lc = 7 |
280 |
;; |
mov ar.ec = LL + PL + 1 |
281 |
mov r16 = r15 |
mov pr.rot = 1 << 16 |
282 |
;; |
;; |
283 |
cmp4.le p6, p7 = r0, r16 |
|
284 |
;; |
// *** define register arrays and predicate array for software pipeline *** |
285 |
(p7) mov r16 = r0 |
// src_v1/src_v2 = source values 1 and 2, dst_v = destination value |
286 |
(p7) br.cond.dpnt .L106 |
.rotr src_v1[LL+1], src_v2[LL+1], dst_v[PL+1] |
287 |
;; |
.rotp ld_stage[LL], pack_stage[PL], st_stage[1] |
288 |
cmp4.ge p6, p7 = r21, r16 |
|
289 |
;; |
|
290 |
(p7) addl r16 = 255, r0 |
/* Software pipelined loop: |
291 |
.L106: |
* Stage 1: Load two 8-byte words of 4 x 16 bit signed source data |
292 |
add r14 = r20, r18 |
* Stage 2: Pack them together into one 8-byte word of 8 x 8 bit unsigned data |
293 |
adds r17 = 1, r18 |
* Stage 3: Store the 8 bytes to the destination address and add stride to |
294 |
;; |
* destination address (to get the next 8 byte line of destination)*/ |
295 |
zxt4 r14 = r14 |
.Loop_16to8copy: |
296 |
add r15 = r19, r17 |
{.mmi |
297 |
;; |
(ld_stage[0]) ld8 src_v1[0] = [src_1], 16 |
298 |
add r14 = r32, r14 |
(ld_stage[0]) ld8 src_v2[0] = [src_2], 16 |
299 |
dep.z r15 = r15, 1, 32 |
(pack_stage[0]) pack2.uss dst_v[0] = src_v1[LL], src_v2[LL] |
300 |
;; |
} |
301 |
st1 [r14] = r16 |
{.mib |
302 |
add r15 = r15, r33 |
(st_stage[0]) st8 [dst] = dst_v[PL] |
303 |
;; |
(st_stage[0]) add dst = dst, stride |
304 |
ld2 r14 = [r15] |
br.ctop.sptk.few .Loop_16to8copy |
305 |
;; |
;; |
306 |
sxt2 r14 = r14 |
} |
307 |
;; |
|
308 |
mov r16 = r14 |
// *** Restore old LC and PRs before returning *** |
309 |
;; |
mov ar.lc = oldLC |
310 |
cmp4.le p6, p7 = r0, r16 |
mov pr = oldPR, -1 |
311 |
;; |
|
|
(p7) mov r16 = r0 |
|
|
(p7) br.cond.dpnt .L110 |
|
|
;; |
|
|
cmp4.ge p6, p7 = r21, r16 |
|
|
;; |
|
|
(p7) addl r16 = 255, r0 |
|
|
.L110: |
|
|
add r14 = r20, r17 |
|
|
adds r17 = 2, r18 |
|
|
;; |
|
|
zxt4 r14 = r14 |
|
|
add r15 = r19, r17 |
|
|
;; |
|
|
add r14 = r32, r14 |
|
|
dep.z r15 = r15, 1, 32 |
|
|
;; |
|
|
st1 [r14] = r16 |
|
|
add r15 = r15, r33 |
|
|
;; |
|
|
ld2 r14 = [r15] |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
mov r16 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r16 |
|
|
;; |
|
|
(p7) mov r16 = r0 |
|
|
(p7) br.cond.dpnt .L114 |
|
|
;; |
|
|
cmp4.ge p6, p7 = r21, r16 |
|
|
;; |
|
|
(p7) addl r16 = 255, r0 |
|
|
.L114: |
|
|
add r14 = r20, r17 |
|
|
adds r17 = 3, r18 |
|
|
;; |
|
|
zxt4 r14 = r14 |
|
|
add r15 = r19, r17 |
|
|
;; |
|
|
add r14 = r32, r14 |
|
|
dep.z r15 = r15, 1, 32 |
|
|
;; |
|
|
st1 [r14] = r16 |
|
|
add r15 = r15, r33 |
|
|
;; |
|
|
ld2 r14 = [r15] |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
mov r15 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r15 |
|
|
;; |
|
|
(p7) mov r15 = r0 |
|
|
(p7) br.cond.dpnt .L118 |
|
|
;; |
|
|
cmp4.ge p6, p7 = r21, r15 |
|
|
;; |
|
|
(p7) addl r15 = 255, r0 |
|
|
.L118: |
|
|
add r14 = r20, r17 |
|
|
adds r18 = 4, r18 |
|
|
;; |
|
|
zxt4 r14 = r14 |
|
|
cmp4.geu p6, p7 = 7, r18 |
|
|
;; |
|
|
add r14 = r32, r14 |
|
|
;; |
|
|
st1 [r14] = r15 |
|
|
(p6) br.cond.dptk .L29 |
|
|
adds r22 = 1, r22 |
|
|
add r20 = r20, r34 |
|
|
adds r19 = 8, r19 |
|
|
;; |
|
|
cmp4.geu p6, p7 = 7, r22 |
|
|
(p6) br.cond.dptk .L25 |
|
312 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
313 |
.endp transfer_16to8copy_ia64# |
.endp transfer_16to8copy_ia64# |
314 |
.common transfer_8to16sub#,8,8 |
|
315 |
|
|
316 |
|
|
317 |
|
/***************************************************************************** |
318 |
|
* |
319 |
|
* transfer_16to8add_ia64 |
320 |
|
* |
321 |
|
* The 8-bit values of dst are "unpacked" into two 8-byte blocks containing 16- |
322 |
|
* bit values. These are "parallel-added" to the values of src. The result is |
323 |
|
* converted into 8-bit values using "PACK" and stored at the address of dst. |
324 |
|
* We assume that there is no misalignment. |
325 |
|
* As used below: dst = r32, src = r33, stride = r34; the loop runs with LC = 7 (8 rows). |
326 |
|
*****************************************************************************/ |
327 |
|
|
328 |
|
.align 16 |
329 |
|
.global transfer_16to8add_ia64# |
330 |
|
.proc transfer_16to8add_ia64# |
331 |
|
|
332 |
|
transfer_16to8add_ia64: |
333 |
|
.prologue |
334 |
|
|
335 |
|
// *** register renaming (NOTE(review): oldLC/oldPR are not re-declared here; presumably still bound from an earlier routine - verify) *** |
336 |
|
dst = r14 |
337 |
|
src = r15 |
338 |
|
stride = r16 |
339 |
|
|
340 |
|
_src = r17 |
341 |
|
|
342 |
|
|
343 |
|
.body |
344 |
|
|
345 |
|
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
346 |
|
.save ar.lc, r2 |
347 |
|
mov oldLC = ar.lc |
348 |
|
mov oldPR = pr |
349 |
|
|
350 |
|
// *** Allocating new stackframe: 4 inputs, 92 locals, 0 outputs, 96 rotating registers *** |
351 |
|
alloc r9 = ar.pfs, 4, 92, 0, 96 |
352 |
|
|
353 |
|
// *** Saving parameters (_src addresses the second 8-byte word of each source row) *** |
354 |
|
mov dst = r32 |
355 |
|
mov src = r33 |
356 |
|
mov stride = r34 |
357 |
|
add _src = 8, r33 |
358 |
|
|
359 |
|
// *** init loop: set loop counter, epilog counter, predicates *** |
360 |
|
mov ar.lc = 7 |
361 |
|
mov ar.ec = LL + UL + PAL + PL + 1 |
362 |
|
mov pr.rot = 1 << 16 |
363 |
|
;; |
364 |
|
|
365 |
|
// *** define register arrays and predicate array for software pipeline *** |
366 |
|
.rotr _dst[LL+UL+PAL+PL+1], dst8[PL+1], pixel_1[PAL+1], pixel_2[PAL+1], w_dst16_1[UL+1], w_src_1[LL+UL+1], w_dst16_2[UL+1], w_src_2[LL+UL+1], w_dst8[LL+1] |
367 |
|
.rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1] |
368 |
|
|
369 |
|
|
370 |
|
/* Software pipelined loop: |
371 |
|
* s1_p: The values of src and dst are loaded; the current dst address is saved in _dst |
372 |
|
* s2_p: The dst-values are converted to 16-bit-values |
373 |
|
* s3_p: The values of src and dst are added |
374 |
|
* s4_p: The results are packed into 8-bit-values |
375 |
|
* s5_p: The 8-bit-values are stored at the saved dst-addresses |
376 |
|
*/ |
377 |
|
|
378 |
|
.Loop_16to8add: |
379 |
|
{.mii |
380 |
|
(s1_p[0]) ld8 w_src_1[0] = [src], 16 // load the first half of row j of src (i = 0..3) |
381 |
|
(s1_p[0]) mov _dst[0] = dst // keep a copy of the current dst address for the store stage |
382 |
|
(s3_p[0]) padd2.sss pixel_1[0] = w_dst16_1[UL], w_src_1[LL+UL] // parallel add of src and dst |
383 |
|
} |
384 |
|
{.mii |
385 |
|
(s1_p[0]) ld8 w_dst8[0] = [dst], stride // load row j of dst and advance dst by stride |
386 |
|
(s2_p[0]) unpack1.l w_dst16_1[0] = r0, w_dst8[LL]; // dst is converted to 16 bit for i = 0..3 |
387 |
|
(s2_p[0]) unpack1.h w_dst16_2[0] = r0, w_dst8[LL]; // dst is converted to 16 bit for i = 4..7 |
388 |
|
} |
389 |
|
{.mii |
390 |
|
(s1_p[0]) ld8 w_src_2[0] = [_src], 16 // load the second half of row j of src (i = 4..7) |
391 |
|
(s3_p[0]) padd2.sss pixel_2[0] = w_dst16_2[UL], w_src_2[LL+UL] // parallel add of src and dst |
392 |
|
(s4_p[0]) pack2.uss dst8[0] = pixel_1[PAL], pixel_2[PAL] // converts the sums (pixel) into 8-bit values; the range check is done by the instruction itself |
393 |
|
} |
394 |
|
{.mmb |
395 |
|
(s5_p[0]) st8 [_dst[LL+UL+PAL+PL]] = dst8[PL] // store the result at the saved dst address |
396 |
|
(s1_p[0]) nop.m 0 |
397 |
|
br.ctop.sptk.few .Loop_16to8add |
398 |
|
;; |
399 |
|
} |
400 |
|
|
401 |
|
// *** Restore old LC and PRs before returning *** |
402 |
|
mov ar.lc = oldLC |
403 |
|
mov pr = oldPR, -1 |
404 |
|
|
405 |
|
br.ret.sptk.many b0 |
406 |
|
.endp transfer_16to8add_ia64# |
407 |
|
|
408 |
|
|
409 |
|
|
410 |
|
/***************************************************************************** |
411 |
|
* |
412 |
|
* transfer_8to16sub_ia64 |
413 |
|
* |
414 |
|
* The 8-bit values of ref and cur are loaded and both are converted to 16 bit. The |
415 |
|
* difference of cur and ref is stored at the dct addresses, and the (aligned) ref |
416 |
|
* value is stored at the cur address. |
417 |
|
* |
418 |
|
* You must assume that the data addressed by 'ref' are misaligned in memory. |
419 |
|
* But you can assume that the other data are aligned (at least I hope so). |
420 |
|
* |
421 |
|
****************************************************************************/ |
422 |
|
|
423 |
.align 16 |
.align 16 |
424 |
.global transfer_8to16sub_ia64# |
.global transfer_8to16sub_ia64# |
425 |
.proc transfer_8to16sub_ia64# |
.proc transfer_8to16sub_ia64# |
426 |
|
|
427 |
|
|
428 |
transfer_8to16sub_ia64: |
transfer_8to16sub_ia64: |
429 |
.prologue |
.prologue |
430 |
|
|
431 |
|
// *** register renaming *** |
432 |
|
oldLC = r2 |
433 |
|
oldPR = r3 |
434 |
|
|
435 |
|
zero = r0 // "zero" denotes the constant 0 |
436 |
|
|
437 |
|
// The following registers are given the same names as the variables in the C reference code |
438 |
|
dct = r14 |
439 |
|
cur = r15 |
440 |
|
ref = r34 // does not need to be saved separately, so the incoming parameter register stays in this list |
441 |
|
stride = r16 |
442 |
|
|
443 |
|
offset = r17 // offset of the misaligned data, used to shift it into place |
444 |
|
aoffset = r18 // counterpart of offset (64 - offset) |
445 |
|
ref_a1 = r19 // address of the first 64-bit block of ref |
446 |
|
ref_a2 = r20 // address of the second 64-bit block of ref |
447 |
|
|
448 |
|
_dct = r21 // register for the destination addresses of the second dct block |
449 |
|
|
450 |
|
|
451 |
.body |
.body |
452 |
mov r25 = r0 |
|
453 |
mov r24 = r0 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
454 |
mov r23 = r0 |
.save ar.lc, r2 |
455 |
.L39: |
mov oldLC = ar.lc |
456 |
mov r22 = r0 |
mov oldPR = pr |
457 |
;; |
|
458 |
.L43: |
// *** Allocating new stackframe: 4 inputs, 92 locals, 0 outputs, 96 rotating registers *** |
459 |
add r15 = r23, r22 |
alloc r9 = ar.pfs, 4, 92, 0, 96 |
460 |
adds r20 = 1, r22 |
|
461 |
add r16 = r24, r22 |
// *** Saving parameters *** |
462 |
;; |
mov dct = r32 |
463 |
zxt4 r15 = r15 |
mov cur = r33 |
464 |
add r18 = r23, r20 |
// mov ref = r34: ref is unaligned, get aligned ref below... |
465 |
dep.z r16 = r16, 1, 32 |
mov stride = r35 |
466 |
;; |
|
467 |
add r19 = r34, r15 |
and ref_a1 = -8, ref // compute the address of the first 64-bit block containing ref (clears the low 3 address bits) |
468 |
zxt4 r18 = r18 |
dep offset = ref, zero, 3, 3 |
469 |
add r16 = r16, r32 |
;; |
470 |
add r15 = r33, r15 |
add ref_a2 = 8, ref_a1 |
471 |
;; |
sub aoffset = 64, offset // compute the counterpart of offset |
472 |
ld1 r14 = [r19] |
add _dct = 8, dct // address of the second dct block: 8 bytes (= 64 bits) above the first block |
473 |
add r21 = r34, r18 |
|
474 |
ld1 r17 = [r15] |
// *** init loop: set loop counter (LC = 7, i.e. 8 rows), epilog counter, predicates *** |
475 |
adds r19 = 2, r22 |
mov ar.lc = 7 |
476 |
add r18 = r33, r18 |
mov ar.ec = LL + SHL + OL + UL + PSL + 1 |
477 |
;; |
mov pr.rot = 1 << 16 |
478 |
st1 [r15] = r14 |
;; |
479 |
sub r17 = r17, r14 |
|
480 |
add r20 = r24, r20 |
// *** define register arrays and predicate array for software pipeline *** |
481 |
;; |
.rotr c[LL+1], ref_v1[LL+1], ref_v2[LL+1], c16_1[SHL+OL+UL+1], c16_2[SHL+OL+UL+1], ref_shdr[SHL+1], ref_shdl[SHL+1], r[OL+1], r16_1[UL+1], r16_2[UL+1], dct_1[PSL+1], dct_2[PSL+1], _cur[LL+SHL+OL+UL+1] |
482 |
st2 [r16] = r17 |
.rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1] |
483 |
dep.z r20 = r20, 1, 32 |
|
484 |
ld1 r14 = [r21] |
|
485 |
ld1 r15 = [r18] |
/* Software pipelined loop: |
486 |
add r16 = r23, r19 |
* s1_p: The values of ref and cur are loaded, a copy of the cur address is made. |
487 |
;; |
* s2_p: cur is converted to 16-bit and the misaligned values of ref are |
488 |
st1 [r18] = r14 |
* shifted... |
489 |
sub r15 = r15, r14 |
* s3_p: ... and joined together with OR. |
490 |
zxt4 r16 = r16 |
* s4_p: This ref-value is converted to 16-bit. The joined ref-value is stored |
491 |
add r20 = r20, r32 |
* at the saved cur-address. |
492 |
;; |
* s5_p: the ref- and cur-values are subtracted... |
493 |
add r18 = r34, r16 |
* s6_p: ...and the result is stored at the dct-addresses. |
494 |
adds r17 = 3, r22 |
*/ |
495 |
st2 [r20] = r15 |
|
496 |
add r16 = r33, r16 |
loop_8to16sub: |
497 |
add r19 = r24, r19 |
{.mii |
498 |
;; |
(s1_p[0]) ld8 ref_v1[0] = [ref_a1], stride // load the first 64-bit block holding part of the ref data |
499 |
ld1 r14 = [r18] |
(s1_p[0]) mov _cur[0] = cur // cur is saved for later use |
500 |
add r15 = r23, r17 |
(s2_p[0]) shr.u ref_shdr[0] = ref_v1[LL], offset // the right half is shifted into place |
501 |
dep.z r19 = r19, 1, 32 |
} |
502 |
ld1 r18 = [r16] |
{.mii |
503 |
;; |
(s1_p[0]) ld8 ref_v2[0] = [ref_a2], stride // load the second 64-bit block |
504 |
zxt4 r15 = r15 |
(s2_p[0]) shl ref_shdl[0] = ref_v2[LL], aoffset // the left half is shifted into place |
505 |
add r19 = r19, r32 |
(s3_p[0]) or r[0] = ref_shdr[SHL], ref_shdl[SHL] // the shifted parts are merged into r |
506 |
st1 [r16] = r14 |
} |
507 |
sub r18 = r18, r14 |
{.mii |
508 |
;; |
(s1_p[0]) ld8 c[0] = [cur], stride // load the complete row j of cur |
509 |
add r20 = r34, r15 |
(s2_p[0]) unpack1.l c16_1[0] = zero, c[LL]; // c is converted to 16 bit for i = 0..3 |
510 |
st2 [r19] = r18 |
(s2_p[0]) unpack1.h c16_2[0] = zero, c[LL]; // c is converted to 16 bit for i = 4..7 |
511 |
add r15 = r33, r15 |
} |
512 |
add r17 = r24, r17 |
{.mii |
513 |
;; |
(s4_p[0]) st8 [_cur[LL+SHL+OL]] = r[OL] // r is stored at the saved cur address (cur is set to the value of r) |
514 |
ld1 r14 = [r20] |
// Convert the 8-bit r and c values into 16-bit values |
515 |
ld1 r16 = [r15] |
(s4_p[0]) unpack1.l r16_1[0] = zero, r[OL]; // r is converted to 16 bit for i = 0..3 |
516 |
dep.z r17 = r17, 1, 32 |
(s4_p[0]) unpack1.h r16_2[0] = zero, r[OL]; // r is converted to 16 bit for i = 4..7 |
517 |
;; |
} |
518 |
add r17 = r17, r32 |
{.mii |
519 |
adds r22 = 4, r22 |
(s5_p[0]) psub2.sss dct_1[0] = c16_1[SHL+OL+UL], r16_1[UL] // subtraction for the first half of row j |
520 |
st1 [r15] = r14 |
(s5_p[0]) psub2.sss dct_2[0] = c16_2[SHL+OL+UL], r16_2[UL] // subtraction for the second half |
521 |
sub r16 = r16, r14 |
} |
522 |
;; |
{.mmb |
523 |
cmp4.geu p6, p7 = 7, r22 |
(s6_p[0]) st8 [dct] = dct_1[PSL], 16 // store the first 64-bit block at its target address, then advance the address by 16 bytes for the next row |
524 |
st2 [r17] = r16 |
(s6_p[0]) st8 [_dct] = dct_2[PSL], 16 // store the second 64-bit block at its target address, then advance the address by 16 bytes for the next row |
525 |
(p6) br.cond.dptk .L43 |
br.ctop.sptk.few loop_8to16sub // loop back |
526 |
adds r25 = 1, r25 |
;; |
527 |
adds r24 = 8, r24 |
} |
528 |
add r23 = r23, r35 |
|
529 |
;; |
// *** Restore old LC and PRs before returning *** |
530 |
cmp4.geu p6, p7 = 7, r25 |
mov ar.lc = oldLC |
531 |
(p6) br.cond.dptk .L39 |
mov pr = oldPR, -1 |
532 |
|
|
533 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
534 |
.endp transfer_8to16sub_ia64# |
.endp transfer_8to16sub_ia64# |
535 |
.common transfer_8to16sub2#,8,8 |
|
536 |
|
|
537 |
|
|
538 |
|
|
539 |
|
|
540 |
|
/***************************************************************************** |
541 |
|
* |
542 |
|
* transfer_8to16sub2_ia64 |
543 |
|
* |
544 |
|
* At the time, this function was written, it was not yet in use. |
545 |
|
* We assume that the values of ref1/2 are misaligned. |
546 |
|
* |
547 |
|
* The values of ref1/2 and cur are loaded, the ref-values need misalignment- |
548 |
|
* treatment. The values are converted to 16-bit using unpack. The average of |
549 |
|
* ref1 and ref2 is computed with pavg and subtracted from cur. The results are |
550 |
|
* stored at the dct addresses. |
551 |
|
* pavg1.raz is used to get the same results as the C-code-function. |
552 |
|
* |
553 |
|
*****************************************************************************/ |
554 |
|
|
555 |
|
.text |
556 |
.align 16 |
.align 16 |
557 |
.global transfer_8to16sub2_ia64# |
.global transfer_8to16sub2_ia64# |
558 |
.proc transfer_8to16sub2_ia64# |
.proc transfer_8to16sub2_ia64# |
559 |
|
|
560 |
transfer_8to16sub2_ia64: |
transfer_8to16sub2_ia64: |
561 |
.prologue |
.prologue |
562 |
.save ar.lc, r2 |
|
563 |
mov r2 = ar.lc |
// *** register renaming *** |
564 |
|
// We've tried to keep the C-Code names as often as possible, at least as |
565 |
|
// part of register-names |
566 |
|
oldLC = r2 |
567 |
|
oldPR = r3 |
568 |
|
|
569 |
|
zero = r0 |
570 |
|
|
571 |
|
dct_al = r14 // dct: adress of left block in one line |
572 |
|
dct_ar = r15 // dct: adress of right block in one line |
573 |
|
cur = r16 |
574 |
|
ref1_al = r17 // ref1: aligned adress of lower part |
575 |
|
ref1_ah = r18 // ref1: aligned adress of higher part |
576 |
|
ref2_al = r19 // ref2: aligned adress of lower part |
577 |
|
ref2_ah = r20 // ref2: aligned adress of higher part |
578 |
|
stride = r21 |
579 |
|
|
580 |
|
offset_1 = r22 |
581 |
|
offset_2 = r23 |
582 |
|
aoffset_1 = r24 |
583 |
|
aoffset_2 = r25 |
584 |
|
|
585 |
|
|
586 |
.body |
.body |
587 |
mov r28 = r0 |
|
588 |
addl r27 = 255, r0 |
// *** Saving old Loop-Counter (LC) and Predicate Registers (PR) *** |
|
mov r26 = r0 |
|
|
mov r25 = r0 |
|
|
.L50: |
|
|
addl r14 = 3, r0 |
|
|
mov r21 = r0 |
|
|
;; |
|
|
mov ar.lc = r14 |
|
|
;; |
|
|
.L138: |
|
|
add r14 = r26, r21 |
|
|
add r17 = r25, r21 |
|
|
adds r19 = 1, r21 |
|
|
;; |
|
|
zxt4 r17 = r17 |
|
|
dep.z r14 = r14, 1, 32 |
|
|
add r18 = r25, r19 |
|
|
;; |
|
|
add r15 = r34, r17 |
|
|
add r23 = r14, r32 |
|
|
add r20 = r35, r17 |
|
|
;; |
|
|
ld1 r14 = [r15] |
|
|
ld1 r16 = [r20] |
|
|
add r17 = r33, r17 |
|
|
;; |
|
|
add r14 = r14, r16 |
|
|
ld1 r15 = [r17] |
|
|
zxt4 r18 = r18 |
|
|
;; |
|
|
adds r14 = 1, r14 |
|
|
add r24 = r35, r18 |
|
|
add r22 = r34, r18 |
|
|
;; |
|
|
shr.u r14 = r14, 1 |
|
|
add r19 = r26, r19 |
|
|
add r16 = r33, r18 |
|
|
;; |
|
|
cmp4.ge p6, p7 = r27, r14 |
|
|
dep.z r19 = r19, 1, 32 |
|
|
adds r21 = 2, r21 |
|
|
;; |
|
|
(p7) addl r14 = 255, r0 |
|
|
add r19 = r19, r32 |
|
|
;; |
|
|
sub r14 = r15, r14 |
|
|
;; |
|
|
st2 [r23] = r14 |
|
|
ld1 r14 = [r24] |
|
|
ld1 r15 = [r22] |
|
|
ld1 r16 = [r16] |
|
|
;; |
|
|
add r15 = r15, r14 |
|
|
;; |
|
|
adds r15 = 1, r15 |
|
|
;; |
|
|
shr.u r14 = r15, 1 |
|
|
;; |
|
|
cmp4.ge p6, p7 = r27, r14 |
|
|
;; |
|
|
(p7) addl r14 = 255, r0 |
|
|
;; |
|
|
sub r14 = r16, r14 |
|
|
;; |
|
|
st2 [r19] = r14 |
|
|
br.cloop.sptk.few .L138 |
|
|
adds r28 = 1, r28 |
|
|
adds r26 = 8, r26 |
|
|
add r25 = r25, r36 |
|
|
;; |
|
|
cmp4.geu p6, p7 = 7, r28 |
|
|
(p6) br.cond.dptk .L50 |
|
|
mov ar.lc = r2 |
|
|
br.ret.sptk.many b0 |
|
|
.endp transfer_8to16sub2_ia64# |
|
|
.common transfer_16to8add#,8,8 |
|
|
// NOTE(review): starting at the ".prologue" line, this listing appears to
// interleave three things row by row: a bare number (589..680), one line of
// transfer_16to8add_ia64 (plain rNN registers, labels .L62/.L149/.L143/.L147)
// and one line of a second routine written with symbolic register names
// (dct_ar, cur, ref1_al, ...) that ends in ".endp transfer_8to16sub2_ia64#".
// Some rows have an empty cell. Untangle the two streams before assembling.
// Comments added below are prefixed "16to8add:" and describe only the
// plain-register stream.
//
// transfer_16to8add_ia64, as written in the plain-register stream:
//   r32  base of a byte buffer; each element is read (ld1), summed and
//        written back (st1) in place
//   r33  base of a buffer of 16-bit values (ld2), read consecutively
//   r34  amount added to the r32-side offset after every group of 8 elements
//   For 8 groups of 8 elements: byte = clamp(byte + value16, 0, 255).
//   ar.lc is saved in r2 on entry and restored before the return.
.align 16 |
|
|
.global transfer_16to8add_ia64# |
|
|
.proc transfer_16to8add_ia64# |
|
|
transfer_16to8add_ia64: |
|
|
.prologue |
|
589
.save ar.lc, r2
.save ar.lc, r2
590
// 16to8add: keep the caller's ar.lc in r2; it is written back before the return
mov r2 = ar.lc
mov oldLC = ar.lc
591
.body
mov oldPR = pr
592
// 16to8add: r26 = outer pass counter, r25 = 255 (upper clamp bound),
// 16to8add: r24 = running element index into the 16-bit buffer,
// 16to8add: r21 = running byte offset into the r32 buffer
mov r26 = r0

593
addl r25 = 255, r0
// *** Saving parameters ***
594
mov r24 = r0
// *** (as input registers r32 and up are needed for register rotation) ***
595
mov r21 = r0
mov dct_ar = r32
596
// 16to8add: start of one outer pass; ar.lc = 3 gives four trips through
// 16to8add: .L149, and each trip handles two elements (r20 and r20 + 1)
.L62:
add dct_al = 8, r32
597
addl r14 = 3, r0
mov cur = r33
598
mov r20 = r0

599
;;
and ref1_al = -8, r34
600
mov ar.lc = r14
and ref2_al = -8, r35 // ref2 aligned address of lower part
601
;;

602
// 16to8add: r20 = index of the first element of the pair, r17 = r20 + 1
.L149:
mov stride = r36
603
adds r17 = 1, r20

604
add r14 = r21, r20
// *** Calculations for misalignment handling ***
605
add r15 = r24, r20
dep offset_1 = r34, zero, 3, 3
606
;;
dep offset_2 = r35, zero, 3, 3
607
zxt4 r14 = r14
;;
608
add r18 = r21, r17
add ref1_ah = 8, ref1_al
609
// 16to8add: dep.z keeps the low 32 bits of the element index and shifts them
// 16to8add: left by one, turning the index into a byte offset for 2-byte elements
dep.z r15 = r15, 1, 32
add ref2_ah = 8, ref2_al
610
;;
sub aoffset_1 = 64, offset_1
611
add r23 = r32, r14
sub aoffset_2 = 64, offset_2
612
zxt4 r18 = r18
;;
613
add r15 = r15, r33

614
;;
// *** Allocating new stackframe, define rotating registers ***
615
mov r16 = r23
alloc r9 = ar.pfs, 5, 91, 0, 96
616
add r22 = r32, r18

617
ld2 r14 = [r15]
// *** init loop: set loop counter, epilog counter, predicates ***
618
;;
mov ar.lc = 7
619
ld1 r18 = [r16]
mov ar.ec = LL + SHL + OL + PAVGL + UL +PSL + 1
620
add r19 = r24, r17
mov pr.rot = 1 << 16
621
adds r20 = 2, r20
;;
622
;;

623
// 16to8add: r14 = 16-bit value + byte; it is sign-extended from 16 bits and
// 16to8add: then clamped: below 0 -> 0 (p7), above 255 -> 255 (p9)
add r14 = r14, r18
// *** define register arrays and predicate array for software pipeline ***
624
dep.z r19 = r19, 1, 32
.rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1]
625
mov r16 = r22
.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1]
626
;;

627
sxt2 r14 = r14
/* software pipelined loop:
628
add r19 = r19, r33
* ld_stage: The values of ref1, ref2, cur are loaded
629
;;
* sh_stage: The misaligned values of ref1/2 are shifted...
630
cmp4.le p6, p7 = r0, r14
* or_stage: ...and copied together.
631
cmp4.ge p8, p9 = r25, r14
* pavg_stage: The average of ref1 and ref2 is computed.
632
;;
* up_stage: The result and the cur-values are converted to 16-bit.
633
(p7) mov r14 = r0
* psub_stage: Those values are subtracted...
634
(p7) br.cond.dpnt .L143
* st_stage: ...and stored at the dct-addresses.
635
;;
*/
636
(p9) addl r14 = 255, r0

637
;;
.Loop_8to16sub2:
638
// 16to8add: store the clamped first element, then repeat the same
// 16to8add: load / add / sign-extend / clamp sequence for the second element
.L143:
{.mii
639
st1 [r23] = r14
(ld_stage[0]) ld8 c[0] = [cur], stride
640
ld1 r14 = [r22]
(sh_stage[0]) shr.u ref1_l[0] = ref1_vl[LL], offset_1
641
ld2 r15 = [r19]
(sh_stage[0]) shl ref1_h[0] = ref1_vh[LL], aoffset_1
642
;;
}
643
add r15 = r15, r14
{.mii
644
;;
(ld_stage[0]) ld8 ref1_vl[0] = [ref1_al], stride
645
sxt2 r15 = r15
(sh_stage[0]) shr.u ref2_l[0] = ref2_vl[LL], offset_2
646
;;
(sh_stage[0]) shl ref2_h[0] = ref2_vh[LL], aoffset_2
647
cmp4.le p6, p7 = r0, r15
}
648
cmp4.ge p8, p9 = r25, r15
{.mii
649
;;
(ld_stage[0]) ld8 ref1_vh[0] = [ref1_ah], stride
650
(p7) mov r15 = r0
(or_stage[0]) or ref1_aligned[0] = ref1_h[SHL], ref1_l[SHL]
651
(p7) br.cond.dpnt .L147
(or_stage[0]) or ref2_aligned[0] = ref2_h[SHL], ref2_l[SHL]
652
;;
}
653
(p9) addl r15 = 255, r0
{.mii
654
;;
(ld_stage[0]) ld8 ref2_vl[0] = [ref2_al], stride
655
// 16to8add: store the clamped second element; br.cloop branches back to
// 16to8add: .L149 and decrements ar.lc while ar.lc is non-zero
.L147:
(pavg_stage[0]) pavg1.raz r[0] = ref1_aligned[OL], ref2_aligned[OL]
656
st1 [r16] = r15
(up_stage[0]) unpack1.l r16_r[0] = zero, r[PAVGL]
657
br.cloop.sptk.few .L149
}
658
// 16to8add: next outer pass: the 16-bit index advances by 8, the byte offset
// 16to8add: by r34; loop to .L62 while the incremented pass count is <= 7
adds r26 = 1, r26
{.mii
659
adds r24 = 8, r24
(ld_stage[0]) ld8 ref2_vh[0] = [ref2_ah], stride
660
add r21 = r21, r34
(up_stage[0]) unpack1.h r16_l[0] = zero, r[PAVGL]
661
;;
(up_stage[0]) unpack1.l c16_r[0] = zero, c[LL+SHL+OL+PAVGL]
662
cmp4.geu p6, p7 = 7, r26
}
663
(p6) br.cond.dptk .L62
{.mii
664
// 16to8add: restore the ar.lc value saved on entry
mov ar.lc = r2
(st_stage[0]) st8 [dct_ar] = dct16_r[PSL], 16
665

(up_stage[0]) unpack1.h c16_l[0] = zero, c[LL+SHL+OL+PAVGL]
666

(psub_stage[0]) psub2.sss dct16_l[0] = c16_l[UL], r16_l[UL]
667

}
668

{.mib
669

(st_stage[0]) st8 [dct_al] = dct16_l[PSL], 16
670

(psub_stage[0]) psub2.sss dct16_r[0] = c16_r[UL], r16_r[UL]
671

br.ctop.sptk.few .Loop_8to16sub2 // and off we go, back to the loop top
672

;;
673

}
674


675

// *** Restore old LC and PRs ***
676

mov ar.lc = oldLC
677

mov pr = oldPR, -1
678


679
br.ret.sptk.many b0
br.ret.sptk.many b0
680
.endp transfer_16to8add_ia64#
.endp transfer_8to16sub2_ia64#
|
.common transfer8x8_copy#,8,8 |
|
|
// transfer8x8_copy_ia64: copies an 8x8 block of bytes.
//   r32  destination base address (target of every st1)
//   r33  source base address (source of every ld1)
//   r34  step added to the row offset after each row of 8 bytes; the same
//        offset is applied to both buffers
// ar.lc is saved in r2 on entry and restored before returning.
.align 16 |
|
|
.global transfer8x8_copy_ia64# |
|
|
.proc transfer8x8_copy_ia64# |
|
|
transfer8x8_copy_ia64: |
|
|
.prologue |
|
|
.save ar.lc, r2 |
|
|
// keep the caller's loop count so it can be put back on exit
mov r2 = ar.lc |
|
|
.body |
|
|
// ar.lc = 7: the body at .L168 runs 8 times; r21 = running row offset, starts at 0
addl r14 = 7, r0 |
|
|
mov r21 = r0 |
|
|
;; |
|
|
mov ar.lc = r14 |
|
|
;; |
|
|
// One row per trip: for k = 0..7 the byte at r33 + zxt4(r21 + k) is loaded and
// stored to r32 + zxt4(r21 + k). Work for consecutive k is overlapped across
// instruction groups, so scratch registers r14-r20 are reused; within a group
// a register is only overwritten after an earlier instruction has read it.
.L168: |
|
|
// offsets for bytes 0, 1 and 2 of the row (each zero-extended from 32 bits)
zxt4 r14 = r21 |
|
|
adds r15 = 1, r21 |
|
|
adds r18 = 2, r21 |
|
|
;; |
|
|
add r16 = r33, r14 |
|
|
zxt4 r15 = r15 |
|
|
zxt4 r18 = r18 |
|
|
;; |
|
|
ld1 r17 = [r16] |
|
|
add r14 = r32, r14 |
|
|
add r19 = r33, r15 |
|
|
;; |
|
|
// byte 0 is stored, byte 1 is loaded, and the offset for byte 3 is started
st1 [r14] = r17 |
|
|
add r15 = r32, r15 |
|
|
add r20 = r33, r18 |
|
|
ld1 r16 = [r19] |
|
|
adds r14 = 3, r21 |
|
|
add r18 = r32, r18 |
|
|
;; |
|
|
st1 [r15] = r16 |
|
|
zxt4 r14 = r14 |
|
|
adds r17 = 4, r21 |
|
|
ld1 r15 = [r20] |
|
|
;; |
|
|
add r19 = r33, r14 |
|
|
zxt4 r17 = r17 |
|
|
st1 [r18] = r15 |
|
|
add r14 = r32, r14 |
|
|
;; |
|
|
add r20 = r33, r17 |
|
|
ld1 r15 = [r19] |
|
|
adds r16 = 5, r21 |
|
|
add r17 = r32, r17 |
|
|
;; |
|
|
st1 [r14] = r15 |
|
|
zxt4 r16 = r16 |
|
|
adds r18 = 6, r21 |
|
|
ld1 r14 = [r20] |
|
|
;; |
|
|
add r19 = r33, r16 |
|
|
zxt4 r18 = r18 |
|
|
st1 [r17] = r14 |
|
|
add r16 = r32, r16 |
|
|
;; |
|
|
add r20 = r33, r18 |
|
|
ld1 r14 = [r19] |
|
|
adds r15 = 7, r21 |
|
|
add r18 = r32, r18 |
|
|
;; |
|
|
st1 [r16] = r14 |
|
|
zxt4 r15 = r15 |
|
|
// the offset for byte 7 was already taken from r21 in the previous group, so
// the row offset can be advanced by r34 for the next trip here
add r21 = r21, r34 |
|
|
ld1 r16 = [r20] |
|
|
;; |
|
|
add r17 = r33, r15 |
|
|
st1 [r18] = r16 |
|
|
add r15 = r32, r15 |
|
|
;; |
|
|
ld1 r14 = [r17] |
|
|
;; |
|
|
st1 [r15] = r14 |
|
|
// br.cloop branches back and decrements ar.lc while ar.lc is non-zero
br.cloop.sptk.few .L168 |
|
|
;; |
|
|
// restore the caller's ar.lc before returning
mov ar.lc = r2 |
|
|
br.ret.sptk.many b0 |
|
|
.endp transfer8x8_copy_ia64# |
|
|
.ident "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.1 2.96-85)" |
|