// ------------------------------------------------------------------------------
// *
// * Optimized Assembler Versions of sad8 and sad16
// *
// ------------------------------------------------------------------------------
// *
// * Hannes Jütting and Christopher Özbek
// * {s_juetti,s_oezbek}@ira.uka.de
// *
// * Programmed for the IA64 laboratory held at University Karlsruhe 2002
// * http://www.info.uni-karlsruhe.de/~rubino/ia64p/
// *
// ------------------------------------------------------------------------------
// *
// * These are the optimized assembler versions of sad8 and sad16, which calculate
// * the sum of absolute differences between two 8x8/16x16 block matrices.
// *
// * Our approach uses:
// * - The Itanium command psad1, which solves the problem in hardware.
// * - Modulo-Scheduled Loops as the best way to loop unrolling on the IA64
// *   EPIC architecture
// * - Alignment resolving to avoid memory faults
// *
// ------------------------------------------------------------------------------

// ------------------------------------------------------------------------------
// sad16_ia64
//
// Sum of absolute differences over 16 rows of 16 bytes, implemented as a
// software-pipelined (br.ctop) loop on rotating registers.
//
// In:   r32 = address of the first block  ("Cur" in the load comments)
//       r33 = address of the second block ("Ref" in the load comments)
//       r34 = row stride in bytes (used as the post-increment of every load)
//       r35 = fourth argument; copied to r17 but not referenced afterwards
// Out:  r8  = accumulated sum
// Uses: r9, r14-r27, rotating r32-r79, rotating predicates p16-p23.
//       ar.lc and pr are saved in r20/r21 and restored before returning.
//
// Neither address has to be 8-byte aligned: each row is fetched as three
// aligned 8-byte words per block and the 16 wanted bytes are merged out of
// them with shr.u/shl/or.
// NOTE(review): the third word (offset +16) is loaded even when the address is
// already aligned, i.e. 8 bytes beyond the 16-byte row are always read.
// ------------------------------------------------------------------------------

        .text
        .align 16
        .global sad16_ia64#
        .proc sad16_ia64#
sad16_ia64:

        // Pipeline stage distances, counted in loop rotations between the
        // producer of a value and its consumer.
        _LL=3                                   // load  -> shift
        _SL=1                                   // shift -> or
        _OL=1                                   // or    -> psad1
        _PL=1                                   // psad1 -> add (pair sum)
        _AL=1                                   // add   -> accumulate into r8

        alloc r9=ar.pfs,4,44,0,48               // 4 in, 44 local, 0 out, 48 rotating

        mov r8 = r0                             // result accumulator = 0

        mov r20 = ar.lc                         // save loop counter
        mov r21 = pr                            // save predicate registers

        dep.z r22 = r32, 3, 3                   // r22 = (r32 & 7) * 8: bit offset of block 1
        dep.z r23 = r33, 3, 3                   // r23 = (r33 & 7) * 8: bit offset of block 2

        // The arguments live in r32-r35, which are about to be reused as rotating
        // registers, so copy them to low registers first.
        and r14 = -8, r32                       // block 1 address rounded down to 8 bytes
        and r15 = -8, r33                       // block 2 address rounded down to 8 bytes
        mov r16 = r34                           // stride
        mov r17 = r35
        ;;
        add r18 = 8, r14                        // address of the second aligned word, block 1
        add r19 = 8, r15                        // address of the second aligned word, block 2

        sub r24 = 64, r22                       // complementary left-shift count, block 1
        sub r25 = 64, r23                       // complementary left-shift count, block 2
                                                // (64 when the address is aligned)

        add r26 = 16, r14                       // address of the third aligned word, block 1
        add r27 = 16, r15                       // address of the third aligned word, block 2

        // Loop counters
        mov ar.lc = 15                          // 16 kernel iterations, one per row
        mov ar.ec = _LL + _SL + _OL + _PL + _AL + _AL   // epilogue count = 8 pipeline stages

        // Clear the rotating predicates and set p16 to start the first stage
        mov pr.rot = 1 << 16
        ;;

        // Rotating register "arrays". Arrays declared with only _SL/_OL elements
        // are laid out directly in front of the next array on purpose: after one
        // rotation the value written to X[0] is read back as element 0 of the
        // array that follows X (see the "+1" remarks in the loop).
        .rotr _ald1[_LL+1], _ald2[_LL+1], _ald3[_LL+1], _ald4[_LL+1], _ald5[_LL+1], _ald6[_LL+1], _shru1[_SL+1], _shl1[_SL+1], _shru2[_SL], _shl2[_SL], _shru3[_SL], _shl3[_SL], _shru4[_SL], _shl4[_SL+1], _or1[_OL], _or2[_OL], _or3[_OL], _or4[_OL+1], _psadr1[_PL+1], _psadr2[_PL+1], _addr1[_AL+1]
        .rotp _aldp[_LL], _shp[_SL], _orp[_OL], _psadrp[_PL], _addrp1[_AL], _addrp2[_AL]

        // The whole loop body is a single instruction group (the only stop is after
        // the branch), so every read sees the values from before this rotation.
.L_loop_16:
        {.mmi
        (_aldp[0]) ld8 _ald1[0] = [r14], r16            // block 1, first 8 bytes
        (_aldp[0]) ld8 _ald2[0] = [r18], r16            // block 1, second 8 bytes
        (_psadrp[0]) psad1 _psadr1[0] = _or2[0], _or4[0]        // SAD of the values written to _or1 and _or3 one rotation ago
        }
        {.mmi
        (_aldp[0]) ld8 _ald3[0] = [r26], r16            // block 1, third 8 bytes
        (_aldp[0]) ld8 _ald4[0] = [r15], r16            // block 2, first 8 bytes
        (_psadrp[0]) psad1 _psadr2[0] = _or3[0], _or4[_OL]      // _or3[0] is "_or2 + 1"
        }
        {.mmi
        (_aldp[0]) ld8 _ald5[0] = [r19], r16            // block 2, second 8 bytes
        (_aldp[0]) ld8 _ald6[0] = [r27], r16            // block 2, third 8 bytes
        (_shp[0]) shr.u _shru1[0] = _ald1[_LL], r22
        }
        {.mii
        (_orp[0]) or _or1[0] = _shl2[0], _shru3[0]      // "_shru2 + 1" and "_shl2 + 1"
        (_shp[0]) shl _shl1[0] = _ald2[_LL], r24
        (_shp[0]) shr.u _shru2[0] = _ald2[_LL], r22
        }
        {.mii
        (_orp[0]) or _or2[0] = _shl3[0], _shru4[0]      // "_shru3 + 1" and "_shl3 + 1"
        (_shp[0]) shl _shl2[0] = _ald3[_LL], r24
        (_shp[0]) shr.u _shru3[0] = _ald4[_LL], r23
        }
        {.mii
        (_orp[0]) or _or3[0] = _shl4[0], _shl4[_SL]     // "_shru4 + 1" and "_shl4 + 1"
        (_shp[0]) shl _shl3[0] = _ald5[_LL], r25
        (_shp[0]) shr.u _shru4[0] = _ald5[_LL], r23
        }
        {.mmi
        (_orp[0]) or _or4[0] = _shru1[_SL], _shl1[_SL]
        (_shp[0]) shl _shl4[0] = _ald6[_LL], r25
        }
        {.mmb
        (_addrp1[0]) add _addr1[0] = _psadr1[_PL], _psadr2[_PL] // add the two partial SADs of a row
        (_addrp2[0]) add r8 = r8, _addr1[_AL]                   // accumulate into the result
        br.ctop.sptk.few .L_loop_16
        ;;
        }

        // Restore the saved registers
        mov ar.lc = r20
        mov pr = r21,-1
        br.ret.sptk.many rp
        .endp sad16_ia64#
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------------------
// sad8_ia64
//
// Sum of absolute differences over 8 rows of 8 bytes, implemented as a
// software-pipelined (br.ctop) loop on rotating registers.
//
// In:   r32 = address of the first block  ("Cur" in the load comments)
//       r33 = address of the second block ("Ref" in the load comments)
//       r34 = row stride in bytes (used as the post-increment of every load)
// Out:  r8  = accumulated sum
// Uses: r9, r14-r16, r18-r25, rotating r32-r63, rotating predicates p16-p22.
//       ar.lc and pr are saved in r20/r21 and restored before returning.
//
// Neither address has to be 8-byte aligned: each row is fetched as two aligned
// 8-byte words per block and the 8 wanted bytes are merged with shr.u/shl/or.
// ------------------------------------------------------------------------------

        .align 16
        .global sad8_ia64#
        .proc sad8_ia64#

sad8_ia64:

        // Pipeline stage distances, counted in loop rotations between the
        // producer of a value and its consumer.
        LL=3                                    // load  -> shift
        SL=1                                    // shift -> or
        OL=1                                    // or    -> psad1
        PL=1                                    // psad1 -> accumulate
        AL=1                                    // accumulate stage

        alloc r9=ar.pfs,3,29,0,32               // 3 in, 29 local, 0 out, 32 rotating
        mov r20 = ar.lc                         // save loop counter
        mov r21 = pr                            // save predicate registers

        dep.z r22 = r32, 3, 3                   // r22 = (r32 & 7) * 8: bit offset of block 1
        dep.z r23 = r33, 3, 3                   // r23 = (r33 & 7) * 8: bit offset of block 2

        mov r8 = r0                             // result accumulator = 0
        and r14 = -8, r32                       // r32 & 0xFFFFFFFFFFFFFFF8
        and r15 = -8, r33                       // r33 & 0xFFFFFFFFFFFFFFF8
        mov r16 = r34                           // stride
        // mov r17 = r35
        ;;

        add r18 = 8, r14                        // address of the second aligned word, block 1
        add r19 = 8, r15                        // address of the second aligned word, block 2

        sub r24 = 64, r22                       // complementary left-shift count, block 1
        sub r25 = 64, r23                       // complementary left-shift count, block 2

        // Loop counters
        mov ar.lc = 7                           // 8 kernel iterations, one per row
        mov ar.ec = LL + SL + OL + PL + AL      // epilogue count = 7 pipeline stages

        // Clear the rotating predicates and set p16 to start the first stage
        mov pr.rot = 1 << 16
        ;;
        .rotr ald1[LL+1], ald2[LL+1], ald3[LL+1], ald4[LL+1], shru1[SL+1], shl1[SL+1], shru2[SL+1], shl2[SL+1], or1[OL+1], or2[OL+1], psadr[PL+1], addr[AL+1]
        .rotp aldp[LL], shp[SL], orp[OL], psadrp[PL], addrp[AL]

        // The whole loop body is a single instruction group (the only stop is after
        // the branch), so every read sees the values from before this rotation.
.L_loop_8:
        {.mmi
        (aldp[0]) ld8 ald1[0] = [r14], r16              // load block 1 (two aligned words)
        (aldp[0]) ld8 ald2[0] = [r18], r16
        (shp[0]) shr.u shru1[0] = ald1[LL], r22         // merge block 1: low part
        }
        {.mii
        (orp[0]) or or1[0] = shru1[SL], shl1[SL]        // or1 = realigned 8 bytes of block 1
        (shp[0]) shl shl1[0] = ald2[LL], r24            // merge block 1: high part
        (shp[0]) shr.u shru2[0] = ald3[LL], r23         // merge block 2: low part
        }
        {.mmi
        (aldp[0]) ld8 ald3[0] = [r15], r16              // load block 2 (two aligned words)
        (aldp[0]) ld8 ald4[0] = [r19], r16
        (shp[0]) shl shl2[0] = ald4[LL], r25            // merge block 2: high part
        }
        {.mmi
        (orp[0]) or or2[0] = shru2[SL], shl2[SL]        // or2 = realigned 8 bytes of block 2
        (addrp[0]) add r8 = r8, psadr[PL]               // accumulate the row SAD
        (psadrp[0]) psad1 psadr[0] = or1[OL], or2[OL]   // SAD of the two realigned rows
        }
        {.mbb
        br.ctop.sptk.few .L_loop_8
        ;;
        }

        // Restore the saved registers
        mov ar.lc = r20
        mov pr = r21,-1
        br.ret.sptk.many b0
        .endp sad8_ia64#
|
27 |
|
|
28 |
|
|
29 |
.common sad16bi#,8,8 |
.common sad16bi#,8,8 |
124 |
.endp sad16bi_ia64# |
.endp sad16bi_ia64# |
125 |
|
|
126 |
|
|
127 |
.common dev16#,8,8 |
|
128 |
|
|
129 |
|
|
130 |
|
|
131 |
|
|
132 |
|
|
133 |
|
.text |
134 |
.align 16 |
.align 16 |
135 |
.global dev16_ia64# |
.global dev16_ia64# |
136 |
.proc dev16_ia64# |
.proc dev16_ia64# |
137 |
|
.auto |
138 |
dev16_ia64: |
dev16_ia64: |
139 |
.prologue |
// renamings for better readability |
140 |
zxt4 r33 = r33 |
stride = r18 |
141 |
.save ar.lc, r2 |
pfs = r19 //for saving previous function state |
142 |
mov r2 = ar.lc |
cura0 = r20 //address of first 8-byte block of cur |
143 |
|
cura1 = r21 //address of second 8-byte block of cur |
144 |
|
mean0 = r22 //registers for calculating the sum in parallel |
145 |
|
mean1 = r23 |
146 |
|
mean2 = r24 |
147 |
|
mean3 = r25 |
148 |
|
dev0 = r26 //same for the deviation |
149 |
|
dev1 = r27 |
150 |
|
dev2 = r28 |
151 |
|
dev3 = r29 |
152 |
|
|
153 |
.body |
.body |
154 |
mov r21 = r0 |
alloc pfs = ar.pfs, 2, 38, 0, 40 |
155 |
mov r8 = r0 |
|
156 |
mov r23 = r32 |
mov cura0 = in0 |
157 |
mov r24 = r0 |
mov stride = in1 |
158 |
|
add cura1 = 8, cura0 |
159 |
|
|
160 |
|
.rotr c[32], psad[8] // just using rotating registers to get an array ;-) |
161 |
|
|
162 |
|
.explicit |
163 |
|
{.mmi |
164 |
|
ld8 c[0] = [cura0], stride // load them ... |
165 |
|
ld8 c[1] = [cura1], stride |
166 |
;; |
;; |
167 |
mov r25 = r33 |
} |
168 |
.L50: |
{.mmi |
169 |
mov r22 = r0 |
ld8 c[2] = [cura0], stride |
170 |
mov r20 = r23 |
ld8 c[3] = [cura1], stride |
|
;; |
|
|
.L54: |
|
|
mov r16 = r20 |
|
|
adds r14 = 2, r20 |
|
|
adds r15 = 3, r20 |
|
|
;; |
|
|
ld1 r17 = [r16], 1 |
|
|
ld1 r18 = [r14] |
|
|
ld1 r19 = [r15] |
|
|
;; |
|
|
ld1 r14 = [r16] |
|
|
add r21 = r17, r21 |
|
|
adds r15 = 4, r20 |
|
|
;; |
|
|
add r21 = r14, r21 |
|
|
ld1 r16 = [r15] |
|
|
adds r22 = 8, r22 |
|
|
;; |
|
|
add r21 = r18, r21 |
|
|
adds r14 = 5, r20 |
|
|
adds r15 = 6, r20 |
|
|
;; |
|
|
add r21 = r19, r21 |
|
|
ld1 r17 = [r14] |
|
|
ld1 r18 = [r15] |
|
|
;; |
|
|
add r21 = r16, r21 |
|
|
adds r14 = 7, r20 |
|
|
cmp4.geu p6, p7 = 15, r22 |
|
|
;; |
|
|
add r21 = r17, r21 |
|
|
ld1 r15 = [r14] |
|
|
adds r20 = 8, r20 |
|
|
;; |
|
|
add r21 = r18, r21 |
|
|
;; |
|
|
add r21 = r15, r21 |
|
|
(p6) br.cond.dptk .L54 |
|
|
adds r24 = 1, r24 |
|
|
add r23 = r23, r25 |
|
|
;; |
|
|
cmp4.geu p6, p7 = 15, r24 |
|
|
(p6) br.cond.dptk .L50 |
|
|
extr.u r14 = r21, 8, 24 |
|
|
mov r23 = r32 |
|
|
mov r24 = r0 |
|
|
;; |
|
|
mov r21 = r14 |
|
|
.L60: |
|
|
addl r14 = 3, r0 |
|
|
mov r17 = r23 |
|
171 |
;; |
;; |
172 |
mov ar.lc = r14 |
} |
173 |
|
{.mmi |
174 |
|
ld8 c[4] = [cura0], stride |
175 |
|
ld8 c[5] = [cura1], stride |
176 |
;; |
;; |
177 |
.L144: |
} |
178 |
mov r16 = r17 |
{.mmi |
179 |
|
ld8 c[6] = [cura0], stride |
180 |
|
ld8 c[7] = [cura1], stride |
181 |
;; |
;; |
182 |
ld1 r14 = [r16], 1 |
} |
183 |
|
{.mmi |
184 |
|
ld8 c[8] = [cura0], stride |
185 |
|
ld8 c[9] = [cura1], stride |
186 |
;; |
;; |
187 |
sub r15 = r14, r21 |
} |
188 |
|
{.mmi |
189 |
|
ld8 c[10] = [cura0], stride |
190 |
|
ld8 c[11] = [cura1], stride |
191 |
;; |
;; |
192 |
cmp4.ge p6, p7 = 0, r15 |
} |
193 |
|
{.mii |
194 |
|
ld8 c[12] = [cura0], stride |
195 |
|
psad1 mean0 = c[0], r0 // get the sum of them ... |
196 |
|
psad1 mean1 = c[1], r0 |
197 |
|
} |
198 |
|
{.mmi |
199 |
|
ld8 c[13] = [cura1], stride |
200 |
;; |
;; |
201 |
(p7) add r8 = r8, r15 |
ld8 c[14] = [cura0], stride |
202 |
(p6) sub r14 = r21, r14 |
psad1 mean2 = c[2], r0 |
203 |
|
} |
204 |
|
{.mii |
205 |
|
ld8 c[15] = [cura1], stride |
206 |
|
psad1 mean3 = c[3], r0 |
207 |
;; |
;; |
208 |
(p6) add r8 = r8, r14 |
psad1 psad[0] = c[4], r0 |
209 |
ld1 r14 = [r16] |
} |
210 |
|
{.mmi |
211 |
|
ld8 c[16] = [cura0], stride |
212 |
|
ld8 c[17] = [cura1], stride |
213 |
|
psad1 psad[1] = c[5], r0 |
214 |
;; |
;; |
215 |
sub r15 = r14, r21 |
} |
216 |
adds r16 = 2, r17 |
{.mii |
217 |
|
ld8 c[18] = [cura0], stride |
218 |
|
psad1 psad[2] = c[6], r0 |
219 |
|
psad1 psad[3] = c[7], r0 |
220 |
|
} |
221 |
|
{.mmi |
222 |
|
ld8 c[19] = [cura1], stride |
223 |
;; |
;; |
224 |
cmp4.ge p6, p7 = 0, r15 |
ld8 c[20] = [cura0], stride |
225 |
|
psad1 psad[4] = c[8], r0 |
226 |
|
} |
227 |
|
{.mii |
228 |
|
ld8 c[21] = [cura1], stride |
229 |
|
psad1 psad[5] = c[9], r0 |
230 |
;; |
;; |
231 |
(p7) add r8 = r8, r15 |
add mean0 = mean0, psad[0] |
232 |
(p6) sub r14 = r21, r14 |
} |
233 |
|
{.mmi |
234 |
|
ld8 c[22] = [cura0], stride |
235 |
|
ld8 c[23] = [cura1], stride |
236 |
|
add mean1 = mean1, psad[1] |
237 |
;; |
;; |
238 |
(p6) add r8 = r8, r14 |
} |
239 |
ld1 r14 = [r16] |
{.mii |
240 |
|
ld8 c[24] = [cura0], stride |
241 |
|
psad1 psad[0] = c[10], r0 |
242 |
|
psad1 psad[1] = c[11], r0 |
243 |
|
} |
244 |
|
{.mmi |
245 |
|
ld8 c[25] = [cura1], stride |
246 |
;; |
;; |
247 |
sub r15 = r14, r21 |
ld8 c[26] = [cura0], stride |
248 |
adds r16 = 3, r17 |
add mean2 = mean2, psad[2] |
249 |
|
} |
250 |
|
{.mii |
251 |
|
ld8 c[27] = [cura1], stride |
252 |
|
add mean3 = mean3, psad[3] |
253 |
;; |
;; |
254 |
cmp4.ge p6, p7 = 0, r15 |
psad1 psad[2] = c[12], r0 |
255 |
adds r17 = 4, r17 |
} |
256 |
|
{.mmi |
257 |
|
ld8 c[28] = [cura0], stride |
258 |
|
ld8 c[29] = [cura1], stride |
259 |
|
psad1 psad[3] = c[13], r0 |
260 |
;; |
;; |
261 |
(p7) add r8 = r8, r15 |
} |
262 |
(p6) sub r14 = r21, r14 |
{.mii |
263 |
|
ld8 c[30] = [cura0] |
264 |
|
psad1 psad[6] = c[14], r0 |
265 |
|
psad1 psad[7] = c[15], r0 |
266 |
|
} |
267 |
|
{.mmi |
268 |
|
ld8 c[31] = [cura1] |
269 |
;; |
;; |
270 |
(p6) add r8 = r8, r14 |
add mean0 = mean0, psad[0] |
271 |
ld1 r14 = [r16] |
add mean1 = mean1, psad[1] |
272 |
|
} |
273 |
|
{.mii |
274 |
|
add mean2 = mean2, psad[4] |
275 |
|
add mean3 = mean3, psad[5] |
276 |
;; |
;; |
277 |
sub r15 = r14, r21 |
psad1 psad[0] = c[16], r0 |
278 |
|
} |
279 |
|
{.mmi |
280 |
|
add mean0 = mean0, psad[2] |
281 |
|
add mean1 = mean1, psad[3] |
282 |
|
psad1 psad[1] = c[17], r0 |
283 |
;; |
;; |
284 |
cmp4.ge p6, p7 = 0, r15 |
} |
285 |
|
{.mii |
286 |
|
add mean2 = mean2, psad[6] |
287 |
|
psad1 psad[2] = c[18], r0 |
288 |
|
psad1 psad[3] = c[19], r0 |
289 |
|
} |
290 |
|
{.mmi |
291 |
|
add mean3 = mean3, psad[7] |
292 |
;; |
;; |
293 |
(p7) add r8 = r8, r15 |
add mean0 = mean0, psad[0] |
294 |
(p6) sub r14 = r21, r14 |
psad1 psad[4] = c[20], r0 |
295 |
|
} |
296 |
|
{.mii |
297 |
|
add mean1 = mean1, psad[1] |
298 |
|
psad1 psad[5] = c[21], r0 |
299 |
;; |
;; |
300 |
(p6) add r8 = r8, r14 |
psad1 psad[6] = c[22], r0 |
301 |
br.cloop.sptk.few .L144 |
} |
302 |
adds r24 = 1, r24 |
{.mmi |
303 |
add r23 = r23, r33 |
add mean2 = mean2, psad[2] |
304 |
|
add mean3 = mean3, psad[3] |
305 |
|
psad1 psad[7] = c[23], r0 |
306 |
;; |
;; |
307 |
cmp4.geu p6, p7 = 15, r24 |
} |
308 |
(p6) br.cond.dptk .L60 |
{.mii |
309 |
mov ar.lc = r2 |
add mean0 = mean0, psad[4] |
310 |
|
psad1 psad[0] = c[24], r0 |
311 |
|
psad1 psad[1] = c[25], r0 |
312 |
|
} |
313 |
|
{.mmi |
314 |
|
add mean1 = mean1, psad[5] |
315 |
|
;; |
316 |
|
add mean2 = mean2, psad[6] |
317 |
|
psad1 psad[2] = c[26], r0 |
318 |
|
} |
319 |
|
{.mii |
320 |
|
add mean3 = mean3, psad[7] |
321 |
|
psad1 psad[3] = c[27], r0 |
322 |
|
;; |
323 |
|
psad1 psad[4] = c[28], r0 |
324 |
|
} |
325 |
|
{.mmi |
326 |
|
add mean0 = mean0, psad[0] |
327 |
|
add mean1 = mean1, psad[1] |
328 |
|
psad1 psad[5] = c[29], r0 |
329 |
|
;; |
330 |
|
} |
331 |
|
{.mii |
332 |
|
add mean2 = mean2, psad[2] |
333 |
|
psad1 psad[6] = c[30], r0 |
334 |
|
psad1 psad[7] = c[31], r0 |
335 |
|
} |
336 |
|
{.mmi |
337 |
|
add mean3 = mean3, psad[3] |
338 |
|
;; |
339 |
|
add mean0 = mean0, psad[4] |
340 |
|
add mean1 = mean1, psad[5] |
341 |
|
} |
342 |
|
{.mbb |
343 |
|
add mean2 = mean2, mean3 |
344 |
|
nop.b 1 |
345 |
|
nop.b 1 |
346 |
|
;; |
347 |
|
} |
348 |
|
{.mib |
349 |
|
add mean0 = mean0, psad[6] |
350 |
|
add mean1 = mean1, psad[7] |
351 |
|
nop.b 1 |
352 |
|
;; |
353 |
|
} |
354 |
|
{.mib |
355 |
|
add mean0 = mean0, mean1 |
356 |
|
// add mean2 = 127, mean2 // this could make our division more exactly, but does not help much |
357 |
|
;; |
358 |
|
} |
359 |
|
{.mib |
360 |
|
add mean0 = mean0, mean2 |
361 |
|
;; |
362 |
|
} |
363 |
|
|
364 |
|
{.mib |
365 |
|
shr.u mean0 = mean0, 8 // divide them ... |
366 |
|
;; |
367 |
|
} |
368 |
|
{.mib |
369 |
|
mux1 mean0 = mean0, @brcst |
370 |
|
;; |
371 |
|
} |
372 |
|
{.mii |
373 |
|
nop.m 0 |
374 |
|
psad1 dev0 = c[0], mean0 // and do a sad again ... |
375 |
|
psad1 dev1 = c[1], mean0 |
376 |
|
} |
377 |
|
{.mii |
378 |
|
nop.m 0 |
379 |
|
psad1 dev2 = c[2], mean0 |
380 |
|
psad1 dev3 = c[3], mean0 |
381 |
|
} |
382 |
|
{.mii |
383 |
|
nop.m 0 |
384 |
|
psad1 psad[0] = c[4], mean0 |
385 |
|
psad1 psad[1] = c[5], mean0 |
386 |
|
} |
387 |
|
{.mii |
388 |
|
nop.m 0 |
389 |
|
psad1 psad[2] = c[6], mean0 |
390 |
|
psad1 psad[3] = c[7], mean0 |
391 |
|
} |
392 |
|
{.mii |
393 |
|
nop.m 0 |
394 |
|
psad1 psad[4] = c[8], mean0 |
395 |
|
psad1 psad[5] = c[9], mean0 |
396 |
|
;; |
397 |
|
} |
398 |
|
{.mii |
399 |
|
add dev0 = dev0, psad[0] |
400 |
|
psad1 psad[6] = c[10], mean0 |
401 |
|
psad1 psad[7] = c[11], mean0 |
402 |
|
} |
403 |
|
{.mmi |
404 |
|
add dev1 = dev1, psad[1] |
405 |
|
|
406 |
|
add dev2 = dev2, psad[2] |
407 |
|
psad1 psad[0] = c[12], mean0 |
408 |
|
} |
409 |
|
{.mii |
410 |
|
add dev3 = dev3, psad[3] |
411 |
|
psad1 psad[1] = c[13], mean0 |
412 |
|
;; |
413 |
|
psad1 psad[2] = c[14], mean0 |
414 |
|
} |
415 |
|
{.mmi |
416 |
|
add dev0 = dev0, psad[4] |
417 |
|
add dev1 = dev1, psad[5] |
418 |
|
psad1 psad[3] = c[15], mean0 |
419 |
|
} |
420 |
|
{.mii |
421 |
|
add dev2 = dev2, psad[6] |
422 |
|
psad1 psad[4] = c[16], mean0 |
423 |
|
psad1 psad[5] = c[17], mean0 |
424 |
|
} |
425 |
|
{.mmi |
426 |
|
add dev3 = dev3, psad[7] |
427 |
|
;; |
428 |
|
add dev0 = dev0, psad[0] |
429 |
|
psad1 psad[6] = c[18], mean0 |
430 |
|
} |
431 |
|
{.mii |
432 |
|
add dev1 = dev1, psad[1] |
433 |
|
psad1 psad[7] = c[19], mean0 |
434 |
|
|
435 |
|
psad1 psad[0] = c[20], mean0 |
436 |
|
} |
437 |
|
{.mmi |
438 |
|
add dev2 = dev2, psad[2] |
439 |
|
add dev3 = dev3, psad[3] |
440 |
|
psad1 psad[1] = c[21], mean0 |
441 |
|
;; |
442 |
|
} |
443 |
|
{.mii |
444 |
|
add dev0 = dev0, psad[4] |
445 |
|
psad1 psad[2] = c[22], mean0 |
446 |
|
psad1 psad[3] = c[23], mean0 |
447 |
|
} |
448 |
|
{.mmi |
449 |
|
add dev1 = dev1, psad[5] |
450 |
|
|
451 |
|
add dev2 = dev2, psad[6] |
452 |
|
psad1 psad[4] = c[24], mean0 |
453 |
|
} |
454 |
|
{.mii |
455 |
|
add dev3 = dev3, psad[7] |
456 |
|
psad1 psad[5] = c[25], mean0 |
457 |
|
;; |
458 |
|
psad1 psad[6] = c[26], mean0 |
459 |
|
} |
460 |
|
{.mmi |
461 |
|
add dev0 = dev0, psad[0] |
462 |
|
add dev1 = dev1, psad[1] |
463 |
|
psad1 psad[7] = c[27], mean0 |
464 |
|
} |
465 |
|
{.mii |
466 |
|
add dev2 = dev2, psad[2] |
467 |
|
psad1 psad[0] = c[28], mean0 |
468 |
|
psad1 psad[1] = c[29], mean0 |
469 |
|
} |
470 |
|
{.mmi |
471 |
|
add dev3 = dev3, psad[3] |
472 |
|
;; |
473 |
|
add dev0 = dev0, psad[4] |
474 |
|
psad1 psad[2] = c[30], mean0 |
475 |
|
} |
476 |
|
{.mii |
477 |
|
add dev1 = dev1, psad[5] |
478 |
|
psad1 psad[3] = c[31], mean0 |
479 |
|
;; |
480 |
|
add dev2 = dev2, psad[6] |
481 |
|
} |
482 |
|
{.mmi |
483 |
|
add dev3 = dev3, psad[7] |
484 |
|
add dev0 = dev0, psad[0] |
485 |
|
add dev1 = dev1, psad[1] |
486 |
|
;; |
487 |
|
} |
488 |
|
{.mii |
489 |
|
add dev2 = dev2, psad[2] |
490 |
|
add dev3 = dev3, psad[3] |
491 |
|
add ret0 = dev0, dev1 |
492 |
|
;; |
493 |
|
} |
494 |
|
{.mib |
495 |
|
add dev2 = dev2, dev3 |
496 |
|
nop.i 1 |
497 |
|
nop.b 1 |
498 |
|
;; |
499 |
|
} |
500 |
|
{.mib |
501 |
|
add ret0 = ret0, dev2 |
502 |
|
nop.i 1 |
503 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
504 |
|
} |
505 |
.endp dev16_ia64# |
.endp dev16_ia64# |
506 |
|
|
507 |
|
|
508 |
|
// ########################################################### |
509 |
|
// ########################################################### |
510 |
|
// New version by group 01 ###################################
511 |
|
// ########################################################### |
512 |
|
// ########################################################### |
513 |
|
|
514 |
|
|
515 |
|
|
516 |
|
.text |
517 |
|
.align 16 |
518 |
|
.global sad16_ia64# |
519 |
|
.proc sad16_ia64# |
520 |
|
sad16_ia64: |
521 |
|
alloc r1 = ar.pfs, 4, 76, 0, 0 |
522 |
|
mov r2 = pr |
523 |
|
dep r14 = r0, r33, 0, 3 // r14 = (r33 div 8)*8 (aligned version of ref) |
524 |
|
dep.z r31 = r33, 0, 3 // r31 = r33 mod 8 (misalignment of ref) |
525 |
|
;; |
526 |
|
mov r64 = r34 //(1) calculate multiples of stride |
527 |
|
shl r65 = r34, 1 //(2) for being able to load all the |
528 |
|
shladd r66 = r34, 1, r34 //(3) data at once |
529 |
|
shl r67 = r34, 2 //(4) |
530 |
|
shladd r68 = r34, 2, r34 //(5) |
531 |
|
shl r71 = r34, 3 //(8) |
532 |
|
shladd r72 = r34, 3, r34 //(9) |
533 |
|
;; |
534 |
|
shl r69 = r66, 1 //(6) |
535 |
|
shladd r70 = r66, 1, r34 //(7) |
536 |
|
shl r73 = r68, 1 //(10) |
537 |
|
shladd r74 = r68, 1, r34 //(11) |
538 |
|
shl r75 = r66, 2 //(12) |
539 |
|
shladd r76 = r66, 2, r34 //(13) |
540 |
|
shladd r77 = r66, 2, r65 //(14) |
541 |
|
shladd r78 = r66, 2, r66 //(15) |
542 |
|
;; |
543 |
|
cmp.eq p16, p17 = 0, r31 // prepare predicates according to the misalignment |
544 |
|
cmp.eq p18, p19 = 2, r31 // ref |
545 |
|
cmp.eq p20, p21 = 4, r31 |
546 |
|
cmp.eq p22, p23 = 6, r31 |
547 |
|
cmp.eq p24, p25 = 1, r31 |
548 |
|
cmp.eq p26, p27 = 3, r31 |
549 |
|
cmp.eq p28, p29 = 5, r31 |
550 |
|
mov r96 = r14 // and calculate all the adresses where we have |
551 |
|
mov r33 = r32 // to load from |
552 |
|
add r97 = r14, r64 |
553 |
|
add r35 = r32, r64 |
554 |
|
add r98 = r14, r65 |
555 |
|
add r37 = r32, r65 |
556 |
|
add r99 = r14, r66 |
557 |
|
add r39 = r32, r66 |
558 |
|
add r100 = r14, r67 |
559 |
|
add r41 = r32, r67 |
560 |
|
add r101 = r14, r68 |
561 |
|
add r43 = r32, r68 |
562 |
|
add r102 = r14, r69 |
563 |
|
add r45 = r32, r69 |
564 |
|
add r103 = r14, r70 |
565 |
|
add r47 = r32, r70 |
566 |
|
add r104 = r14, r71 |
567 |
|
add r49 = r32, r71 |
568 |
|
add r105 = r14, r72 |
569 |
|
add r51 = r32, r72 |
570 |
|
add r106 = r14, r73 |
571 |
|
add r53 = r32, r73 |
572 |
|
add r107 = r14, r74 |
573 |
|
add r55 = r32, r74 |
574 |
|
add r108 = r14, r75 |
575 |
|
add r57 = r32, r75 |
576 |
|
add r109 = r14, r76 |
577 |
|
add r59 = r32, r76 |
578 |
|
add r110 = r14, r77 |
579 |
|
add r61 = r32, r77 |
580 |
|
add r111 = r14, r78 |
581 |
|
add r63 = r32, r78 |
582 |
|
;; |
583 |
|
ld8 r32 = [r33], 8 // Load all the data which is needed for the sad |
584 |
|
ld8 r34 = [r35], 8 // in the registers. the goal is to have the array |
585 |
|
ld8 r36 = [r37], 8 // adressed by cur in the registers r32 - r63 and |
586 |
|
ld8 r38 = [r39], 8 // the aray adressed by ref in the registers |
587 |
|
ld8 r40 = [r41], 8 // r64 - r95. The registers r96 - r111 are needed |
588 |
|
ld8 r42 = [r43], 8 // to load the aligned 24 bits in which the |
589 |
|
ld8 r44 = [r45], 8 // needed misaligned 16 bits must be. |
590 |
|
ld8 r46 = [r47], 8 // After loading we start a preprocessing which |
591 |
|
ld8 r48 = [r49], 8 // guarantees that the data adressed by ref is in |
592 |
|
ld8 r50 = [r51], 8 // the registers r64 - r95. |
593 |
|
ld8 r52 = [r53], 8 |
594 |
|
ld8 r54 = [r55], 8 |
595 |
|
ld8 r56 = [r57], 8 |
596 |
|
ld8 r58 = [r59], 8 |
597 |
|
ld8 r60 = [r61], 8 |
598 |
|
ld8 r62 = [r63], 8 |
599 |
|
ld8 r64 = [r96], 8 |
600 |
|
ld8 r66 = [r97], 8 |
601 |
|
ld8 r68 = [r98], 8 |
602 |
|
ld8 r70 = [r99], 8 |
603 |
|
ld8 r72 = [r100], 8 |
604 |
|
ld8 r74 = [r101], 8 |
605 |
|
ld8 r76 = [r102], 8 |
606 |
|
ld8 r78 = [r103], 8 |
607 |
|
ld8 r80 = [r104], 8 |
608 |
|
ld8 r82 = [r105], 8 |
609 |
|
ld8 r84 = [r106], 8 |
610 |
|
ld8 r86 = [r107], 8 |
611 |
|
ld8 r88 = [r108], 8 |
612 |
|
ld8 r90 = [r109], 8 |
613 |
|
ld8 r92 = [r110], 8 |
614 |
|
ld8 r94 = [r111], 8 |
615 |
|
;; |
616 |
|
ld8 r33 = [r33] |
617 |
|
ld8 r35 = [r35] |
618 |
|
ld8 r37 = [r37] |
619 |
|
ld8 r39 = [r39] |
620 |
|
ld8 r41 = [r41] |
621 |
|
ld8 r43 = [r43] |
622 |
|
ld8 r45 = [r45] |
623 |
|
ld8 r47 = [r47] |
624 |
|
ld8 r49 = [r49] |
625 |
|
ld8 r51 = [r51] |
626 |
|
ld8 r53 = [r53] |
627 |
|
ld8 r55 = [r55] |
628 |
|
ld8 r57 = [r57] |
629 |
|
ld8 r59 = [r59] |
630 |
|
ld8 r61 = [r61] |
631 |
|
ld8 r63 = [r63] |
632 |
|
ld8 r65 = [r96], 8 |
633 |
|
ld8 r67 = [r97], 8 |
634 |
|
ld8 r69 = [r98], 8 |
635 |
|
ld8 r71 = [r99], 8 |
636 |
|
ld8 r73 = [r100], 8 |
637 |
|
ld8 r75 = [r101], 8 |
638 |
|
ld8 r77 = [r102], 8 |
639 |
|
ld8 r79 = [r103], 8 |
640 |
|
ld8 r81 = [r104], 8 |
641 |
|
ld8 r83 = [r105], 8 |
642 |
|
ld8 r85 = [r106], 8 |
643 |
|
ld8 r87 = [r107], 8 |
644 |
|
ld8 r89 = [r108], 8 |
645 |
|
ld8 r91 = [r109], 8 |
646 |
|
ld8 r93 = [r110], 8 |
647 |
|
ld8 r95 = [r111], 8 |
648 |
|
(p16) br.cond.dptk.many .Lber // If ref is aligned, everything is loaded and we can start the calculation |
649 |
|
;; |
650 |
|
ld8 r96 = [r96] // If not, we have to load a bit more |
651 |
|
ld8 r97 = [r97] |
652 |
|
ld8 r98 = [r98] |
653 |
|
ld8 r99 = [r99] |
654 |
|
ld8 r100 = [r100] |
655 |
|
ld8 r101 = [r101] |
656 |
|
ld8 r102 = [r102] |
657 |
|
ld8 r103 = [r103] |
658 |
|
ld8 r104 = [r104] |
659 |
|
ld8 r105 = [r105] |
660 |
|
ld8 r106 = [r106] |
661 |
|
ld8 r107 = [r107] |
662 |
|
ld8 r108 = [r108] |
663 |
|
ld8 r109 = [r109] |
664 |
|
ld8 r110 = [r110] |
665 |
|
ld8 r111 = [r111] |
666 |
|
(p24) br.cond.dptk.many .Lmod1 // according to the misalignment, we have |
667 |
|
(p18) br.cond.dpnt.many .Lmod2 // to jump to different preprocessing routines |
668 |
|
(p26) br.cond.dpnt.many .Lmod3 |
669 |
|
(p20) br.cond.dpnt.many .Lmod4 |
670 |
|
(p28) br.cond.dpnt.many .Lmod5 |
671 |
|
(p22) br.cond.dpnt.many .Lmod6 |
672 |
|
;; |
673 |
|
.Lmod7: // this jump point is not needed |
674 |
|
shrp r64 = r65, r64, 56 // in these blocks, we do the preprocessing |
675 |
|
shrp r65 = r96, r65, 56 |
676 |
|
shrp r66 = r67, r66, 56 |
677 |
|
shrp r67 = r97, r67, 56 |
678 |
|
shrp r68 = r69, r68, 56 |
679 |
|
shrp r69 = r98, r69, 56 |
680 |
|
shrp r70 = r71, r70, 56 |
681 |
|
shrp r71 = r99, r71, 56 |
682 |
|
shrp r72 = r73, r72, 56 |
683 |
|
shrp r73 = r100, r73, 56 |
684 |
|
shrp r74 = r75, r74, 56 |
685 |
|
shrp r75 = r101, r75, 56 |
686 |
|
shrp r76 = r77, r76, 56 |
687 |
|
shrp r77 = r102, r77, 56 |
688 |
|
shrp r78 = r79, r78, 56 |
689 |
|
shrp r79 = r103, r79, 56 |
690 |
|
shrp r80 = r81, r80, 56 |
691 |
|
shrp r81 = r104, r81, 56 |
692 |
|
shrp r82 = r83, r82, 56 |
693 |
|
shrp r83 = r105, r83, 56 |
694 |
|
shrp r84 = r85, r84, 56 |
695 |
|
shrp r85 = r106, r85, 56 |
696 |
|
shrp r86 = r87, r86, 56 |
697 |
|
shrp r87 = r107, r87, 56 |
698 |
|
shrp r88 = r89, r88, 56 |
699 |
|
shrp r89 = r108, r89, 56 |
700 |
|
shrp r90 = r91, r90, 56 |
701 |
|
shrp r91 = r109, r91, 56 |
702 |
|
shrp r92 = r93, r92, 56 |
703 |
|
shrp r93 = r110, r93, 56 |
704 |
|
shrp r94 = r95, r94, 56 |
705 |
|
shrp r95 = r111, r95, 56 |
706 |
|
br.cond.sptk.many .Lber // and then we jump to the calculation |
707 |
|
;; |
708 |
|
.Lmod6: |
709 |
|
shrp r64 = r65, r64, 48 |
710 |
|
shrp r65 = r96, r65, 48 |
711 |
|
shrp r66 = r67, r66, 48 |
712 |
|
shrp r67 = r97, r67, 48 |
713 |
|
shrp r68 = r69, r68, 48 |
714 |
|
shrp r69 = r98, r69, 48 |
715 |
|
shrp r70 = r71, r70, 48 |
716 |
|
shrp r71 = r99, r71, 48 |
717 |
|
shrp r72 = r73, r72, 48 |
718 |
|
shrp r73 = r100, r73, 48 |
719 |
|
shrp r74 = r75, r74, 48 |
720 |
|
shrp r75 = r101, r75, 48 |
721 |
|
shrp r76 = r77, r76, 48 |
722 |
|
shrp r77 = r102, r77, 48 |
723 |
|
shrp r78 = r79, r78, 48 |
724 |
|
shrp r79 = r103, r79, 48 |
725 |
|
shrp r80 = r81, r80, 48 |
726 |
|
shrp r81 = r104, r81, 48 |
727 |
|
shrp r82 = r83, r82, 48 |
728 |
|
shrp r83 = r105, r83, 48 |
729 |
|
shrp r84 = r85, r84, 48 |
730 |
|
shrp r85 = r106, r85, 48 |
731 |
|
shrp r86 = r87, r86, 48 |
732 |
|
shrp r87 = r107, r87, 48 |
733 |
|
shrp r88 = r89, r88, 48 |
734 |
|
shrp r89 = r108, r89, 48 |
735 |
|
shrp r90 = r91, r90, 48 |
736 |
|
shrp r91 = r109, r91, 48 |
737 |
|
shrp r92 = r93, r92, 48 |
738 |
|
shrp r93 = r110, r93, 48 |
739 |
|
shrp r94 = r95, r94, 48 |
740 |
|
shrp r95 = r111, r95, 48 |
741 |
|
br.cond.sptk.many .Lber |
742 |
|
;; |
743 |
|
.Lmod5: |
744 |
|
shrp r64 = r65, r64, 40 |
745 |
|
shrp r65 = r96, r65, 40 |
746 |
|
shrp r66 = r67, r66, 40 |
747 |
|
shrp r67 = r97, r67, 40 |
748 |
|
shrp r68 = r69, r68, 40 |
749 |
|
shrp r69 = r98, r69, 40 |
750 |
|
shrp r70 = r71, r70, 40 |
751 |
|
shrp r71 = r99, r71, 40 |
752 |
|
shrp r72 = r73, r72, 40 |
753 |
|
shrp r73 = r100, r73, 40 |
754 |
|
shrp r74 = r75, r74, 40 |
755 |
|
shrp r75 = r101, r75, 40 |
756 |
|
shrp r76 = r77, r76, 40 |
757 |
|
shrp r77 = r102, r77, 40 |
758 |
|
shrp r78 = r79, r78, 40 |
759 |
|
shrp r79 = r103, r79, 40 |
760 |
|
shrp r80 = r81, r80, 40 |
761 |
|
shrp r81 = r104, r81, 40 |
762 |
|
shrp r82 = r83, r82, 40 |
763 |
|
shrp r83 = r105, r83, 40 |
764 |
|
shrp r84 = r85, r84, 40 |
765 |
|
shrp r85 = r106, r85, 40 |
766 |
|
shrp r86 = r87, r86, 40 |
767 |
|
shrp r87 = r107, r87, 40 |
768 |
|
shrp r88 = r89, r88, 40 |
769 |
|
shrp r89 = r108, r89, 40 |
770 |
|
shrp r90 = r91, r90, 40 |
771 |
|
shrp r91 = r109, r91, 40 |
772 |
|
shrp r92 = r93, r92, 40 |
773 |
|
shrp r93 = r110, r93, 40 |
774 |
|
shrp r94 = r95, r94, 40 |
775 |
|
shrp r95 = r111, r95, 40 |
776 |
|
br.cond.sptk.many .Lber |
777 |
|
;; |
778 |
|
.Lmod4: |
779 |
|
shrp r64 = r65, r64, 32 |
780 |
|
shrp r65 = r96, r65, 32 |
781 |
|
shrp r66 = r67, r66, 32 |
782 |
|
shrp r67 = r97, r67, 32 |
783 |
|
shrp r68 = r69, r68, 32 |
784 |
|
shrp r69 = r98, r69, 32 |
785 |
|
shrp r70 = r71, r70, 32 |
786 |
|
shrp r71 = r99, r71, 32 |
787 |
|
shrp r72 = r73, r72, 32 |
788 |
|
shrp r73 = r100, r73, 32 |
789 |
|
shrp r74 = r75, r74, 32 |
790 |
|
shrp r75 = r101, r75, 32 |
791 |
|
shrp r76 = r77, r76, 32 |
792 |
|
shrp r77 = r102, r77, 32 |
793 |
|
shrp r78 = r79, r78, 32 |
794 |
|
shrp r79 = r103, r79, 32 |
795 |
|
shrp r80 = r81, r80, 32 |
796 |
|
shrp r81 = r104, r81, 32 |
797 |
|
shrp r82 = r83, r82, 32 |
798 |
|
shrp r83 = r105, r83, 32 |
799 |
|
shrp r84 = r85, r84, 32 |
800 |
|
shrp r85 = r106, r85, 32 |
801 |
|
shrp r86 = r87, r86, 32 |
802 |
|
shrp r87 = r107, r87, 32 |
803 |
|
shrp r88 = r89, r88, 32 |
804 |
|
shrp r89 = r108, r89, 32 |
805 |
|
shrp r90 = r91, r90, 32 |
806 |
|
shrp r91 = r109, r91, 32 |
807 |
|
shrp r92 = r93, r92, 32 |
808 |
|
shrp r93 = r110, r93, 32 |
809 |
|
shrp r94 = r95, r94, 32 |
810 |
|
shrp r95 = r111, r95, 32 |
811 |
|
br.cond.sptk.many .Lber |
812 |
|
;; |
813 |
|
.Lmod3: |
814 |
|
shrp r64 = r65, r64, 24 |
815 |
|
shrp r65 = r96, r65, 24 |
816 |
|
shrp r66 = r67, r66, 24 |
817 |
|
shrp r67 = r97, r67, 24 |
818 |
|
shrp r68 = r69, r68, 24 |
819 |
|
shrp r69 = r98, r69, 24 |
820 |
|
shrp r70 = r71, r70, 24 |
821 |
|
shrp r71 = r99, r71, 24 |
822 |
|
shrp r72 = r73, r72, 24 |
823 |
|
shrp r73 = r100, r73, 24 |
824 |
|
shrp r74 = r75, r74, 24 |
825 |
|
shrp r75 = r101, r75, 24 |
826 |
|
shrp r76 = r77, r76, 24 |
827 |
|
shrp r77 = r102, r77, 24 |
828 |
|
shrp r78 = r79, r78, 24 |
829 |
|
shrp r79 = r103, r79, 24 |
830 |
|
shrp r80 = r81, r80, 24 |
831 |
|
shrp r81 = r104, r81, 24 |
832 |
|
shrp r82 = r83, r82, 24 |
833 |
|
shrp r83 = r105, r83, 24 |
834 |
|
shrp r84 = r85, r84, 24 |
835 |
|
shrp r85 = r106, r85, 24 |
836 |
|
shrp r86 = r87, r86, 24 |
837 |
|
shrp r87 = r107, r87, 24 |
838 |
|
shrp r88 = r89, r88, 24 |
839 |
|
shrp r89 = r108, r89, 24 |
840 |
|
shrp r90 = r91, r90, 24 |
841 |
|
shrp r91 = r109, r91, 24 |
842 |
|
shrp r92 = r93, r92, 24 |
843 |
|
shrp r93 = r110, r93, 24 |
844 |
|
shrp r94 = r95, r94, 24 |
845 |
|
shrp r95 = r111, r95, 24 |
846 |
|
br.cond.sptk.many .Lber |
847 |
|
;; |
848 |
|
.Lmod2: |
849 |
|
shrp r64 = r65, r64, 16 |
850 |
|
shrp r65 = r96, r65, 16 |
851 |
|
shrp r66 = r67, r66, 16 |
852 |
|
shrp r67 = r97, r67, 16 |
853 |
|
shrp r68 = r69, r68, 16 |
854 |
|
shrp r69 = r98, r69, 16 |
855 |
|
shrp r70 = r71, r70, 16 |
856 |
|
shrp r71 = r99, r71, 16 |
857 |
|
shrp r72 = r73, r72, 16 |
858 |
|
shrp r73 = r100, r73, 16 |
859 |
|
shrp r74 = r75, r74, 16 |
860 |
|
shrp r75 = r101, r75, 16 |
861 |
|
shrp r76 = r77, r76, 16 |
862 |
|
shrp r77 = r102, r77, 16 |
863 |
|
shrp r78 = r79, r78, 16 |
864 |
|
shrp r79 = r103, r79, 16 |
865 |
|
shrp r80 = r81, r80, 16 |
866 |
|
shrp r81 = r104, r81, 16 |
867 |
|
shrp r82 = r83, r82, 16 |
868 |
|
shrp r83 = r105, r83, 16 |
869 |
|
shrp r84 = r85, r84, 16 |
870 |
|
shrp r85 = r106, r85, 16 |
871 |
|
shrp r86 = r87, r86, 16 |
872 |
|
shrp r87 = r107, r87, 16 |
873 |
|
shrp r88 = r89, r88, 16 |
874 |
|
shrp r89 = r108, r89, 16 |
875 |
|
shrp r90 = r91, r90, 16 |
876 |
|
shrp r91 = r109, r91, 16 |
877 |
|
shrp r92 = r93, r92, 16 |
878 |
|
shrp r93 = r110, r93, 16 |
879 |
|
shrp r94 = r95, r94, 16 |
880 |
|
shrp r95 = r111, r95, 16 |
881 |
|
br.cond.sptk.many .Lber |
882 |
|
;; |
883 |
|
.Lmod1: |
884 |
|
shrp r64 = r65, r64, 8 |
885 |
|
shrp r65 = r96, r65, 8 |
886 |
|
shrp r66 = r67, r66, 8 |
887 |
|
shrp r67 = r97, r67, 8 |
888 |
|
shrp r68 = r69, r68, 8 |
889 |
|
shrp r69 = r98, r69, 8 |
890 |
|
shrp r70 = r71, r70, 8 |
891 |
|
shrp r71 = r99, r71, 8 |
892 |
|
shrp r72 = r73, r72, 8 |
893 |
|
shrp r73 = r100, r73, 8 |
894 |
|
shrp r74 = r75, r74, 8 |
895 |
|
shrp r75 = r101, r75, 8 |
896 |
|
shrp r76 = r77, r76, 8 |
897 |
|
shrp r77 = r102, r77, 8 |
898 |
|
shrp r78 = r79, r78, 8 |
899 |
|
shrp r79 = r103, r79, 8 |
900 |
|
shrp r80 = r81, r80, 8 |
901 |
|
shrp r81 = r104, r81, 8 |
902 |
|
shrp r82 = r83, r82, 8 |
903 |
|
shrp r83 = r105, r83, 8 |
904 |
|
shrp r84 = r85, r84, 8 |
905 |
|
shrp r85 = r106, r85, 8 |
906 |
|
shrp r86 = r87, r86, 8 |
907 |
|
shrp r87 = r107, r87, 8 |
908 |
|
shrp r88 = r89, r88, 8 |
909 |
|
shrp r89 = r108, r89, 8 |
910 |
|
shrp r90 = r91, r90, 8 |
911 |
|
shrp r91 = r109, r91, 8 |
912 |
|
shrp r92 = r93, r92, 8 |
913 |
|
shrp r93 = r110, r93, 8 |
914 |
|
shrp r94 = r95, r94, 8 |
915 |
|
shrp r95 = r111, r95, 8 |
916 |
|
.Lber: |
917 |
|
;; |
918 |
|
psad1 r32 = r32, r64 // Here we do the calculation. |
919 |
|
psad1 r33 = r33, r65 // The machine is providing a fast method |
920 |
|
psad1 r34 = r34, r66 // for calculating sad, so we use it |
921 |
|
psad1 r35 = r35, r67 |
922 |
|
psad1 r36 = r36, r68 |
923 |
|
psad1 r37 = r37, r69 |
924 |
|
psad1 r38 = r38, r70 |
925 |
|
psad1 r39 = r39, r71 |
926 |
|
psad1 r40 = r40, r72 |
927 |
|
psad1 r41 = r41, r73 |
928 |
|
psad1 r42 = r42, r74 |
929 |
|
psad1 r43 = r43, r75 |
930 |
|
psad1 r44 = r44, r76 |
931 |
|
psad1 r45 = r45, r77 |
932 |
|
psad1 r46 = r46, r78 |
933 |
|
psad1 r47 = r47, r79 |
934 |
|
psad1 r48 = r48, r80 |
935 |
|
psad1 r49 = r49, r81 |
936 |
|
psad1 r50 = r50, r82 |
937 |
|
psad1 r51 = r51, r83 |
938 |
|
psad1 r52 = r52, r84 |
939 |
|
psad1 r53 = r53, r85 |
940 |
|
psad1 r54 = r54, r86 |
941 |
|
psad1 r55 = r55, r87 |
942 |
|
psad1 r56 = r56, r88 |
943 |
|
psad1 r57 = r57, r89 |
944 |
|
psad1 r58 = r58, r90 |
945 |
|
psad1 r59 = r59, r91 |
946 |
|
psad1 r60 = r60, r92 |
947 |
|
psad1 r61 = r61, r93 |
948 |
|
psad1 r62 = r62, r94 |
949 |
|
psad1 r63 = r63, r95 |
950 |
|
;; |
951 |
|
add r32 = r32, r63 // at last, we have to sum up |
952 |
|
add r33 = r33, r62 // in 5 stages |
953 |
|
add r34 = r34, r61 |
954 |
|
add r35 = r35, r60 |
955 |
|
add r36 = r36, r59 |
956 |
|
add r37 = r37, r58 |
957 |
|
add r38 = r38, r57 |
958 |
|
add r39 = r39, r56 |
959 |
|
add r40 = r40, r55 |
960 |
|
add r41 = r41, r54 |
961 |
|
add r42 = r42, r53 |
962 |
|
add r43 = r43, r52 |
963 |
|
add r44 = r44, r51 |
964 |
|
add r45 = r45, r50 |
965 |
|
add r46 = r46, r49 |
966 |
|
add r47 = r47, r48 |
967 |
|
;; |
968 |
|
add r32 = r32, r47 |
969 |
|
add r33 = r33, r46 |
970 |
|
add r34 = r34, r45 |
971 |
|
add r35 = r35, r44 |
972 |
|
add r36 = r36, r43 |
973 |
|
add r37 = r37, r42 |
974 |
|
add r38 = r38, r41 |
975 |
|
add r39 = r39, r40 |
976 |
|
;; |
977 |
|
add r32 = r32, r39 |
978 |
|
add r33 = r33, r38 |
979 |
|
add r34 = r34, r37 |
980 |
|
add r35 = r35, r36 |
981 |
|
;; |
982 |
|
add r32 = r32, r35 |
983 |
|
add r33 = r33, r34 |
984 |
|
;; |
985 |
|
add r8 = r32, r33 // and store the result in r8 |
986 |
|
mov pr = r2, -1 |
987 |
|
mov ar.pfs = r1 |
988 |
|
br.ret.sptk.many b0 |
989 |
|
.endp sad16_ia64# |
990 |
|
|
991 |
|
|
992 |
|
|
993 |
|
|
994 |
|
// ------------------------------------------------------------------------------
// sad8_ia64
//
// Sum of absolute differences over 8 rows of 8 bytes each.
//
// In:   r32 = cur     (each row is fetched with one ld8 at cur + k*stride)
//       r33 = ref     (any byte alignment; the low 3 address bits are handled)
//       r34 = stride  (byte distance between consecutive rows)
// Out:  r8  = sum of the eight psad1 results
// Uses: r2 (saved pr), r9 (saved ar.pfs), r14, r31, r32-r55, p16-p29
//
// ref is read from its 8-byte-aligned base address. When ref is not aligned,
// the following 8 bytes of every row are loaded as well and the wanted bytes
// are extracted from the 16-byte pair with shrp (shift count = 8 * misalignment).
//
// ar.pfs is saved in r9, not r1: r1 is the global pointer (gp) and has to be
// intact when this procedure returns.
//
// NOTE(review): r34 is used as a full 64-bit value. If the C prototype passes
// the stride as a 32-bit type, confirm whether it needs to be extended first.
// ------------------------------------------------------------------------------
	.align 16
	.global sad8_ia64#
	.proc sad8_ia64#
sad8_ia64:
	alloc r9 = ar.pfs, 3, 21, 0, 0		// 3 inputs + 21 locals: r32-r55
	mov r2 = pr				// p16-p29 are written below, so keep a copy of pr
	dep r14 = r0, r33, 0, 3			// r14 = ref with the low 3 bits cleared (aligned base)
	dep.z r31 = r33, 0, 3			// r31 = ref & 7 (misalignment in bytes)
	;;
	mov r40 = r34				// 1 * stride
	shl r41 = r34, 1			// 2 * stride
	shladd r42 = r34, 1, r34		// 3 * stride
	shl r43 = r34, 2			// 4 * stride
	shladd r44 = r34, 2, r34		// 5 * stride
	;;
	cmp.eq p16, p17 = 0, r31		// p16: ref is aligned
	cmp.eq p18, p19 = 2, r31		// p18: misalignment 2
	shl r45 = r42, 1			// 6 * stride
	cmp.eq p20, p21 = 4, r31		// p20: misalignment 4
	cmp.eq p22, p23 = 6, r31		// p22: misalignment 6
	shladd r46 = r42, 1, r34		// 7 * stride
	cmp.eq p24, p25 = 1, r31		// p24: misalignment 1
	cmp.eq p26, p27 = 3, r31		// p26: misalignment 3
	cmp.eq p28, p29 = 5, r31		// p28: misalignment 5
	;;
	// Row addresses: r32-r39 = cur rows 0-7, r48-r55 = aligned ref rows 0-7.
	// r33 and r34 are overwritten here; ref and stride are no longer needed
	// because r14, r31 and r40-r46 already hold everything derived from them.
	mov r48 = r14
	add r33 = r32, r40
	add r49 = r14, r40
	add r34 = r32, r41
	add r50 = r14, r41
	add r35 = r32, r42
	add r51 = r14, r42
	add r36 = r32, r43
	add r52 = r14, r43
	add r37 = r32, r44
	add r53 = r14, r44
	add r38 = r32, r45
	add r54 = r14, r45
	add r39 = r32, r46
	add r55 = r14, r46
	;;
	// Load all eight rows: cur data replaces its addresses in r32-r39,
	// ref data goes to r40-r47. The ref pointers are post-incremented by 8
	// so that they already address the second half of each row.
	ld8 r32 = [r32]
	ld8 r33 = [r33]
	ld8 r34 = [r34]
	ld8 r35 = [r35]
	ld8 r36 = [r36]
	ld8 r37 = [r37]
	ld8 r38 = [r38]
	ld8 r39 = [r39]
	ld8 r40 = [r48], 8
	ld8 r41 = [r49], 8
	ld8 r42 = [r50], 8
	ld8 r43 = [r51], 8
	ld8 r44 = [r52], 8
	ld8 r45 = [r53], 8
	ld8 r46 = [r54], 8
	ld8 r47 = [r55], 8
(p16)	br.cond.dptk.many .Lber2		// aligned ref: no realignment needed
	;;
	// Misaligned ref: fetch the next 8 bytes of every row into r48-r55,
	// then dispatch on the misalignment.
	ld8 r48 = [r48]
	ld8 r49 = [r49]
	ld8 r50 = [r50]
	ld8 r51 = [r51]
	ld8 r52 = [r52]
	ld8 r53 = [r53]
	ld8 r54 = [r54]
	ld8 r55 = [r55]
(p24)	br.cond.dptk.many .Lmode1
(p18)	br.cond.dpnt.many .Lmode2
(p26)	br.cond.dpnt.many .Lmode3
(p20)	br.cond.dpnt.many .Lmode4
(p28)	br.cond.dpnt.many .Lmode5
(p22)	br.cond.dpnt.many .Lmode6
	;;
.Lmode7:					// fall-through case (misalignment 7); label is for readability only
	shrp r40 = r48, r40, 56			// row = (second:first) >> 56 bits
	shrp r41 = r49, r41, 56
	shrp r42 = r50, r42, 56
	shrp r43 = r51, r43, 56
	shrp r44 = r52, r44, 56
	shrp r45 = r53, r45, 56
	shrp r46 = r54, r46, 56
	shrp r47 = r55, r47, 56
	br.cond.sptk.many .Lber2
	;;
.Lmode6:					// misalignment 6: shift by 48 bits
	shrp r40 = r48, r40, 48
	shrp r41 = r49, r41, 48
	shrp r42 = r50, r42, 48
	shrp r43 = r51, r43, 48
	shrp r44 = r52, r44, 48
	shrp r45 = r53, r45, 48
	shrp r46 = r54, r46, 48
	shrp r47 = r55, r47, 48
	br.cond.sptk.many .Lber2
	;;
.Lmode5:					// misalignment 5: shift by 40 bits
	shrp r40 = r48, r40, 40
	shrp r41 = r49, r41, 40
	shrp r42 = r50, r42, 40
	shrp r43 = r51, r43, 40
	shrp r44 = r52, r44, 40
	shrp r45 = r53, r45, 40
	shrp r46 = r54, r46, 40
	shrp r47 = r55, r47, 40
	br.cond.sptk.many .Lber2
	;;
.Lmode4:					// misalignment 4: shift by 32 bits
	shrp r40 = r48, r40, 32
	shrp r41 = r49, r41, 32
	shrp r42 = r50, r42, 32
	shrp r43 = r51, r43, 32
	shrp r44 = r52, r44, 32
	shrp r45 = r53, r45, 32
	shrp r46 = r54, r46, 32
	shrp r47 = r55, r47, 32
	br.cond.sptk.many .Lber2
	;;
.Lmode3:					// misalignment 3: shift by 24 bits
	shrp r40 = r48, r40, 24
	shrp r41 = r49, r41, 24
	shrp r42 = r50, r42, 24
	shrp r43 = r51, r43, 24
	shrp r44 = r52, r44, 24
	shrp r45 = r53, r45, 24
	shrp r46 = r54, r46, 24
	shrp r47 = r55, r47, 24
	br.cond.sptk.many .Lber2
	;;
.Lmode2:					// misalignment 2: shift by 16 bits
	shrp r40 = r48, r40, 16
	shrp r41 = r49, r41, 16
	shrp r42 = r50, r42, 16
	shrp r43 = r51, r43, 16
	shrp r44 = r52, r44, 16
	shrp r45 = r53, r45, 16
	shrp r46 = r54, r46, 16
	shrp r47 = r55, r47, 16
	br.cond.sptk.many .Lber2
	;;
.Lmode1:					// misalignment 1: shift by 8 bits, then fall into .Lber2
	shrp r40 = r48, r40, 8
	shrp r41 = r49, r41, 8
	shrp r42 = r50, r42, 8
	shrp r43 = r51, r43, 8
	shrp r44 = r52, r44, 8
	shrp r45 = r53, r45, 8
	shrp r46 = r54, r46, 8
	shrp r47 = r55, r47, 8
.Lber2:
	;;
	psad1 r32 = r32, r40			// per-row SAD: cur row k against ref row k
	psad1 r33 = r33, r41
	psad1 r34 = r34, r42
	psad1 r35 = r35, r43
	psad1 r36 = r36, r44
	psad1 r37 = r37, r45
	psad1 r38 = r38, r46
	psad1 r39 = r39, r47
	;;
	add r32 = r32, r33			// reduce the 8 row sums pairwise: 8 -> 4
	add r33 = r34, r35
	add r34 = r36, r37
	add r35 = r38, r39
	;;
	add r32 = r32, r33			// 4 -> 2
	add r33 = r34, r35
	;;
	add r8 = r32, r33			// 2 -> 1, result returned in r8
	mov pr = r2, -1				// restore all predicates
	mov ar.pfs = r9				// restore the frame state saved by alloc
	br.ret.sptk.many b0
	.endp sad8_ia64#