Parent Directory | Revision Log
Revision 250 -
(view)
(download)
Original Path: trunk/xvidcore/src/motion/ia64_asm/sad_ia64.s
1 : | ia64p | 250 | // ------------------------------------------------------------------------------ |
2 : | // * | ||
3 : | // * Optimized Assembler Versions of sad8 and sad16 | ||
4 : | // * | ||
5 : | // ------------------------------------------------------------------------------ | ||
6 : | // * | ||
7 : | // * Hannes Jütting and Christopher Özbek | ||
8 : | // * {s_juetti,s_oezbek}@ira.uka.de | ||
9 : | // * | ||
10 : | // * Programmed for the IA64 laboratory held at University Karlsruhe 2002 | ||
11 : | // * http://www.info.uni-karlsruhe.de/~rubino/ia64p/ | ||
12 : | // * | ||
13 : | // ------------------------------------------------------------------------------ | ||
14 : | // * | ||
15 : | // * These are the optimized assembler versions of sad8 and sad16, which calculate | ||
16 : | // * the sum of absolute differences between two 8x8/16x16 block matrices. | ||
17 : | // * | ||
18 : | // * Our approach uses: | ||
19 : | // * - The Itanium instruction psad1, which computes the sum of absolute byte differences in hardware. | ||
20 : | // * - Modulo-scheduled (software-pipelined) loops as the preferred alternative to loop unrolling on the IA64 | ||
21 : | // * EPIC architecture | ||
22 : | // * - Alignment handling for the unaligned ref pointer to avoid memory faults | ||
23 : | // * | ||
24 : | // ------------------------------------------------------------------------------ | ||
25 : | |||
26 : | .text | ||
27 : | |||
28 : | // ------------------------------------------------------------------------------ | ||
29 : | // * SAD16_IA64 | ||
30 : | // * | ||
31 : | // * In: | ||
32 : | // * r32 = cur (aligned) | ||
33 : | // * r33 = ref (not aligned) | ||
34 : | // * r34 = stride | ||
35 : | // * r35 = bestsad | ||
36 : | // * Out: | ||
37 : | // * r8 = sum of absolute differences | ||
38 : | // * | ||
39 : | // ------------------------------------------------------------------------------ | ||
40 : | |||
41 : | ia64p | 205 | .align 16 |
42 : | .global sad16_ia64# | ||
43 : | .proc sad16_ia64# | ||
44 : | sad16_ia64: | ||
45 : | |||
46 : | |||
47 : | ia64p | 250 | // Define latencies; they size the stages of the software pipeline below |
48 : | LL16=3 // load latency | ||
49 : | SL16=1 // shift latency | ||
50 : | OL16=1 // or latency | ||
51 : | PL16=1 // psad latency | ||
52 : | AL16=1 // add latency | ||
53 : | ia64p | 205 | |
54 : | ia64p | 250 | // Allocate registers in the RSE: 4 inputs, 36 locals, 0 outputs, 40 rotating |
55 : | alloc r9=ar.pfs,4,36,0,40 | ||
56 : | ia64p | 205 | |
57 : | ia64p | 250 | // lfetch [r32] // might help |
58 : | |||
59 : | mov r8 = r0 // clear the return reg | ||
60 : | ia64p | 205 | |
61 : | ia64p | 250 | // Save LC and predicates |
62 : | mov r20 = ar.lc | ||
63 : | mov r21 = pr | ||
64 : | ia64p | 205 | |
65 : | ia64p | 250 | dep.z r23 = r33, 3, 3 // r23 = (ref & 7) << 3 = # of bits ref is misaligned |
66 : | and r15 = -8, r33 // align the ref pointer by clearing its low 3 bits | ||
67 : | |||
// The arguments are copied to static registers because all 40 stacked registers
// (r32 and up) rotate inside the loop.
68 : | mov r14 = r32 // copy the cur pointer | ||
69 : | mov r16 = r34 // copy stride | ||
70 : | mov r17 = r35 // copy bestsad | ||
71 : | |||
72 : | ia64p | 205 | ;; |
73 : | ia64p | 250 | add r18 = 8, r14 // precalc second cur pointer |
74 : | add r19 = 8, r15 // precalc second ref pointer | ||
75 : | add r27 = 16, r15 // precalc third ref pointer | ||
76 : | sub r25 = 64, r23 // # of left-shift bits (64 - misalignment), used by the shl instructions | ||
77 : | ia64p | 205 | |
78 : | ia64p | 250 | // Initialize loop counters: LC = 15 (16 rows), EC = number of pipeline stages |
79 : | mov ar.lc = 15 // loop 16 times | ||
80 : | mov ar.ec = LL16 + SL16 + OL16 + PL16 + AL16 + AL16 | ||
81 : | mov pr.rot = 1 << 16 // resetting rotating predicate regs and setting p16 to 1 | ||
82 : | ia64p | 205 | ;; |
83 : | |||
84 : | ia64p | 250 | // Initialize arrays for register rotation |
// NOTE: the one-element arrays (r_ref_16_shru1, r_ref_16_shl1, r_ref_16_shru2, r_ref_16_or1) have no
// slot for "value from the previous stage". The loop instead reads that value through the next
// name in the .rotr list (e.g. last stage's r_ref_16_shru1[0] is read as r_ref_16_shl1[0]).
// NOTE(review): this relies on .rotr placing the arrays in consecutive registers - confirm before reordering.
85 : | .rotr r_cur_ld1[LL16+SL16+OL16+1], r_cur_ld2[LL16+SL16+OL16+1], r_ref_16_ld1[LL16+1], r_ref_16_ld2[LL16+1], r_ref_16_ld3[LL16+1], r_ref_16_shru1[SL16], r_ref_16_shl1[SL16], r_ref_16_shru2[SL16], r_ref_16_shl2[SL16+1], r_ref_16_or1[OL16], r_ref_16_or2[OL16+1], r_psad1[PL16+1], r_psad2[PL16+1], r_add_16[AL16+1] | ||
86 : | .rotp p_ld_16[LL16], p_sh_16[SL16], p_or_16[OL16], p_psad_16[PL16], p_add1_16[AL16], p_add2_16[AL16] | ||
87 : | ia64p | 205 | |
88 : | ia64p | 250 | .L_loop16: |
89 : | ia64p | 205 | {.mmi |
90 : | ia64p | 250 | (p_ld_16[0]) ld8 r_cur_ld1[0] = [r14], r16 // Cur load first 8 Byte |
91 : | (p_ld_16[0]) ld8 r_cur_ld2[0] = [r18], r16 // Cur load next 8 Byte | ||
92 : | (p_psad_16[0]) psad1 r_psad1[0] = r_cur_ld1[LL16+SL16+OL16], r_ref_16_or2[0] // psad of cur and ref (r_ref_16_or2[0] is last stage's r_ref_16_or1) | ||
93 : | ia64p | 205 | } |
94 : | {.mmi | ||
95 : | ia64p | 250 | (p_ld_16[0]) ld8 r_ref_16_ld1[0] = [r15], r16 // Ref load first 8 Byte (from the aligned-down address) |
96 : | (p_ld_16[0]) ld8 r_ref_16_ld2[0] = [r19], r16 // Ref load next 8 Byte (from the aligned-down address) | ||
97 : | (p_psad_16[0]) psad1 r_psad2[0] = r_cur_ld2[LL16+SL16+OL16], r_ref_16_or2[OL16] // psad of cur_2 and ref_2 | ||
98 : | ia64p | 205 | } |
99 : | {.mii | ||
100 : | ia64p | 250 | (p_ld_16[0]) ld8 r_ref_16_ld3[0] = [r27], r16 // Ref load third 8 Byte (from the aligned-down address) |
101 : | (p_or_16[0]) or r_ref_16_or1[0] = r_ref_16_shl1[0], r_ref_16_shru2[0] // first ref qword: or of last stage's shru1 and shl1 results (read one name further after rotation) | ||
102 : | (p_sh_16[0]) shr.u r_ref_16_shru1[0] = r_ref_16_ld1[LL16], r23 // low part of first ref qword | ||
103 : | ia64p | 205 | } |
104 : | {.mii | ||
105 : | ia64p | 250 | (p_or_16[0]) or r_ref_16_or2[0] = r_ref_16_shl2[0], r_ref_16_shl2[SL16] // second ref qword: or of last stage's shru2 and shl2 results (read one name further after rotation) |
106 : | (p_sh_16[0]) shl r_ref_16_shl1[0] = r_ref_16_ld2[LL16], r25 // high part of first ref qword | ||
107 : | (p_sh_16[0]) shr.u r_ref_16_shru2[0] = r_ref_16_ld2[LL16], r23 // low part of second ref qword | ||
108 : | ia64p | 205 | } |
// Early exit: once the running sum r8 is >= bestsad (r17) the loop is left
// and r8 is returned as accumulated so far.
109 : | ia64p | 250 | {.mib |
110 : | (p_add2_16[0]) cmp.ge.unc p6, p7 = r8, r17 | ||
111 : | (p_sh_16[0]) shl r_ref_16_shl2[0]= r_ref_16_ld3[LL16], r25 // high part of second ref qword | ||
112 : | (p6) br.spnt.few .L_loop_exit16 | ||
113 : | ia64p | 205 | } |
114 : | {.mmb | ||
115 : | ia64p | 250 | (p_add1_16[0]) add r_add_16[0] = r_psad1[PL16], r_psad2[PL16] // add the psad results |
116 : | (p_add2_16[0]) add r8 = r8, r_add_16[AL16] // add the results to the sum | ||
117 : | br.ctop.sptk.few .L_loop16 | ||
118 : | ia64p | 205 | ;; |
119 : | } | ||
120 : | ia64p | 250 | .L_loop_exit16: |
121 : | |||
122 : | // Restore LC and predicates | ||
123 : | mov ar.lc = r20 | ||
124 : | mov pr = r21,-1 | ||
125 : | |||
126 : | // Return | ||
127 : | ia64p | 205 | br.ret.sptk.many rp |
128 : | .endp sad16_ia64# | ||
129 : | |||
130 : | ia64p | 250 | // ------------------------------------------------------------------------------ |
131 : | // * SAD8_IA64 | ||
132 : | // * | ||
133 : | // * In: | ||
134 : | // * r32 = cur (aligned) | ||
135 : | // * r33 = ref (not aligned) | ||
136 : | // * r34 = stride | ||
137 : | // * Out: | ||
138 : | // * r8 = sum of absolute differences | ||
139 : | // * | ||
140 : | // ------------------------------------------------------------------------------ | ||
141 : | |||
142 : | ia64p | 205 | .align 16 |
143 : | .global sad8_ia64# | ||
144 : | .proc sad8_ia64# | ||
145 : | |||
146 : | sad8_ia64: | ||
147 : | |||
148 : | |||
149 : | ia64p | 250 | // Define latencies; they size the stages of the software pipeline below |
150 : | LL8=3 // load latency | ||
151 : | SL8=1 // shift latency | ||
152 : | OL8=1 // or latency | ||
153 : | PL8=1 // psad latency | ||
154 : | AL8=1 // add latency | ||
155 : | ia64p | 205 | |
156 : | ia64p | 250 | // Allocate registers in the RSE: 3 inputs, 21 locals, 0 outputs, 24 rotating |
157 : | alloc r9 = ar.pfs,3,21,0,24 | ||
158 : | |||
159 : | // lfetch [r32] // Maybe this helps? | ||
160 : | |||
161 : | mov r8 = r0 // Initialize result | ||
162 : | |||
// The arguments are copied to static registers because all 24 stacked registers
// (r32 and up) rotate inside the loop.
163 : | mov r14 = r32 // Copy Cur | ||
164 : | and r15 = -8, r33 // Align the Ref pointer by clearing its low 3 bits | ||
165 : | mov r16 = r34 // Copy Stride | ||
166 : | |||
167 : | // Save LC and predicates | ||
168 : | mov r20 = ar.lc | ||
169 : | mov r21 = pr | ||
170 : | ia64p | 205 | |
171 : | ia64p | 250 | dep.z r23 = r33, 3, 3 // r23 = (ref & 7) << 3 = # of bits ref is misaligned |
172 : | ia64p | 205 | |
173 : | ia64p | 250 | ;; |
174 : | ia64p | 205 | |
175 : | ia64p | 250 | add r19 = 8, r15 // Precalculate second load-offset |
176 : | sub r25 = 64, r23 // Precalculate # of left-shift bits (64 - misalignment) | ||
177 : | ia64p | 205 | |
178 : | ia64p | 250 | // Initialize Loop-Counters |
179 : | mov ar.lc = 7 // LC = 7: the loop body runs 8 times (one per row) | ||
180 : | mov ar.ec = LL8 + SL8 + OL8 + PL8 + AL8 // Epilogue count = number of pipeline stages | ||
181 : | mov pr.rot = 1 << 16 // Reset Predicate Registers and initialize with P16 | ||
182 : | |||
183 : | // Initialize Arrays for Register Rotation | ||
184 : | .rotr r_cur_ld[LL8+SL8+OL8+1], r_ref_ld1[LL8+1], r_ref_ld2[LL8+1], r_shru[SL8+1], r_shl[SL8+1], r_or[OL8+1], r_psad[PL8+1] | ||
185 : | .rotp p_ld[LL8], p_sh[SL8], p_or[OL8], p_psad[PL8], p_add[AL8] | ||
186 : | |||
187 : | ia64p | 205 | ;; |
188 : | ia64p | 250 | .L_loop8: |
189 : | // {.mmi | ||
190 : | (p_ld[0]) ld8 r_ref_ld1[0] = [r15], r16 // Load 1st 8Byte from Ref (aligned-down address) | ||
191 : | (p_ld[0]) ld8 r_cur_ld[0] = [r14], r16 // Load Cur | ||
192 : | (p_psad[0]) psad1 r_psad[0] = r_cur_ld[LL8+SL8+OL8], r_or[OL8] // Do the Calculation | ||
193 : | // } | ||
194 : | // {.mii | ||
195 : | (p_ld[0]) ld8 r_ref_ld2[0] = [r19], r16 // Load 2nd 8Byte from Ref | ||
196 : | (p_sh[0]) shr.u r_shru[0] = r_ref_ld1[LL8], r23 // Low part of the unaligned Ref qword | ||
197 : | (p_sh[0]) shl r_shl[0] = r_ref_ld2[LL8], r25 // High part of the unaligned Ref qword | ||
198 : | // } | ||
199 : | // {.mib | ||
200 : | (p_or[0]) or r_or[0] = r_shru[SL8], r_shl[SL8] // Combine unaligned Ref parts | ||
201 : | (p_add[0]) add r8 = r8, r_psad[PL8] // Sum psad result | ||
202 : | br.ctop.sptk.few .L_loop8 | ||
203 : | ia64p | 205 | ;; |
204 : | ia64p | 250 | // } |
205 : | ia64p | 205 | |
206 : | ia64p | 250 | // Restore LC and predicates |
207 : | ia64p | 205 | mov ar.lc = r20 |
208 : | mov pr = r21,-1 | ||
209 : | ia64p | 250 | |
210 : | // Return | ||
211 : | ia64p | 205 | br.ret.sptk.many b0 |
212 : | .endp sad8_ia64# | ||
213 : | |||
214 : | |||
// ------------------------------------------------------------------------------
// * SAD16BI_IA64
// *
// * In:
// *   r32 = cur   (block that is compared)
// *   r33 = ref1  (first block that is averaged)
// *   r34 = ref2  (second block that is averaged)
// *   r35 = stride (only the low 32 bits are used, zero-extended)
// * Out:
// *   r8 = sum over 16 rows x 16 bytes of |cur - ((ref1 + ref2 + 1) >> 1)|
// *
// * Plain byte-at-a-time code; r32-r34 are advanced in place and ar.lc is
// * saved in r2 and restored before returning.
// * NOTE(review): the .common symbol sad16bi is not referenced in this function - confirm whether it is still needed.
// ------------------------------------------------------------------------------
215 : | .common sad16bi#,8,8 | ||
216 : | .align 16 | ||
217 : | .global sad16bi_ia64# | ||
218 : | .proc sad16bi_ia64# | ||
219 : | sad16bi_ia64: | ||
220 : | .prologue | ||
221 : | .save ar.lc, r2 | ||
222 : | mov r2 = ar.lc | ||
223 : | .body | ||
// r35 = zero-extended stride, r8 = running sum, r23 = row counter, r22 = 255 (upper clamp)
224 : | zxt4 r35 = r35 | ||
225 : | mov r8 = r0 | ||
226 : | mov r23 = r0 | ||
227 : | addl r22 = 255, r0 | ||
// Row loop: set LC = 7 (8 inner iterations, two bytes each) and take per-row
// working copies of the pointers: r19 = cur, r20 = ref1, r21 = ref2
228 : | .L21: | ||
229 : | addl r14 = 7, r0 | ||
230 : | mov r19 = r32 | ||
231 : | mov r21 = r34 | ||
232 : | mov r20 = r33 | ||
233 : | ;; | ||
234 : | mov ar.lc = r14 | ||
235 : | ;; | ||
// Inner loop, first byte of the pair: r16 = (ref1[0] + ref2[0] + 1) >> 1
236 : | .L105: | ||
237 : | mov r17 = r20 | ||
238 : | mov r18 = r21 | ||
239 : | ;; | ||
240 : | ld1 r14 = [r17], 1 | ||
241 : | ld1 r15 = [r18], 1 | ||
242 : | ;; | ||
243 : | add r14 = r14, r15 | ||
244 : | ;; | ||
245 : | adds r14 = 1, r14 | ||
246 : | ;; | ||
247 : | shr.u r16 = r14, 1 | ||
248 : | ;; | ||
// clamp r16 to the range [0, 255]
249 : | cmp4.le p6, p7 = r0, r16 | ||
250 : | ;; | ||
251 : | (p7) mov r16 = r0 | ||
252 : | (p7) br.cond.dpnt .L96 | ||
253 : | ;; | ||
254 : | cmp4.ge p6, p7 = r22, r16 | ||
255 : | ;; | ||
256 : | (p7) addl r16 = 255, r0 | ||
// r8 += |cur[0] - r16|; the ref working pointers advance by 2 for the next pair
257 : | .L96: | ||
258 : | ld1 r14 = [r19] | ||
259 : | adds r20 = 2, r20 | ||
260 : | adds r21 = 2, r21 | ||
261 : | ;; | ||
262 : | sub r15 = r14, r16 | ||
263 : | ;; | ||
264 : | cmp4.ge p6, p7 = 0, r15 | ||
265 : | ;; | ||
266 : | (p6) sub r14 = r16, r14 | ||
267 : | (p7) add r8 = r8, r15 | ||
268 : | ;; | ||
269 : | (p6) add r8 = r8, r14 | ||
// second byte of the pair: r17/r18 were post-incremented above, r17 is then reused for cur + 1
270 : | ld1 r15 = [r18] | ||
271 : | ld1 r14 = [r17] | ||
272 : | ;; | ||
273 : | add r14 = r14, r15 | ||
274 : | adds r17 = 1, r19 | ||
275 : | ;; | ||
276 : | adds r14 = 1, r14 | ||
277 : | ;; | ||
278 : | shr.u r16 = r14, 1 | ||
279 : | ;; | ||
// clamp r16 to the range [0, 255]
280 : | cmp4.le p6, p7 = r0, r16 | ||
281 : | ;; | ||
282 : | (p7) mov r16 = r0 | ||
283 : | (p7) br.cond.dpnt .L102 | ||
284 : | ;; | ||
285 : | cmp4.ge p6, p7 = r22, r16 | ||
286 : | ;; | ||
287 : | (p7) addl r16 = 255, r0 | ||
// r8 += |cur[1] - r16|; the cur working pointer advances by 2
288 : | .L102: | ||
289 : | ld1 r14 = [r17] | ||
290 : | adds r19 = 2, r19 | ||
291 : | ;; | ||
292 : | sub r15 = r14, r16 | ||
293 : | ;; | ||
294 : | cmp4.ge p6, p7 = 0, r15 | ||
295 : | ;; | ||
296 : | (p7) add r8 = r8, r15 | ||
297 : | (p6) sub r14 = r16, r14 | ||
298 : | ;; | ||
299 : | (p6) add r8 = r8, r14 | ||
300 : | br.cloop.sptk.few .L105 | ||
// end of row: bump the row counter, step all three pointers by stride,
// and repeat while the counter is <= 15 (16 rows in total)
301 : | adds r23 = 1, r23 | ||
302 : | add r32 = r32, r35 | ||
303 : | add r33 = r33, r35 | ||
304 : | add r34 = r34, r35 | ||
305 : | ;; | ||
306 : | cmp4.geu p6, p7 = 15, r23 | ||
307 : | (p6) br.cond.dptk .L21 | ||
// restore the caller's loop counter and return r8
308 : | mov ar.lc = r2 | ||
309 : | br.ret.sptk.many b0 | ||
310 : | .endp sad16bi_ia64# | ||
311 : | |||
312 : | |||
313 : | ia64p | 230 | |
314 : | |||
315 : | |||
316 : | |||
317 : | |||
318 : | |||
// ------------------------------------------------------------------------------
// * DEV16_IA64
// *
// * In:
// *   in0 (r32) = cur, start of a block of 16 rows x 16 bytes
// *   in1 (r33) = stride, added to both row pointers after each load
// * Out:
// *   ret0 (r8) = sum over all 256 bytes of |byte - mean|, where
// *               mean = (sum of all 256 bytes) >> 8 (truncated, not rounded)
// *
// * Fully unrolled and hand-bundled: the 32 quadwords of the block are loaded
// * into c[0..31], summed with psad1 against r0, and then compared with psad1
// * against the mean replicated into every byte lane.
// * NOTE(review): the ld8 loads presumably require cur to be 8-byte aligned - confirm with callers.
// ------------------------------------------------------------------------------
319 : | .text | ||
320 : | ia64p | 205 | .align 16 |
321 : | .global dev16_ia64# | ||
322 : | .proc dev16_ia64# | ||
323 : | ia64p | 230 | .auto |
324 : | ia64p | 205 | dev16_ia64: |
325 : | ia64p | 230 | // renamings for better readability |
326 : | stride = r18 | ||
327 : | pfs = r19 //for saving previous function state | ||
328 : | cura0 = r20 //address of first 8-byte block of cur | ||
329 : | cura1 = r21 //address of second 8-byte block of cur | ||
330 : | mean0 = r22 //registers for calculating the sum in parallel | ||
331 : | mean1 = r23 | ||
332 : | mean2 = r24 | ||
333 : | mean3 = r25 | ||
334 : | dev0 = r26 //same for the deviation | ||
335 : | dev1 = r27 | ||
336 : | dev2 = r28 | ||
337 : | dev3 = r29 | ||
338 : | |||
339 : | ia64p | 205 | .body |
340 : | ia64p | 230 | alloc pfs = ar.pfs, 2, 38, 0, 40 |
341 : | |||
342 : | mov cura0 = in0 | ||
343 : | mov stride = in1 | ||
344 : | add cura1 = 8, cura0 | ||
345 : | |||
346 : | .rotr c[32], psad[8] // just using rotating registers to get an array ;-) | ||
347 : | |||
348 : | .explicit | ||
349 : | {.mmi | ||
350 : | ld8 c[0] = [cura0], stride // load 16 rows, two quadwords per row, into c[0..31] | ||
351 : | ld8 c[1] = [cura1], stride | ||
352 : | ;; | ||
353 : | } | ||
354 : | {.mmi | ||
355 : | ld8 c[2] = [cura0], stride | ||
356 : | ld8 c[3] = [cura1], stride | ||
357 : | ;; | ||
358 : | } | ||
359 : | {.mmi | ||
360 : | ld8 c[4] = [cura0], stride | ||
361 : | ld8 c[5] = [cura1], stride | ||
362 : | ia64p | 205 | ;; |
363 : | ia64p | 230 | } |
364 : | {.mmi | ||
365 : | ld8 c[6] = [cura0], stride | ||
366 : | ld8 c[7] = [cura1], stride | ||
367 : | ia64p | 205 | ;; |
368 : | ia64p | 230 | } |
369 : | {.mmi | ||
370 : | ld8 c[8] = [cura0], stride | ||
371 : | ld8 c[9] = [cura1], stride | ||
372 : | ia64p | 205 | ;; |
373 : | ia64p | 230 | } |
374 : | {.mmi | ||
375 : | ld8 c[10] = [cura0], stride | ||
376 : | ld8 c[11] = [cura1], stride | ||
377 : | ia64p | 205 | ;; |
378 : | ia64p | 230 | } |
379 : | {.mii | ||
380 : | ld8 c[12] = [cura0], stride | ||
381 : | psad1 mean0 = c[0], r0 // psad1 against r0 (zero) yields the sum of the 8 bytes | ||
382 : | psad1 mean1 = c[1], r0 | ||
383 : | } | ||
384 : | {.mmi | ||
385 : | ld8 c[13] = [cura1], stride | ||
386 : | ;; | ||
387 : | ld8 c[14] = [cura0], stride | ||
388 : | psad1 mean2 = c[2], r0 | ||
389 : | } | ||
390 : | {.mii | ||
391 : | ld8 c[15] = [cura1], stride | ||
392 : | psad1 mean3 = c[3], r0 | ||
393 : | ;; | ||
394 : | psad1 psad[0] = c[4], r0 | ||
395 : | } | ||
396 : | {.mmi | ||
397 : | ld8 c[16] = [cura0], stride | ||
398 : | ld8 c[17] = [cura1], stride | ||
399 : | psad1 psad[1] = c[5], r0 | ||
400 : | ia64p | 205 | ;; |
401 : | ia64p | 230 | } |
402 : | {.mii | ||
403 : | ld8 c[18] = [cura0], stride | ||
404 : | psad1 psad[2] = c[6], r0 | ||
405 : | psad1 psad[3] = c[7], r0 | ||
406 : | } | ||
407 : | {.mmi | ||
408 : | ld8 c[19] = [cura1], stride | ||
409 : | ;; | ||
410 : | ld8 c[20] = [cura0], stride | ||
411 : | psad1 psad[4] = c[8], r0 | ||
412 : | } | ||
413 : | {.mii | ||
414 : | ld8 c[21] = [cura1], stride | ||
415 : | psad1 psad[5] = c[9], r0 | ||
416 : | ia64p | 205 | ;; |
417 : | ia64p | 230 | add mean0 = mean0, psad[0] |
418 : | } | ||
419 : | {.mmi | ||
420 : | ld8 c[22] = [cura0], stride | ||
421 : | ld8 c[23] = [cura1], stride | ||
422 : | add mean1 = mean1, psad[1] | ||
423 : | ;; | ||
424 : | } | ||
425 : | {.mii | ||
426 : | ld8 c[24] = [cura0], stride | ||
427 : | psad1 psad[0] = c[10], r0 | ||
428 : | psad1 psad[1] = c[11], r0 | ||
429 : | } | ||
430 : | {.mmi | ||
431 : | ld8 c[25] = [cura1], stride | ||
432 : | ;; | ||
433 : | ld8 c[26] = [cura0], stride | ||
434 : | add mean2 = mean2, psad[2] | ||
435 : | } | ||
436 : | {.mii | ||
437 : | ld8 c[27] = [cura1], stride | ||
438 : | add mean3 = mean3, psad[3] | ||
439 : | ;; | ||
440 : | psad1 psad[2] = c[12], r0 | ||
441 : | } | ||
442 : | {.mmi | ||
443 : | ld8 c[28] = [cura0], stride | ||
444 : | ld8 c[29] = [cura1], stride | ||
445 : | psad1 psad[3] = c[13], r0 | ||
446 : | ;; | ||
447 : | } | ||
448 : | {.mii | ||
449 : | ld8 c[30] = [cura0] | ||
450 : | psad1 psad[6] = c[14], r0 | ||
451 : | psad1 psad[7] = c[15], r0 | ||
452 : | } | ||
453 : | {.mmi | ||
454 : | ld8 c[31] = [cura1] | ||
455 : | ;; | ||
456 : | add mean0 = mean0, psad[0] | ||
457 : | add mean1 = mean1, psad[1] | ||
458 : | } | ||
459 : | {.mii | ||
460 : | add mean2 = mean2, psad[4] | ||
461 : | add mean3 = mean3, psad[5] | ||
462 : | ia64p | 205 | ;; |
463 : | ia64p | 230 | psad1 psad[0] = c[16], r0 |
464 : | } | ||
465 : | {.mmi | ||
466 : | add mean0 = mean0, psad[2] | ||
467 : | add mean1 = mean1, psad[3] | ||
468 : | psad1 psad[1] = c[17], r0 | ||
469 : | ia64p | 205 | ;; |
470 : | ia64p | 230 | } |
471 : | {.mii | ||
472 : | add mean2 = mean2, psad[6] | ||
473 : | psad1 psad[2] = c[18], r0 | ||
474 : | psad1 psad[3] = c[19], r0 | ||
475 : | } | ||
476 : | {.mmi | ||
477 : | add mean3 = mean3, psad[7] | ||
478 : | ;; | ||
479 : | add mean0 = mean0, psad[0] | ||
480 : | psad1 psad[4] = c[20], r0 | ||
481 : | } | ||
482 : | {.mii | ||
483 : | add mean1 = mean1, psad[1] | ||
484 : | psad1 psad[5] = c[21], r0 | ||
485 : | ia64p | 205 | ;; |
486 : | ia64p | 230 | psad1 psad[6] = c[22], r0 |
487 : | } | ||
488 : | {.mmi | ||
489 : | add mean2 = mean2, psad[2] | ||
490 : | add mean3 = mean3, psad[3] | ||
491 : | psad1 psad[7] = c[23], r0 | ||
492 : | ia64p | 205 | ;; |
493 : | ia64p | 230 | } |
494 : | {.mii | ||
495 : | add mean0 = mean0, psad[4] | ||
496 : | psad1 psad[0] = c[24], r0 | ||
497 : | psad1 psad[1] = c[25], r0 | ||
498 : | } | ||
499 : | {.mmi | ||
500 : | add mean1 = mean1, psad[5] | ||
501 : | ia64p | 205 | ;; |
502 : | ia64p | 230 | add mean2 = mean2, psad[6] |
503 : | psad1 psad[2] = c[26], r0 | ||
504 : | } | ||
505 : | {.mii | ||
506 : | add mean3 = mean3, psad[7] | ||
507 : | psad1 psad[3] = c[27], r0 | ||
508 : | ;; | ||
509 : | psad1 psad[4] = c[28], r0 | ||
510 : | } | ||
511 : | {.mmi | ||
512 : | add mean0 = mean0, psad[0] | ||
513 : | add mean1 = mean1, psad[1] | ||
514 : | psad1 psad[5] = c[29], r0 | ||
515 : | ia64p | 205 | ;; |
516 : | ia64p | 230 | } |
517 : | {.mii | ||
518 : | add mean2 = mean2, psad[2] | ||
519 : | psad1 psad[6] = c[30], r0 | ||
520 : | psad1 psad[7] = c[31], r0 | ||
521 : | } | ||
522 : | {.mmi | ||
523 : | add mean3 = mean3, psad[3] | ||
524 : | ia64p | 205 | ;; |
525 : | ia64p | 230 | add mean0 = mean0, psad[4] |
526 : | add mean1 = mean1, psad[5] | ||
527 : | } | ||
// fold the four partial sums into mean0
528 : | {.mbb | ||
529 : | add mean2 = mean2, mean3 | ||
530 : | nop.b 1 | ||
531 : | nop.b 1 | ||
532 : | ia64p | 205 | ;; |
533 : | ia64p | 230 | } |
534 : | {.mib | ||
535 : | add mean0 = mean0, psad[6] | ||
536 : | add mean1 = mean1, psad[7] | ||
537 : | nop.b 1 | ||
538 : | ia64p | 205 | ;; |
539 : | ia64p | 230 | } |
540 : | {.mib | ||
541 : | add mean0 = mean0, mean1 | ||
542 : | // add mean2 = 127, mean2 // this could make our division more exact, but does not help much | ||
543 : | ia64p | 205 | ;; |
544 : | ia64p | 230 | } |
545 : | {.mib | ||
546 : | add mean0 = mean0, mean2 | ||
547 : | ia64p | 205 | ;; |
548 : | ia64p | 230 | } |
549 : | |||
550 : | {.mib | ||
551 : | shr.u mean0 = mean0, 8 // divide the total by 256 (the number of bytes) to get the mean | ||
552 : | ia64p | 205 | ;; |
553 : | ia64p | 230 | } |
// replicate the mean byte into all 8 byte lanes so psad1 can compare 8 bytes at once
554 : | {.mib | ||
555 : | mux1 mean0 = mean0, @brcst | ||
556 : | ;; | ||
557 : | } | ||
558 : | {.mii | ||
559 : | nop.m 0 | ||
560 : | psad1 dev0 = c[0], mean0 // sum of absolute differences between each byte and the mean | ||
561 : | psad1 dev1 = c[1], mean0 | ||
562 : | } | ||
563 : | {.mii | ||
564 : | nop.m 0 | ||
565 : | psad1 dev2 = c[2], mean0 | ||
566 : | psad1 dev3 = c[3], mean0 | ||
567 : | } | ||
568 : | {.mii | ||
569 : | nop.m 0 | ||
570 : | psad1 psad[0] = c[4], mean0 | ||
571 : | psad1 psad[1] = c[5], mean0 | ||
572 : | } | ||
573 : | {.mii | ||
574 : | nop.m 0 | ||
575 : | psad1 psad[2] = c[6], mean0 | ||
576 : | psad1 psad[3] = c[7], mean0 | ||
577 : | } | ||
578 : | {.mii | ||
579 : | nop.m 0 | ||
580 : | psad1 psad[4] = c[8], mean0 | ||
581 : | psad1 psad[5] = c[9], mean0 | ||
582 : | ;; | ||
583 : | } | ||
584 : | {.mii | ||
585 : | add dev0 = dev0, psad[0] | ||
586 : | psad1 psad[6] = c[10], mean0 | ||
587 : | psad1 psad[7] = c[11], mean0 | ||
588 : | } | ||
589 : | {.mmi | ||
590 : | add dev1 = dev1, psad[1] | ||

592 : | add dev2 = dev2, psad[2] | ||
593 : | psad1 psad[0] = c[12], mean0 | ||
594 : | } | ||
595 : | {.mii | ||
596 : | add dev3 = dev3, psad[3] | ||
597 : | psad1 psad[1] = c[13], mean0 | ||
598 : | ;; | ||
599 : | psad1 psad[2] = c[14], mean0 | ||
600 : | } | ||
601 : | {.mmi | ||
602 : | add dev0 = dev0, psad[4] | ||
603 : | add dev1 = dev1, psad[5] | ||
604 : | psad1 psad[3] = c[15], mean0 | ||
605 : | } | ||
606 : | {.mii | ||
607 : | add dev2 = dev2, psad[6] | ||
608 : | psad1 psad[4] = c[16], mean0 | ||
609 : | psad1 psad[5] = c[17], mean0 | ||
610 : | } | ||
611 : | {.mmi | ||
612 : | add dev3 = dev3, psad[7] | ||
613 : | ;; | ||
614 : | add dev0 = dev0, psad[0] | ||
615 : | psad1 psad[6] = c[18], mean0 | ||
616 : | } | ||
617 : | {.mii | ||
618 : | add dev1 = dev1, psad[1] | ||
619 : | psad1 psad[7] = c[19], mean0 | ||

621 : | psad1 psad[0] = c[20], mean0 | ||
622 : | } | ||
623 : | {.mmi | ||
624 : | add dev2 = dev2, psad[2] | ||
625 : | add dev3 = dev3, psad[3] | ||
626 : | psad1 psad[1] = c[21], mean0 | ||
627 : | ia64p | 205 | ;; |
628 : | ia64p | 230 | } |
629 : | {.mii | ||
630 : | add dev0 = dev0, psad[4] | ||
631 : | psad1 psad[2] = c[22], mean0 | ||
632 : | psad1 psad[3] = c[23], mean0 | ||
633 : | } | ||
634 : | {.mmi | ||
635 : | add dev1 = dev1, psad[5] | ||

637 : | add dev2 = dev2, psad[6] | ||
638 : | psad1 psad[4] = c[24], mean0 | ||
639 : | } | ||
640 : | {.mii | ||
641 : | add dev3 = dev3, psad[7] | ||
642 : | psad1 psad[5] = c[25], mean0 | ||
643 : | ;; | ||
644 : | psad1 psad[6] = c[26], mean0 | ||
645 : | } | ||
646 : | {.mmi | ||
647 : | add dev0 = dev0, psad[0] | ||
648 : | add dev1 = dev1, psad[1] | ||
649 : | psad1 psad[7] = c[27], mean0 | ||
650 : | } | ||
651 : | {.mii | ||
652 : | add dev2 = dev2, psad[2] | ||
653 : | psad1 psad[0] = c[28], mean0 | ||
654 : | psad1 psad[1] = c[29], mean0 | ||
655 : | } | ||
656 : | {.mmi | ||
657 : | add dev3 = dev3, psad[3] | ||
658 : | ia64p | 205 | ;; |
659 : | ia64p | 230 | add dev0 = dev0, psad[4] |
660 : | psad1 psad[2] = c[30], mean0 | ||
661 : | } | ||
662 : | {.mii | ||
663 : | add dev1 = dev1, psad[5] | ||
664 : | psad1 psad[3] = c[31], mean0 | ||
665 : | ;; | ||
666 : | add dev2 = dev2, psad[6] | ||
667 : | } | ||
668 : | {.mmi | ||
669 : | add dev3 = dev3, psad[7] | ||
670 : | add dev0 = dev0, psad[0] | ||
671 : | add dev1 = dev1, psad[1] | ||
672 : | ia64p | 205 | ;; |
673 : | ia64p | 230 | } |
// fold the four partial deviations into the return register
674 : | {.mii | ||
675 : | add dev2 = dev2, psad[2] | ||
676 : | add dev3 = dev3, psad[3] | ||
677 : | add ret0 = dev0, dev1 | ||
678 : | ;; | ||
679 : | } | ||
680 : | {.mib | ||
681 : | add dev2 = dev2, dev3 | ||
682 : | nop.i 1 | ||
683 : | nop.b 1 | ||
684 : | ;; | ||
685 : | } | ||
686 : | {.mib | ||
687 : | add ret0 = ret0, dev2 | ||
688 : | nop.i 1 | ||
689 : | ia64p | 205 | br.ret.sptk.many b0 |
690 : | ia64p | 230 | } |
691 : | ia64p | 205 | .endp dev16_ia64# |
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |