// ------------------------------------------------------------------------------
// * Programmed by
// * Johannes Singler (email@jsingler.de), Daniel Winkler (infostudent@uni.de)
// *
// * Programmed for the IA64 laboratory held at the University of Karlsruhe, 2002
// * http://www.info.uni-karlsruhe.de/~rubino/ia64p/
// *
// ------------------------------------------------------------------------------
// *
// * This is the optimized assembly version of Halfpel8_Refine. This function
// * is worth optimizing for the IA-64 architecture because of the huge
// * register set. We can hold all necessary data in general-purpose registers
// * and reuse it.
// *
// * Our approach uses:
// * - The Itanium instruction psad1, which computes the sum of absolute
// *   differences of eight bytes in hardware.
// * - Alignment resolving to avoid memory faults
// * - Massive loop unrolling
// *
// ------------------------------------------------------------------------------
// *
// * ------- Half-pixel steps around the center (*) and corresponding
// * |0|1|0| register set parts.
// * -------
// * |2|*|2|
// * -------
// * |0|1|0|
// * -------
// *
// ------------------------------------------------------------------------------
// * calc_delta is split into three parts which are included from
// *
// * calc_delta_1.s
// * calc_delta_2.s
// * calc_delta_3.s
// *
// ------------------------------------------------------------------------------
// * We assume min_dx <= currX <= max_dx && min_dy <= currY <= max_dy
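// ------------------------------------------------------------------------------
// * Rough C-level sketch of the whole routine (illustrative only; names follow
// * the register aliases defined below, and the motion-vector cost details live
// * in the included calc_delta_*.s parts):
// *
// *   for each of the 8 half-pel positions around (currX, currY) that lies
// *   inside [min_dx..max_dx] x [min_dy..max_dy]:
// *       sad = SAD_8x8(cur, interpolated ref at that position) + mv_cost(dx, dy)
// *       if (sad < iMinSAD) { iMinSAD = sad; (currX, currY) = that position; }
// *   write (currX, currY) back to *currMV and return iMinSAD
// ------------------------------------------------------------------------------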
.sdata
.align 4
.type lambda_vec8#,@object
.size lambda_vec8#,128
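// lambda_vec8[quant] is presumably the Lagrange weight that trades motion-vector
// rate against SAD for 8x8 blocks; below it is indexed with iQuant, doubled and
// kept in fQuant for the calc_delta_*.s parts.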
lambda_vec8:
data4 0
data4 1
data4 1
data4 1
data4 1
data4 2
data4 2
data4 2
data4 2
data4 3
data4 3
data4 3
data4 4
data4 4
data4 4
data4 5
data4 5
data4 6
data4 7
data4 7
data4 8
data4 9
data4 10
data4 11
data4 13
data4 14
data4 16
data4 18
data4 21
data4 25
data4 30
data4 36
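// mvtab presumably holds the bit lengths of the motion-vector VLC codes. It is
// not referenced directly in this file, so the lookup most likely happens in the
// included calc_delta_*.s parts.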
.type mvtab#,@object
.size mvtab#,132
mvtab:
data4 1
data4 2
data4 3
data4 4
data4 6
data4 7
data4 7
data4 7
data4 9
data4 9
data4 9
data4 10
data4 10
data4 10
data4 10
data4 10
data4 10
data4 10
data4 10
data4 10
data4 10
data4 10
data4 10
data4 10
data4 10
data4 11
data4 11
data4 11
data4 11
data4 11
data4 11
data4 12
data4 12
.text
.align 16
.global Halfpel8_Refine_ia64#
.proc Halfpel8_Refine_ia64#

Halfpel8_Refine_ia64:

pfs = r14
prsave = r15

// Save important registers

alloc pfs = ar.pfs, 17, 75, 4, 96
mov prsave = pr

// Naming registers for better readability

pRef = in0
pRefH = in1
pRefV = in2
pRefHV = in3
cura = in4
x = in5
y = in6
currMV = in7
iMinSAD = in8
pmv = in9
min_dx = in10
max_dx = in11
min_dy = in12
max_dy = in13
iFcode = in14
iQuant = in15
iEdgedWidth = in16

iSAD = r17
backupX = r18
backupY = r19
currX = r20
currY = r21
currYAddress = r22
bitX0 = r23
bitY0 = r24
dxd2 = r25
dyd2 = r26
offset = r27
block = r28
nob02 = r29
nob1 = r30
nob64m02 = r31
nob64m1 = r127
const7 = r126
nob56m02 = r125
dx = r124
dy = r123
oldX = r122
oldY = r121

.rotr inregisters[17], refaa[3], refab[3], cur[8], ref0a[9], ref0b[9], ref1a[9], mpr[9], ref2a[8], ref2b[8], component[2], sc[2], tabaddress[2]
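// cur[] holds the 8 rows of the current 8x8 block. ref0a[]/ref0b[], ref1a[] and
// ref2a[]/ref2b[] hold the realigned reference rows for register set parts 0
// (corners), 1 (top/bottom) and 2 (left/right) of the diagram above; nine rows
// are kept where the bottom candidates need the row below. mpr[] is scratch,
// reused for the alignment fix-up and for the psad1 row sums.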
fx = f8
fy = f9
fblock = f10
fiEdgedWidth = f11
fdxd2 = f12
fdyd2 = f13
foffset = f14
fydiEdgedWidth = f15
fQuant = f16
fmv = f17

n = p16
h = p17
v = p18
hv = p19
l = p20
r = p21
t = p22
b = p23
lt = p24
lb = p25
rt = p26
rb = p27
fb = p28
non0_0 = p30
non0_1 = p31
non0_2 = p32
non0_3 = p33
neg_0 = p34
neg_1 = p35
neg_2 = p36
neg_3 = p37
cg32_0 = p29
cg32_1 = p38

// Initialize input variables

add sp = 16, sp
;;
ld4 iMinSAD = [sp], 8
;;
sxt4 iMinSAD = iMinSAD
ld8 pmv = [sp], 8
;;
ld4 min_dx = [sp], 8
;;
sxt4 min_dx = min_dx

ld4 max_dx = [sp], 8
;;
sxt4 max_dx = max_dx

ld4 min_dy = [sp], 8
;;
sxt4 min_dy = min_dy

ld4 max_dy = [sp], 8
;;
sxt4 max_dy = max_dy

ld4 iFcode = [sp], 8
;;
sxt4 iFcode = iFcode

ld4 iQuant = [sp], 8

add tabaddress[0] = @gprel(lambda_vec8#), gp
;;
shladd tabaddress[0] = iQuant, 2, tabaddress[0]
;;
ld4 iQuant = [tabaddress[0]]
;;
sxt4 iQuant = iQuant
;;
add iFcode = -1, iFcode // only used in decreased version
shl iQuant = iQuant, 1
;;
setf.sig fQuant = iQuant

ld4 iEdgedWidth = [sp]
add sp = -80, sp


// Initialize local variables

ld4 currX = [currMV]
add currYAddress = 4, currMV
;;
sxt4 currX = currX
ld4 currY = [currYAddress]
;;
sxt4 currY = currY
;;
// Calculate references

cmp.gt l, p0 = currX, min_dx
cmp.lt r, p0 = currX, max_dx
cmp.gt t, p0 = currY, min_dy
cmp.lt b, p0 = currY, max_dy
add backupX = -1, currX // move to the upper-left corner of the candidate square
add backupY = -1, currY

;;
(b) cmp.gt.unc lb, p0 = currX, min_dx
(t) cmp.lt.unc rt, p0 = currX, max_dx
(l) cmp.gt.unc lt, p0 = currY, min_dy
(r) cmp.lt.unc rb, p0 = currY, max_dy

and bitX0 = 1, backupX
and bitY0 = 1, backupY
;;
cmp.eq n, p0 = 0, bitX0
cmp.eq h, p0 = 1, bitX0
cmp.eq v, p0 = 0, bitX0
cmp.eq hv, p0 = 1, bitX0
;;
cmp.eq.and n, p0 = 0, bitY0
cmp.eq.and h, p0 = 0, bitY0
cmp.eq.and v, p0 = 1, bitY0
cmp.eq.and hv, p0 = 1, bitY0
;;
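// n/h/v/hv pick, from the parity of (backupX, backupY), which of the four
// interpolated planes (pRef, pRefH, pRefV, pRefHV) supplies register set part 0
// (refaa[0], corners), part 1 (refaa[1], top/bottom) and part 2 (refaa[2],
// left/right) of the diagram above.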
.pred.rel "mutex", p16, p17, p18, p19 // n, h, v, hv
(n) mov refaa[0] = pRef
(h) mov refaa[0] = pRefH
(v) mov refaa[0] = pRefV
(hv) mov refaa[0] = pRefHV

(n) mov refaa[1] = pRefH
(h) mov refaa[1] = pRef
(v) mov refaa[1] = pRefHV
(hv) mov refaa[1] = pRefV

(n) mov refaa[2] = pRefV
(h) mov refaa[2] = pRefHV
(v) mov refaa[2] = pRef
(hv) mov refaa[2] = pRefH


// Calculate offset (integer multiplication on IA-64 sucks!)
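// In C terms (illustrative sketch):
//   offset = x*8 + (backupX >> 1) + (y*8 + (backupY >> 1)) * iEdgedWidth
// computed via setf.sig / xma.l / getf.sig, since a plain integer multiply
// would be slower here.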
mov block = 8

shr dxd2 = backupX, 1
shr dyd2 = backupY, 1

setf.sig fx = x
setf.sig fy = y
;;
setf.sig fblock = block
setf.sig fiEdgedWidth = iEdgedWidth
;;
setf.sig fdxd2 = dxd2
setf.sig fdyd2 = dyd2
;;
xma.l foffset = fx, fblock, fdxd2
xma.l fydiEdgedWidth = fy, fblock, fdyd2
;;
xma.l foffset = fydiEdgedWidth, fiEdgedWidth, foffset
;;
getf.sig offset = foffset
;;
add refaa[0] = refaa[0], offset
add refaa[1] = refaa[1], offset
add refaa[2] = refaa[2], offset
;;
(h) add refaa[1] = 1, refaa[1]
(hv) add refaa[1] = 1, refaa[1]
(v) add refaa[2] = iEdgedWidth, refaa[2]
(hv) add refaa[2] = iEdgedWidth, refaa[2]

// Load respecting misalignment of refx...
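// Each (possibly unaligned) 8-byte reference row is rebuilt from two aligned
// loads. Sketch in C (little-endian; nob = (addr & 7) * 8):
//   row = (*(uint64_t *)(addr & ~7) >> nob)
//       | (*(uint64_t *)((addr & ~7) + 8) << (64 - nob));
// For nob == 0 the second shift count is 64, which the IA-64 variable shift
// turns into zero, so no special case is needed (in plain C this would be
// undefined behaviour).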
mov const7 = 7
;;
dep.z nob02 = refaa[0], 3, 3
dep.z nob1 = refaa[1], 3, 3
;;
andcm refaa[0] = refaa[0], const7 // set last 3 bits = 0
andcm refaa[1] = refaa[1], const7
andcm refaa[2] = refaa[2], const7
;;
add refab[0] = 8, refaa[0]
add refab[1] = 8, refaa[1]
add refab[2] = 8, refaa[2]
;;
ld8 cur[0] = [cura], iEdgedWidth
ld8 ref0a[0] = [refaa[0]], iEdgedWidth
sub nob64m02 = 64, nob02 // 64 - nob

ld8 ref0b[0] = [refab[0]], iEdgedWidth
ld8 ref1a[0] = [refaa[1]], iEdgedWidth
sub nob56m02 = 56, nob02 // 56 - nob

ld8 mpr[0] = [refab[1]], iEdgedWidth
ld8 ref2a[0] = [refaa[2]], iEdgedWidth
sub nob64m1 = 64, nob1

ld8 ref2b[0] = [refab[2]], iEdgedWidth
;;
ld8 cur[1] = [cura], iEdgedWidth
ld8 ref0a[1] = [refaa[0]], iEdgedWidth
ld8 ref0b[1] = [refab[0]], iEdgedWidth
ld8 ref1a[1] = [refaa[1]], iEdgedWidth
ld8 mpr[1] = [refab[1]], iEdgedWidth
ld8 ref2a[1] = [refaa[2]], iEdgedWidth
ld8 ref2b[1] = [refab[2]], iEdgedWidth
;;
ld8 cur[2] = [cura], iEdgedWidth
ld8 ref0a[2] = [refaa[0]], iEdgedWidth
ld8 ref0b[2] = [refab[0]], iEdgedWidth
ld8 ref1a[2] = [refaa[1]], iEdgedWidth
ld8 mpr[2] = [refab[1]], iEdgedWidth
ld8 ref2a[2] = [refaa[2]], iEdgedWidth
ld8 ref2b[2] = [refab[2]], iEdgedWidth
;;
ld8 cur[3] = [cura], iEdgedWidth
ld8 ref0a[3] = [refaa[0]], iEdgedWidth
ld8 ref0b[3] = [refab[0]], iEdgedWidth
ld8 ref1a[3] = [refaa[1]], iEdgedWidth
ld8 mpr[3] = [refab[1]], iEdgedWidth
ld8 ref2a[3] = [refaa[2]], iEdgedWidth
ld8 ref2b[3] = [refab[2]], iEdgedWidth
;;
ld8 cur[4] = [cura], iEdgedWidth
ld8 ref0a[4] = [refaa[0]], iEdgedWidth
ld8 ref0b[4] = [refab[0]], iEdgedWidth
ld8 ref1a[4] = [refaa[1]], iEdgedWidth
ld8 mpr[4] = [refab[1]], iEdgedWidth
ld8 ref2a[4] = [refaa[2]], iEdgedWidth
ld8 ref2b[4] = [refab[2]], iEdgedWidth
;;
ld8 cur[5] = [cura], iEdgedWidth
ld8 ref0a[5] = [refaa[0]], iEdgedWidth
ld8 ref0b[5] = [refab[0]], iEdgedWidth
ld8 ref1a[5] = [refaa[1]], iEdgedWidth
ld8 mpr[5] = [refab[1]], iEdgedWidth
ld8 ref2a[5] = [refaa[2]], iEdgedWidth
ld8 ref2b[5] = [refab[2]], iEdgedWidth
;;
ld8 cur[6] = [cura], iEdgedWidth
ld8 ref0a[6] = [refaa[0]], iEdgedWidth
ld8 ref0b[6] = [refab[0]], iEdgedWidth
ld8 ref1a[6] = [refaa[1]], iEdgedWidth
ld8 mpr[6] = [refab[1]], iEdgedWidth
ld8 ref2a[6] = [refaa[2]], iEdgedWidth
ld8 ref2b[6] = [refab[2]], iEdgedWidth
;;
ld8 cur[7] = [cura]
ld8 ref0a[7] = [refaa[0]], iEdgedWidth
ld8 ref0b[7] = [refab[0]], iEdgedWidth
ld8 ref1a[7] = [refaa[1]], iEdgedWidth
ld8 mpr[7] = [refab[1]], iEdgedWidth
ld8 ref2a[7] = [refaa[2]]
ld8 ref2b[7] = [refab[2]]
;;
ld8 ref0a[8] = [refaa[0]]
ld8 ref0b[8] = [refab[0]]
ld8 ref1a[8] = [refaa[1]]
ld8 mpr[8] = [refab[1]]
;;


// Align ref1

shr.u ref1a[0] = ref1a[0], nob1
shr.u ref1a[1] = ref1a[1], nob1
shr.u ref1a[2] = ref1a[2], nob1
shr.u ref1a[3] = ref1a[3], nob1
shr.u ref1a[4] = ref1a[4], nob1
shr.u ref1a[5] = ref1a[5], nob1
shr.u ref1a[6] = ref1a[6], nob1
shr.u ref1a[7] = ref1a[7], nob1
shr.u ref1a[8] = ref1a[8], nob1

shl mpr[0] = mpr[0], nob64m1
shl mpr[1] = mpr[1], nob64m1
shl mpr[2] = mpr[2], nob64m1
shl mpr[3] = mpr[3], nob64m1
shl mpr[4] = mpr[4], nob64m1
shl mpr[5] = mpr[5], nob64m1
shl mpr[6] = mpr[6], nob64m1
shl mpr[7] = mpr[7], nob64m1
shl mpr[8] = mpr[8], nob64m1
;;
.explicit
{.mii
or ref1a[0] = ref1a[0], mpr[0]
shr.u ref0a[0] = ref0a[0], nob02
shr.u ref0a[1] = ref0a[1], nob02
}
{.mmi
or ref1a[1] = ref1a[1], mpr[1]
or ref1a[2] = ref1a[2], mpr[2]
shr.u ref0a[2] = ref0a[2], nob02
}
{.mii
or ref1a[3] = ref1a[3], mpr[3]
shr.u ref0a[3] = ref0a[3], nob02
shr.u ref0a[4] = ref0a[4], nob02
}
{.mmi
or ref1a[4] = ref1a[4], mpr[4]
or ref1a[5] = ref1a[5], mpr[5]
shr.u ref0a[5] = ref0a[5], nob02
}
{.mii
or ref1a[6] = ref1a[6], mpr[6]
shr.u ref0a[6] = ref0a[6], nob02
shr.u ref0a[7] = ref0a[7], nob02
}
{.mii
or ref1a[7] = ref1a[7], mpr[7]
or ref1a[8] = ref1a[8], mpr[8]
shr.u ref0a[8] = ref0a[8], nob02
}
.default
// ref1a[] now contains center position values
// mpr[] not used any more

// Align ref0 left

;;
shl mpr[0] = ref0b[0], nob56m02
shl mpr[1] = ref0b[1], nob56m02
shl mpr[2] = ref0b[2], nob56m02
shl mpr[3] = ref0b[3], nob56m02
shl mpr[4] = ref0b[4], nob56m02
shl mpr[5] = ref0b[5], nob56m02
shl mpr[6] = ref0b[6], nob56m02
shl mpr[7] = ref0b[7], nob56m02
shl mpr[8] = ref0b[8], nob56m02

shl ref0b[0] = ref0b[0], nob64m02
shl ref0b[1] = ref0b[1], nob64m02
shl ref0b[2] = ref0b[2], nob64m02
shl ref0b[3] = ref0b[3], nob64m02
shl ref0b[4] = ref0b[4], nob64m02
shl ref0b[5] = ref0b[5], nob64m02
shl ref0b[6] = ref0b[6], nob64m02
shl ref0b[7] = ref0b[7], nob64m02
shl ref0b[8] = ref0b[8], nob64m02
;;
or ref0a[0] = ref0a[0], ref0b[0]
or ref0a[1] = ref0a[1], ref0b[1]
or ref0a[2] = ref0a[2], ref0b[2]
or ref0a[3] = ref0a[3], ref0b[3]
or ref0a[4] = ref0a[4], ref0b[4]
or ref0a[5] = ref0a[5], ref0b[5]
or ref0a[6] = ref0a[6], ref0b[6]
or ref0a[7] = ref0a[7], ref0b[7]
or ref0a[8] = ref0a[8], ref0b[8]
;;

// ref0a[] now contains left position values
// mpr[] contains intermediate result for right position values (the original ref0b << (56 - nob02))

// Align ref0 right

// Shift one byte further, i.e. one pixel to the right (a right shift, since the bytes are little-endian)
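// Sketch: right_row = (left_row >> 8) | (original ref0b << (56 - nob02)).
// The shifted second load contributes the missing top byte; its lower bytes
// coincide with what is already in place, so the or is harmless and no further
// loads are needed.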
shr.u ref0b[0] = ref0a[0], 8
shr.u ref0b[1] = ref0a[1], 8
shr.u ref0b[2] = ref0a[2], 8
shr.u ref0b[3] = ref0a[3], 8
shr.u ref0b[4] = ref0a[4], 8
shr.u ref0b[5] = ref0a[5], 8
shr.u ref0b[6] = ref0a[6], 8
shr.u ref0b[7] = ref0a[7], 8
shr.u ref0b[8] = ref0a[8], 8
;;
.explicit
{.mii
or ref0b[0] = ref0b[0], mpr[0]
shr.u ref2a[0] = ref2a[0], nob02
shr.u ref2a[1] = ref2a[1], nob02
}
{.mmi
or ref0b[1] = ref0b[1], mpr[1]
or ref0b[2] = ref0b[2], mpr[2]
shr.u ref2a[2] = ref2a[2], nob02
}
{.mii
or ref0b[3] = ref0b[3], mpr[3]
shr.u ref2a[3] = ref2a[3], nob02
shr.u ref2a[4] = ref2a[4], nob02
}
{.mmi
or ref0b[4] = ref0b[4], mpr[4]
or ref0b[5] = ref0b[5], mpr[5]
shr.u ref2a[5] = ref2a[5], nob02
}
{.mii
or ref0b[6] = ref0b[6], mpr[6]
shr.u ref2a[6] = ref2a[6], nob02
shr.u ref2a[7] = ref2a[7], nob02
}
.default
or ref0b[7] = ref0b[7], mpr[7]
or ref0b[8] = ref0b[8], mpr[8]

// ref0b[] now contains right position values
// mpr[] not needed any more


// Align ref2 left

;;
shl mpr[0] = ref2b[0], nob56m02
shl mpr[1] = ref2b[1], nob56m02
shl mpr[2] = ref2b[2], nob56m02
shl mpr[3] = ref2b[3], nob56m02
shl mpr[4] = ref2b[4], nob56m02
shl mpr[5] = ref2b[5], nob56m02
shl mpr[6] = ref2b[6], nob56m02
shl mpr[7] = ref2b[7], nob56m02

shl ref2b[0] = ref2b[0], nob64m02
shl ref2b[1] = ref2b[1], nob64m02
shl ref2b[2] = ref2b[2], nob64m02
shl ref2b[3] = ref2b[3], nob64m02
shl ref2b[4] = ref2b[4], nob64m02
shl ref2b[5] = ref2b[5], nob64m02
shl ref2b[6] = ref2b[6], nob64m02
shl ref2b[7] = ref2b[7], nob64m02
;;
or ref2a[0] = ref2a[0], ref2b[0]
or ref2a[1] = ref2a[1], ref2b[1]
or ref2a[2] = ref2a[2], ref2b[2]
or ref2a[3] = ref2a[3], ref2b[3]
or ref2a[4] = ref2a[4], ref2b[4]
or ref2a[5] = ref2a[5], ref2b[5]
or ref2a[6] = ref2a[6], ref2b[6]
or ref2a[7] = ref2a[7], ref2b[7]
;;

// ref2a[] now contains left position values
// mpr[] contains intermediate result for right position values (the original ref2b << (56 - nob02))

// Align ref2 right

// Shift one byte further, i.e. one pixel to the right (a right shift, since the bytes are little-endian)
shr.u ref2b[0] = ref2a[0], 8
shr.u ref2b[1] = ref2a[1], 8
shr.u ref2b[2] = ref2a[2], 8
shr.u ref2b[3] = ref2a[3], 8
shr.u ref2b[4] = ref2a[4], 8
shr.u ref2b[5] = ref2a[5], 8
shr.u ref2b[6] = ref2a[6], 8
shr.u ref2b[7] = ref2a[7], 8
;;
or ref2b[0] = ref2b[0], mpr[0]
or ref2b[1] = ref2b[1], mpr[1]
or ref2b[2] = ref2b[2], mpr[2]
or ref2b[3] = ref2b[3], mpr[3]
or ref2b[4] = ref2b[4], mpr[4]
or ref2b[5] = ref2b[5], mpr[5]
or ref2b[6] = ref2b[6], mpr[6]
or ref2b[7] = ref2b[7], mpr[7]


// ref2b[] now contains right position values
// mpr[] not needed any more


ld4 dx = [pmv], 4 // + sizeof(int)
;;
sxt4 dx = dx
ld4 dy = [pmv]
;;
sxt4 dy = dy
;;


// Let's SAD
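// psad1 computes the sum of absolute differences of the eight unsigned bytes in
// its two source operands, i.e. one full row of the 8x8 SAD per instruction.
// The eight per-row sums are left in mpr[0..7]; reducing them and adding the
// motion-vector cost is done by the included calc_delta_*.s parts.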
// Left top corner

sub dx = backupX, dx
psad1 mpr[0] = cur[0], ref0a[0]
psad1 mpr[1] = cur[1], ref0a[1]

sub dy = backupY, dy
psad1 mpr[2] = cur[2], ref0a[2]
psad1 mpr[3] = cur[3], ref0a[3]
psad1 mpr[4] = cur[4], ref0a[4]
psad1 mpr[5] = cur[5], ref0a[5]
psad1 mpr[6] = cur[6], ref0a[6]
psad1 mpr[7] = cur[7], ref0a[7]
;;
.include "../../src/motion/ia64_asm/calc_delta_1.s"
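// calc_delta_1/2/3.s are not shown in this file; judging by the surrounding
// code they reduce the eight row SADs in mpr[], add the motion-vector cost for
// the current (dx, dy), and leave the total in iSAD (or mpr[8]). The split into
// three parts presumably lets their instructions be interleaved with the psad1
// work of the following candidate.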
// Top edge

psad1 mpr[0] = cur[0], ref1a[0]
psad1 mpr[1] = cur[1], ref1a[1]
psad1 mpr[2] = cur[2], ref1a[2]
psad1 mpr[3] = cur[3], ref1a[3]
psad1 mpr[4] = cur[4], ref1a[4]

add dx = 1, dx
psad1 mpr[5] = cur[5], ref1a[5]
psad1 mpr[6] = cur[6], ref1a[6]

psad1 mpr[7] = cur[7], ref1a[7]
;;

.include "../../src/motion/ia64_asm/calc_delta_2.s"
(lt) cmp.lt.unc fb, p0 = mpr[8], iMinSAD
.include "../../src/motion/ia64_asm/calc_delta_3.s"

// Right top corner

psad1 mpr[0] = cur[0], ref0b[0]
psad1 mpr[1] = cur[1], ref0b[1]
psad1 mpr[2] = cur[2], ref0b[2]
psad1 mpr[3] = cur[3], ref0b[3]
psad1 mpr[4] = cur[4], ref0b[4]

add backupX = 1, backupX
psad1 mpr[5] = cur[5], ref0b[5]
psad1 mpr[6] = cur[6], ref0b[6]

add dx = 1, dx
psad1 mpr[7] = cur[7], ref0b[7]
;;

.include "../../src/motion/ia64_asm/calc_delta_1.s"
(t) cmp.lt.unc fb, p0 = iSAD, iMinSAD
;;

// Left edge
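// The (fb)-predicated moves below update the running minimum for the candidate
// compared just above; comparison and update are interleaved with the psad1
// work of later candidates to hide latency, and backupX/backupY are stepped to
// the matching position just in time.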
(fb) mov iMinSAD = iSAD
psad1 mpr[0] = cur[0], ref2a[0]

(fb) mov currX = backupX
psad1 mpr[1] = cur[1], ref2a[1]
psad1 mpr[2] = cur[2], ref2a[2]

(fb) mov currY = backupY
psad1 mpr[3] = cur[3], ref2a[3]
psad1 mpr[4] = cur[4], ref2a[4]

add backupX = 1, backupX
psad1 mpr[5] = cur[5], ref2a[5]
psad1 mpr[6] = cur[6], ref2a[6]

psad1 mpr[7] = cur[7], ref2a[7]

add dx = -2, dx
add dy = 1, dy
;;

.include "../../src/motion/ia64_asm/calc_delta_2.s"
(rt) cmp.lt.unc fb, p0 = mpr[8], iMinSAD
.include "../../src/motion/ia64_asm/calc_delta_3.s"

// Right edge

psad1 mpr[0] = cur[0], ref2b[0]
psad1 mpr[1] = cur[1], ref2b[1]
psad1 mpr[2] = cur[2], ref2b[2]
psad1 mpr[3] = cur[3], ref2b[3]
psad1 mpr[4] = cur[4], ref2b[4]

add backupX = -2, backupX
psad1 mpr[5] = cur[5], ref2b[5]
psad1 mpr[6] = cur[6], ref2b[6]

add backupY = 1, backupY
add dx = 2, dx
psad1 mpr[7] = cur[7], ref2b[7]
;;

.include "../../src/motion/ia64_asm/calc_delta_1.s"
(l) cmp.lt.unc fb, p0 = iSAD, iMinSAD
;;

// Left bottom corner

(fb) mov iMinSAD = iSAD
psad1 mpr[0] = cur[0], ref0a[1]

(fb) mov currX = backupX
psad1 mpr[1] = cur[1], ref0a[2]
psad1 mpr[2] = cur[2], ref0a[3]

(fb) mov currY = backupY
psad1 mpr[3] = cur[3], ref0a[4]
psad1 mpr[4] = cur[4], ref0a[5]

add backupX = 2, backupX
psad1 mpr[5] = cur[5], ref0a[6]
psad1 mpr[6] = cur[6], ref0a[7]

psad1 mpr[7] = cur[7], ref0a[8]

add dx = -2, dx
add dy = 1, dy
;;

.include "../../src/motion/ia64_asm/calc_delta_2.s"
(r) cmp.lt.unc fb, p0 = mpr[8], iMinSAD
.include "../../src/motion/ia64_asm/calc_delta_3.s"

// Bottom edge

psad1 mpr[0] = cur[0], ref1a[1]
psad1 mpr[1] = cur[1], ref1a[2]
psad1 mpr[2] = cur[2], ref1a[3]
psad1 mpr[3] = cur[3], ref1a[4]
psad1 mpr[4] = cur[4], ref1a[5]

add backupX = -2, backupX
psad1 mpr[5] = cur[5], ref1a[6]
psad1 mpr[6] = cur[6], ref1a[7]

add backupY = 1, backupY
add dx = 1, dx
psad1 mpr[7] = cur[7], ref1a[8]
;;

.include "../../src/motion/ia64_asm/calc_delta_1.s"
(lb) cmp.lt.unc fb, p0 = iSAD, iMinSAD
;;
// Right bottom corner

(fb) mov iMinSAD = iSAD
psad1 mpr[0] = cur[0], ref0b[1]

(fb) mov currX = backupX
psad1 mpr[1] = cur[1], ref0b[2]
psad1 mpr[2] = cur[2], ref0b[3]

(fb) mov currY = backupY
psad1 mpr[3] = cur[3], ref0b[4]
psad1 mpr[4] = cur[4], ref0b[5]

add backupX = 1, backupX
psad1 mpr[5] = cur[5], ref0b[6]
psad1 mpr[6] = cur[6], ref0b[7]

add dx = 1, dx
psad1 mpr[7] = cur[7], ref0b[8]
;;

.include "../../src/motion/ia64_asm/calc_delta_2.s"
(b) cmp.lt.unc fb, p0 = mpr[8], iMinSAD
.include "../../src/motion/ia64_asm/calc_delta_3.s"

(rb) getf.sig ret0 = fmv
add backupX = 1, backupX
;;
(rb) add iSAD = iSAD, ret0
;;
(rb) cmp.lt.unc fb, p0 = iSAD, iMinSAD
;;
(fb) mov iMinSAD = iSAD
(fb) mov currX = backupX
(fb) mov currY = backupY
;;

// Write back result

st4 [currMV] = currX
st4 [currYAddress] = currY
mov ret0 = iMinSAD

// Restore important registers

;;
mov pr = prsave, -1
mov ar.pfs = pfs
br.ret.sptk.many b0

.endp Halfpel8_Refine_ia64#