Parent Directory | Revision Log
Revision 1855 - (view) (download)
1 : | Isibaar | 1855 | // **************************************************************************** |
2 : | // * | ||
3 : | // * XVID MPEG-4 VIDEO CODEC | ||
4 : | // * - IA64 halfpel refinement - | ||
5 : | // * | ||
6 : | // * Copyright(C) 2002 Johannes Singler, Daniel Winkler | ||
7 : | // * | ||
8 : | // * This program is free software; you can redistribute it and/or modify it | ||
9 : | // * under the terms of the GNU General Public License as published by | ||
10 : | // * the Free Software Foundation; either version 2 of the License, or | ||
11 : | // * (at your option) any later version. | ||
12 : | // * | ||
13 : | // * This program is distributed in the hope that it will be useful, | ||
14 : | // * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 : | // * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 : | // * GNU General Public License for more details. | ||
17 : | // * | ||
18 : | // * You should have received a copy of the GNU General Public License | ||
19 : | // * along with this program; if not, write to the Free Software | ||
20 : | // * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
21 : | // * | ||
22 : | // * $Id: halfpel8_refine_ia64.s,v 1.4 2009-02-19 17:07:29 Isibaar Exp $ | ||
23 : | // * | ||
24 : | // ***************************************************************************/ | ||
25 : | // | ||
26 : | // **************************************************************************** | ||
27 : | // * | ||
28 : | // * halfpel8_refine_ia64.s, IA-64 halfpel refinement | ||
29 : | // * | ||
30 : | // * This version was implemented during an IA-64 practical training at | ||
31 : | // * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/) | ||
32 : | // * | ||
33 : | // **************************************************************************** | ||
34 : | |||
35 : | ia64p | 300 | // ------------------------------------------------------------------------------ |
36 : | // * Programmed by | ||
37 : | // * Johannes Singler (email@jsingler.de), Daniel Winkler (infostudent@uni.de) | ||
38 : | // * | ||
39 : | // * Programmed for the IA64 laboratory held at University Karlsruhe 2002 | ||
40 : | // * http://www.info.uni-karlsruhe.de/~rubino/ia64p/ | ||
41 : | // * | ||
42 : | // ------------------------------------------------------------------------------ | ||
43 : | // * | ||
44 : | // * This is the optimized assembler version of Halfpel8_Refine. This function | ||
45 : | // * is worth optimizing for the IA-64 architecture because of the huge | ||
46 : | // * register set. We can hold all necessary data in general use registers | ||
47 : | // * and reuse it. | ||
48 : | // * | ||
49 : | // * Our approach uses: | ||
50 : | // * - The Itanium instruction psad1, which solves the problem in hardware. | ||
51 : | // * - Alignment resolving to avoid memory faults | ||
52 : | // * - Massive loop unrolling | ||
53 : | // * | ||
54 : | // ------------------------------------------------------------------------------ | ||
55 : | // * | ||
56 : | // * ------- Half-pixel steps around the center (*) and corresponding | ||
57 : | // * |0|1|0| register set parts. | ||
58 : | // * ------- | ||
59 : | // * |2|*|2| | ||
60 : | // * ------- | ||
61 : | // * |0|1|0| | ||
62 : | // * ------- | ||
63 : | // * | ||
64 : | // ------------------------------------------------------------------------------ | ||
65 : | // * calc_delta is split up in three parts which are included from | ||
66 : | // * | ||
67 : | // * calc_delta_1.s | ||
68 : | // * calc_delta_2.s | ||
69 : | // * calc_delta_3.s | ||
70 : | // * | ||
71 : | // ------------------------------------------------------------------------------ | ||
72 : | // * We assume min_dx <= currX <= max_dx && min_dy <= currY <= max_dy | ||
73 : | |||
74 : | |||
// lambda_vec8: 32 consecutive 4-byte words (128 bytes, matching .size) in .sdata.
// Halfpel8_Refine_ia64 indexes this table with its iQuant stack argument
// (index scaled by 4) and uses twice the loaded entry as fQuant.
75 : | .sdata | ||
76 : | .align 4 | ||
77 : | .type lambda_vec8#,@object | ||
78 : | .size lambda_vec8#,128 | ||
79 : | lambda_vec8: | ||
80 : | data4 0 | ||
81 : | data4 1 | ||
82 : | data4 1 | ||
83 : | data4 1 | ||
84 : | data4 1 | ||
85 : | data4 2 | ||
86 : | data4 2 | ||
87 : | data4 2 | ||
88 : | data4 2 | ||
89 : | data4 3 | ||
90 : | data4 3 | ||
91 : | data4 3 | ||
92 : | data4 4 | ||
93 : | data4 4 | ||
94 : | data4 4 | ||
95 : | data4 5 | ||
96 : | data4 5 | ||
97 : | data4 6 | ||
98 : | data4 7 | ||
99 : | data4 7 | ||
100 : | data4 8 | ||
101 : | data4 9 | ||
102 : | data4 10 | ||
103 : | data4 11 | ||
104 : | data4 13 | ||
105 : | data4 14 | ||
106 : | data4 16 | ||
107 : | data4 18 | ||
108 : | data4 21 | ||
109 : | data4 25 | ||
110 : | data4 30 | ||
111 : | data4 36 | ||
112 : | |||
113 : | |||
// mvtab: 33 consecutive 4-byte words (132 bytes, matching .size).
// No instruction visible in this file references it; presumably it is read by
// the included calc_delta_*.s fragments -- TODO confirm.
114 : | .type mvtab#,@object | ||
115 : | .size mvtab#,132 | ||
116 : | mvtab: | ||
117 : | data4 1 | ||
118 : | data4 2 | ||
119 : | data4 3 | ||
120 : | data4 4 | ||
121 : | data4 6 | ||
122 : | data4 7 | ||
123 : | data4 7 | ||
124 : | data4 7 | ||
125 : | data4 9 | ||
126 : | data4 9 | ||
127 : | data4 9 | ||
128 : | data4 10 | ||
129 : | data4 10 | ||
130 : | data4 10 | ||
131 : | data4 10 | ||
132 : | data4 10 | ||
133 : | data4 10 | ||
134 : | data4 10 | ||
135 : | data4 10 | ||
136 : | data4 10 | ||
137 : | data4 10 | ||
138 : | data4 10 | ||
139 : | data4 10 | ||
140 : | data4 10 | ||
141 : | data4 10 | ||
142 : | data4 11 | ||
143 : | data4 11 | ||
144 : | data4 11 | ||
145 : | data4 11 | ||
146 : | data4 11 | ||
147 : | data4 11 | ||
148 : | data4 12 | ||
149 : | data4 12 | ||
// Halfpel8_Refine_ia64
//
// Evaluates the eight half-pel candidates around the motion vector stored at
// [currMV] for the 8x8 byte block at cura (eight rows of ld8), using psad1 for
// the row SADs. A candidate is only compared when it lies inside
// [min_dx, max_dx] x [min_dy, max_dy]. The winning (x, y) is stored back to
// [currMV] / [currMV + 4] and the smallest cost seen (iMinSAD) is returned in ret0.
//
// This part of the procedure only sets up the register frame and symbolic
// names; it emits two instructions (alloc and mov prsave = pr).
150 : | .text | ||
151 : | .align 16 | ||
152 : | .global Halfpel8_Refine_ia64# | ||
153 : | .proc Halfpel8_Refine_ia64# | ||
154 : | |||
155 : | Halfpel8_Refine_ia64: | ||
156 : | |||
157 : | pfs = r14 | ||
158 : | prsave = r15 | ||
159 : | |||
160 : | // Save important registers | ||
161 : | |||
// alloc operands: 18 input + 74 local + 4 output registers, 96 rotating.
// ar.pfs and pr are kept in r14/r15 and written back just before br.ret.
162 : | ia64p | 372 | alloc pfs = ar.pfs, 18, 74, 4, 96 |
163 : | ia64p | 300 | mov prsave = pr |
164 : | |||
165 : | // Naming registers for better readability | ||
166 : | |||
// in0..in7 are used as they arrive. in8..in17 (iMinSAD .. iEdgedWidth) are
// filled explicitly by ld4 instructions from the memory stack further down, so
// they serve as named locals for the stack-passed arguments.
167 : | pRef = in0 | ||
168 : | pRefH = in1 | ||
169 : | pRefV = in2 | ||
170 : | pRefHV = in3 | ||
171 : | cura = in4 | ||
172 : | x = in5 | ||
173 : | y = in6 | ||
174 : | currMV = in7 | ||
175 : | iMinSAD = in8 | ||
176 : | ia64p | 372 | dx = in9 |
177 : | dy = in10 | ||
178 : | min_dx = in11 | ||
179 : | max_dx = in12 | ||
180 : | min_dy = in13 | ||
181 : | max_dy = in14 | ||
182 : | iFcode = in15 | ||
183 : | iQuant = in16 | ||
184 : | iEdgedWidth = in17 | ||
185 : | ia64p | 300 | |
// Scratch general registers. oldX and oldY are not referenced by any
// instruction visible in this file; presumably used by the included
// calc_delta_*.s fragments -- TODO confirm.
186 : | iSAD = r17 | ||
187 : | backupX = r18 | ||
188 : | backupY = r19 | ||
189 : | currX = r20 | ||
190 : | currY = r21 | ||
191 : | currYAddress = r22 | ||
192 : | bitX0 = r23 | ||
193 : | bitY0 = r24 | ||
194 : | dxd2 = r25 | ||
195 : | dyd2 = r26 | ||
196 : | offset = r27 | ||
197 : | block = r28 | ||
198 : | nob02 = r29 | ||
199 : | nob1 = r30 | ||
200 : | nob64m02 = r31 | ||
// r123..r127 are named by hand; the .rotr names below total 90 registers.
201 : | nob64m1 = r127 | ||
202 : | const7 = r126 | ||
203 : | nob56m02 = r125 | ||
204 : | ia64p | 372 | oldX = r124 |
205 : | oldY = r123 | ||
206 : | ia64p | 300 | |
// .rotr hands out array-style names over the stacked registers. No rotating
// loop branch appears in the code visible here, so the names behave as fixed
// registers. component[] and sc[] are not referenced in the visible code;
// presumably used by the included calc_delta_*.s fragments -- TODO confirm.
207 : | ia64p | 372 | .rotr inregisters[18], refaa[3], refab[3], cur[8], ref0a[9], ref0b[9], ref1a[9], mpr[9], ref2a[8], ref2b[8], component[2], sc[2], tabaddress[2] |
208 : | ia64p | 300 | |
// Floating-point registers: f8..f15 carry the integer multiply-add for the
// address offset; fQuant receives the doubled lambda_vec8 entry, and fmv is
// read once after the last calc_delta include.
209 : | fx = f8 | ||
210 : | fy = f9 | ||
211 : | fblock = f10 | ||
212 : | fiEdgedWidth = f11 | ||
213 : | fdxd2 = f12 | ||
214 : | fdyd2 = f13 | ||
215 : | foffset = f14 | ||
216 : | fydiEdgedWidth = f15 | ||
217 : | ia64p | 303 | fQuant = f32 |
218 : | fmv = f33 | ||
219 : | ia64p | 300 | |
// Predicates:
//   n, h, v, hv      - parity case of (backupX, backupY); selects reference planes
//   l, r, t, b       - centre is strictly inside the min/max limit on that side
//   lt, lb, rt, rb   - conjunction of the two matching side predicates
//   fb               - "found better": current candidate beats iMinSAD
// non0_*, neg_*, cg32_* are not referenced in the visible code; presumably
// used by the included calc_delta_*.s fragments -- TODO confirm.
220 : | n = p16 | ||
221 : | h = p17 | ||
222 : | v = p18 | ||
223 : | hv = p19 | ||
224 : | l = p20 | ||
225 : | r = p21 | ||
226 : | t = p22 | ||
227 : | b = p23 | ||
228 : | lt = p24 | ||
229 : | lb = p25 | ||
230 : | rt = p26 | ||
231 : | rb = p27 | ||
232 : | fb = p28 | ||
233 : | non0_0 = p30 | ||
234 : | non0_1 = p31 | ||
235 : | non0_2 = p32 | ||
236 : | non0_3 = p33 | ||
237 : | neg_0 = p34 | ||
238 : | neg_1 = p35 | ||
239 : | neg_2 = p36 | ||
240 : | neg_3 = p37 | ||
241 : | cg32_0 = p29 | ||
242 : | cg32_1 = p38 | ||
243 : | |||
244 : | // Initialize input variables | ||
245 : | |||
// Reads ten 4-byte values from the memory stack at entry-sp + 16, + 24, ...,
// + 88 (8-byte stride), in the order iMinSAD, dx, dy, min_dx, max_dx, min_dy,
// max_dy, iFcode, iQuant, iEdgedWidth. sp itself is used as the walking
// pointer and is put back at the end (16 + 9 * 8 = 88).
// NOTE(review): sp is away from its entry value, and not a multiple of 16
// relative to it, for the duration of these loads. Confirm this is acceptable
// for the target ABI (asynchronous signal delivery, unwinding) or walk the
// arguments with a scratch pointer register instead.
246 : | add sp = 16, sp | ||
247 : | ;; | ||
248 : | ld4 iMinSAD = [sp], 8 | ||
249 : | ;; | ||
250 : | sxt4 iMinSAD = iMinSAD | ||
251 : | ia64p | 372 | |
252 : | |||
253 : | ld4 dx = [sp], 8 | ||
254 : | ia64p | 300 | ;; |
255 : | ia64p | 372 | sxt4 dx = dx |
256 : | |||
257 : | ld4 dy = [sp], 8 | ||
258 : | ;; | ||
259 : | sxt4 dy = dy | ||
260 : | |||
261 : | ia64p | 300 | ld4 min_dx = [sp], 8 |
262 : | ;; | ||
263 : | sxt4 min_dx = min_dx | ||
264 : | |||
265 : | ld4 max_dx = [sp], 8 | ||
266 : | ;; | ||
267 : | sxt4 max_dx = max_dx | ||
268 : | |||
269 : | ld4 min_dy = [sp], 8 | ||
270 : | ;; | ||
271 : | sxt4 min_dy = min_dy | ||
272 : | |||
273 : | ld4 max_dy = [sp], 8 | ||
274 : | ;; | ||
275 : | sxt4 max_dy = max_dy | ||
276 : | |||
277 : | ld4 iFcode = [sp], 8 | ||
278 : | ;; | ||
279 : | sxt4 iFcode = iFcode | ||
280 : | |||
// iQuant is used (zero-extended, as loaded by ld4) as an index into
// lambda_vec8: tabaddress[0] = gp-relative table address + iQuant * 4.
281 : | ld4 iQuant = [sp], 8 | ||
282 : | |||
283 : | add tabaddress[0] = @gprel(lambda_vec8#), gp | ||
284 : | ;; | ||
285 : | shladd tabaddress[0] = iQuant, 2, tabaddress[0] | ||
286 : | ;; | ||
// From here on iQuant holds the table entry, not the original argument.
287 : | ld4 iQuant = [tabaddress[0]] | ||
288 : | ;; | ||
289 : | sxt4 iQuant = iQuant | ||
290 : | ;; | ||
291 : | add iFcode = -1, iFcode //only used in decreased version | ||
// iQuant = 2 * lambda_vec8[index]; a copy is placed in fQuant.
292 : | shl iQuant = iQuant, 1 | ||
293 : | ;; | ||
294 : | setf.sig fQuant = iQuant | ||
295 : | |||
// iEdgedWidth is loaded without post-increment and without sign extension.
// The following add restores sp to its entry value.
296 : | ld4 iEdgedWidth = [sp] | ||
297 : | ia64p | 372 | add sp = -88, sp |
298 : | ia64p | 300 | |
299 : | |||
300 : | |||
301 : | |||
302 : | // Initialize local variables | ||
303 : | |||
304 : | |||
// currX / currY = the two sign-extended 32-bit words at [currMV] and
// [currMV + 4]; currYAddress is kept for the store at the end.
305 : | ld4 currX = [currMV] | ||
306 : | add currYAddress = 4, currMV | ||
307 : | ;; | ||
308 : | sxt4 currX = currX | ||
309 : | ld4 currY = [currYAddress] | ||
310 : | ;; | ||
311 : | sxt4 currY = currY | ||
312 : | ;; | ||
313 : | // Calculate references | ||
314 : | |||
// Side predicates use strict comparisons: l = currX > min_dx, r = currX < max_dx,
// t = currY > min_dy, b = currY < max_dy.
// backupX/backupY start at the left-top candidate (currX - 1, currY - 1).
315 : | cmp.gt l, p0 = currX, min_dx | ||
316 : | cmp.lt r, p0 = currX, max_dx | ||
317 : | cmp.gt t, p0 = currY, min_dy | ||
318 : | cmp.lt b, p0 = currY, max_dy | ||
319 : | add backupX = -1, currX //move to left upper corner of quadrate | ||
320 : | add backupY = -1, currY | ||
321 : | |||
322 : | ;; | ||
// Corner predicates via .unc compares under a side predicate:
// lb = b && l, rt = t && r, lt = l && t, rb = r && b.
323 : | (b) cmp.gt.unc lb, p0 = currX, min_dx | ||
324 : | (t) cmp.lt.unc rt, p0 = currX, max_dx | ||
325 : | (l) cmp.gt.unc lt, p0 = currY, min_dy | ||
326 : | (r) cmp.lt.unc rb, p0 = currY, max_dy | ||
327 : | |||
// Parity of the left-top candidate decides which plane plays which role.
328 : | and bitX0 = 1, backupX | ||
329 : | and bitY0 = 1, backupY | ||
330 : | ;; | ||
// First pass sets each predicate from the X bit, second pass ANDs in the Y bit:
// n = X even & Y even, h = X odd & Y even, v = X even & Y odd, hv = X odd & Y odd.
331 : | cmp.eq n, p0 = 0, bitX0 | ||
332 : | cmp.eq h, p0 = 1, bitX0 | ||
333 : | cmp.eq v, p0 = 0, bitX0 | ||
334 : | cmp.eq hv, p0 = 1, bitX0 | ||
335 : | ;; | ||
336 : | cmp.eq.and n, p0 = 0, bitY0 | ||
337 : | cmp.eq.and h, p0 = 0, bitY0 | ||
338 : | cmp.eq.and v, p0 = 1, bitY0 | ||
339 : | cmp.eq.and hv, p0 = 1, bitY0 | ||
340 : | ;; | ||
341 : | |||
// Exactly one of n/h/v/hv is set, so each refaa[k] gets exactly one value.
// refaa[0] feeds the ref0 rows (corner candidates), refaa[1] the ref1 rows
// (top/bottom), refaa[2] the ref2 rows (left/right), matching the 0/1/2
// diagram in the file header.
342 : | .pred.rel "mutex", p16, p17, p18, p19 //n, h, v, hv | ||
343 : | (n) mov refaa[0] = pRef | ||
344 : | (h) mov refaa[0] = pRefH | ||
345 : | (v) mov refaa[0] = pRefV | ||
346 : | (hv) mov refaa[0] = pRefHV | ||
347 : | |||
348 : | (n) mov refaa[1] = pRefH | ||
349 : | (h) mov refaa[1] = pRef | ||
350 : | (v) mov refaa[1] = pRefHV | ||
351 : | (hv) mov refaa[1] = pRefV | ||
352 : | |||
353 : | (n) mov refaa[2] = pRefV | ||
354 : | (h) mov refaa[2] = pRefHV | ||
355 : | (v) mov refaa[2] = pRef | ||
356 : | (hv) mov refaa[2] = pRefH | ||
357 : | |||
358 : | |||
359 : | // Calculate offset (integer multiplication on IA-64 sucks!) | ||
360 : | |||
// offset = (x * 8 + (backupX >> 1)) + (y * 8 + (backupY >> 1)) * iEdgedWidth
// The multiplies go through the FP unit: setf.sig moves the integers into FP
// registers, xma.l computes a * b + c on them, getf.sig moves the result back.
361 : | mov block = 8 | ||
362 : | |||
// shr is the signed (arithmetic) shift, so negative coordinates round down.
363 : | shr dxd2 = backupX, 1 | ||
364 : | shr dyd2 = backupY, 1 | ||
365 : | |||
366 : | setf.sig fx = x | ||
367 : | setf.sig fy = y | ||
368 : | ;; | ||
369 : | setf.sig fblock = block | ||
370 : | setf.sig fiEdgedWidth = iEdgedWidth | ||
371 : | ;; | ||
372 : | setf.sig fdxd2 = dxd2 | ||
373 : | setf.sig fdyd2 = dyd2 | ||
374 : | ;; | ||
// foffset = x * 8 + dxd2 ; fydiEdgedWidth = y * 8 + dyd2
375 : | xma.l foffset = fx, fblock, fdxd2 | ||
376 : | xma.l fydiEdgedWidth = fy, fblock, fdyd2 | ||
377 : | ;; | ||
// foffset = fydiEdgedWidth * iEdgedWidth + foffset
378 : | xma.l foffset = fydiEdgedWidth, fiEdgedWidth, foffset | ||
379 : | ;; | ||
380 : | getf.sig offset = foffset | ||
381 : | ;; | ||
382 : | add refaa[0] = refaa[0], offset | ||
383 : | add refaa[1] = refaa[1], offset | ||
384 : | add refaa[2] = refaa[2], offset | ||
385 : | ;; | ||
// Parity corrections: with odd backupX (h, hv) the top/bottom pointer moves one
// byte right; with odd backupY (v, hv) the left/right pointer moves one row down.
386 : | (h) add refaa[1] = 1, refaa[1] | ||
387 : | (hv) add refaa[1] = 1, refaa[1] | ||
388 : | (v) add refaa[2] = iEdgedWidth, refaa[2] | ||
389 : | (hv) add refaa[2] = iEdgedWidth, refaa[2] | ||
390 : | |||
391 : | // Load respecting misalignment of refx... | ||
392 : | |||
// Every reference row is fetched as two aligned 8-byte words (refaa = address
// rounded down to a multiple of 8, refab = refaa + 8); the requested bytes are
// assembled from the pair by the shift/or code that follows the loads.
393 : | mov const7 = 7 | ||
394 : | ;; | ||
// dep.z x = addr, 3, 3  ->  (addr & 7) << 3: the byte misalignment as a bit count.
// Only refaa[0] (nob02) and refaa[1] (nob1) get a bit count; the ref2 rows are
// later shifted with nob02 as well.
// NOTE(review): this assumes refaa[2] shares its low 3 address bits with
// refaa[0] -- TODO confirm.
395 : | dep.z nob02 = refaa[0], 3, 3 | ||
396 : | dep.z nob1 = refaa[1], 3, 3 | ||
397 : | ;; | ||
398 : | andcm refaa[0] = refaa[0], const7 // set last 3 bits = 0 | ||
399 : | andcm refaa[1] = refaa[1], const7 | ||
400 : | andcm refaa[2] = refaa[2], const7 | ||
401 : | ;; | ||
402 : | add refab[0] = 8, refaa[0] | ||
403 : | add refab[1] = 8, refaa[1] | ||
404 : | add refab[2] = 8, refaa[2] | ||
405 : | ;; | ||
// Row loads, post-incrementing each pointer by iEdgedWidth. cur rows are read
// with plain ld8 and no fix-up.
// NOTE(review): presumably cura is 8-byte aligned -- TODO confirm.
// The second 8-byte word of each ref1 row is loaded straight into mpr[].
406 : | ld8 cur[0] = [cura], iEdgedWidth | ||
407 : | ld8 ref0a[0] = [refaa[0]], iEdgedWidth | ||
408 : | sub nob64m02 = 64, nob02 // 64 - nob | ||
409 : | |||
410 : | ld8 ref0b[0] = [refab[0]], iEdgedWidth | ||
411 : | ld8 ref1a[0] = [refaa[1]], iEdgedWidth | ||
412 : | sub nob56m02 = 56, nob02 // 56 - nob | ||
413 : | |||
414 : | ld8 mpr[0] = [refab[1]], iEdgedWidth | ||
415 : | ld8 ref2a[0] = [refaa[2]], iEdgedWidth | ||
416 : | sub nob64m1 = 64, nob1 | ||
417 : | |||
418 : | ld8 ref2b[0] = [refab[2]], iEdgedWidth | ||
419 : | ;; | ||
420 : | ld8 cur[1] = [cura], iEdgedWidth | ||
421 : | ld8 ref0a[1] = [refaa[0]], iEdgedWidth | ||
422 : | ld8 ref0b[1] = [refab[0]], iEdgedWidth | ||
423 : | ld8 ref1a[1] = [refaa[1]], iEdgedWidth | ||
424 : | ld8 mpr[1] = [refab[1]], iEdgedWidth | ||
425 : | ld8 ref2a[1] = [refaa[2]], iEdgedWidth | ||
426 : | ld8 ref2b[1] = [refab[2]], iEdgedWidth | ||
427 : | ;; | ||
428 : | ld8 cur[2] = [cura], iEdgedWidth | ||
429 : | ld8 ref0a[2] = [refaa[0]], iEdgedWidth | ||
430 : | ld8 ref0b[2] = [refab[0]], iEdgedWidth | ||
431 : | ld8 ref1a[2] = [refaa[1]], iEdgedWidth | ||
432 : | ld8 mpr[2] = [refab[1]], iEdgedWidth | ||
433 : | ld8 ref2a[2] = [refaa[2]], iEdgedWidth | ||
434 : | ld8 ref2b[2] = [refab[2]], iEdgedWidth | ||
435 : | ;; | ||
436 : | ld8 cur[3] = [cura], iEdgedWidth | ||
437 : | ld8 ref0a[3] = [refaa[0]], iEdgedWidth | ||
438 : | ld8 ref0b[3] = [refab[0]], iEdgedWidth | ||
439 : | ld8 ref1a[3] = [refaa[1]], iEdgedWidth | ||
440 : | ld8 mpr[3] = [refab[1]], iEdgedWidth | ||
441 : | ld8 ref2a[3] = [refaa[2]], iEdgedWidth | ||
442 : | ld8 ref2b[3] = [refab[2]], iEdgedWidth | ||
443 : | ;; | ||
444 : | ld8 cur[4] = [cura], iEdgedWidth | ||
445 : | ld8 ref0a[4] = [refaa[0]], iEdgedWidth | ||
446 : | ld8 ref0b[4] = [refab[0]], iEdgedWidth | ||
447 : | ld8 ref1a[4] = [refaa[1]], iEdgedWidth | ||
448 : | ld8 mpr[4] = [refab[1]], iEdgedWidth | ||
449 : | ld8 ref2a[4] = [refaa[2]], iEdgedWidth | ||
450 : | ld8 ref2b[4] = [refab[2]], iEdgedWidth | ||
451 : | ;; | ||
452 : | ld8 cur[5] = [cura], iEdgedWidth | ||
453 : | ld8 ref0a[5] = [refaa[0]], iEdgedWidth | ||
454 : | ld8 ref0b[5] = [refab[0]], iEdgedWidth | ||
455 : | ld8 ref1a[5] = [refaa[1]], iEdgedWidth | ||
456 : | ld8 mpr[5] = [refab[1]], iEdgedWidth | ||
457 : | ld8 ref2a[5] = [refaa[2]], iEdgedWidth | ||
458 : | ld8 ref2b[5] = [refab[2]], iEdgedWidth | ||
459 : | ;; | ||
460 : | ld8 cur[6] = [cura], iEdgedWidth | ||
461 : | ld8 ref0a[6] = [refaa[0]], iEdgedWidth | ||
462 : | ld8 ref0b[6] = [refab[0]], iEdgedWidth | ||
463 : | ld8 ref1a[6] = [refaa[1]], iEdgedWidth | ||
464 : | ld8 mpr[6] = [refab[1]], iEdgedWidth | ||
465 : | ld8 ref2a[6] = [refaa[2]], iEdgedWidth | ||
466 : | ld8 ref2b[6] = [refab[2]], iEdgedWidth | ||
467 : | ;; | ||
// Row 7 is the last row for cur and ref2, so those pointers are not advanced.
468 : | ld8 cur[7] = [cura] | ||
469 : | ld8 ref0a[7] = [refaa[0]], iEdgedWidth | ||
470 : | ld8 ref0b[7] = [refab[0]], iEdgedWidth | ||
471 : | ld8 ref1a[7] = [refaa[1]], iEdgedWidth | ||
472 : | ld8 mpr[7] = [refab[1]], iEdgedWidth | ||
473 : | ld8 ref2a[7] = [refaa[2]] | ||
474 : | ld8 ref2b[7] = [refab[2]] | ||
475 : | ;; | ||
// ref0 and ref1 need a ninth row (index 8): the bottom candidates use rows 1..8.
476 : | ld8 ref0a[8] = [refaa[0]] | ||
477 : | ld8 ref0b[8] = [refab[0]] | ||
478 : | ld8 ref1a[8] = [refaa[1]] | ||
479 : | ld8 mpr[8] = [refab[1]] | ||
480 : | ;; | ||
481 : | |||
482 : | |||
483 : | // Align ref1 | ||
484 : | |||
// For each of the nine rows:
//   ref1a[i] = (ref1a[i] >> nob1) | (mpr[i] << (64 - nob1))
// i.e. the eight bytes starting at the original unaligned address.
// When nob1 is 0 the left shift count is 64; this relies on the IA-64 variable
// shl yielding zero for counts above 63, so the high word contributes nothing.
485 : | shr.u ref1a[0] = ref1a[0], nob1 | ||
486 : | shr.u ref1a[1] = ref1a[1], nob1 | ||
487 : | shr.u ref1a[2] = ref1a[2], nob1 | ||
488 : | shr.u ref1a[3] = ref1a[3], nob1 | ||
489 : | shr.u ref1a[4] = ref1a[4], nob1 | ||
490 : | shr.u ref1a[5] = ref1a[5], nob1 | ||
491 : | shr.u ref1a[6] = ref1a[6], nob1 | ||
492 : | shr.u ref1a[7] = ref1a[7], nob1 | ||
493 : | shr.u ref1a[8] = ref1a[8], nob1 | ||
494 : | |||
495 : | shl mpr[0] = mpr[0], nob64m1 | ||
496 : | shl mpr[1] = mpr[1], nob64m1 | ||
497 : | shl mpr[2] = mpr[2], nob64m1 | ||
498 : | shl mpr[3] = mpr[3], nob64m1 | ||
499 : | shl mpr[4] = mpr[4], nob64m1 | ||
500 : | shl mpr[5] = mpr[5], nob64m1 | ||
501 : | shl mpr[6] = mpr[6], nob64m1 | ||
502 : | shl mpr[7] = mpr[7], nob64m1 | ||
503 : | shl mpr[8] = mpr[8], nob64m1 | ||
504 : | ;; | ||
// Hand-bundled: the or that completes each ref1 row is paired with the first
// step of the ref0 alignment (ref0a[i] >>= nob02).
505 : | .explicit | ||
506 : | {.mii | ||
507 : | or ref1a[0] = ref1a[0], mpr[0] | ||
508 : | shr.u ref0a[0] = ref0a[0], nob02 | ||
509 : | shr.u ref0a[1] = ref0a[1], nob02 | ||
510 : | } | ||
511 : | {.mmi | ||
512 : | or ref1a[1] = ref1a[1], mpr[1] | ||
513 : | or ref1a[2] = ref1a[2], mpr[2] | ||
514 : | shr.u ref0a[2] = ref0a[2], nob02 | ||
515 : | } | ||
516 : | {.mii | ||
517 : | or ref1a[3] = ref1a[3], mpr[3] | ||
518 : | shr.u ref0a[3] = ref0a[3], nob02 | ||
519 : | shr.u ref0a[4] = ref0a[4], nob02 | ||
520 : | } | ||
521 : | {.mmi | ||
522 : | or ref1a[4] = ref1a[4], mpr[4] | ||
523 : | or ref1a[5] = ref1a[5], mpr[5] | ||
524 : | shr.u ref0a[5] = ref0a[5], nob02 | ||
525 : | } | ||
526 : | {.mii | ||
527 : | or ref1a[6] = ref1a[6], mpr[6] | ||
528 : | shr.u ref0a[6] = ref0a[6], nob02 | ||
529 : | shr.u ref0a[7] = ref0a[7], nob02 | ||
530 : | } | ||
531 : | {.mii | ||
532 : | or ref1a[7] = ref1a[7], mpr[7] | ||
533 : | or ref1a[8] = ref1a[8], mpr[8] | ||
534 : | shr.u ref0a[8] = ref0a[8], nob02 | ||
535 : | } | ||
536 : | .default | ||
537 : | // ref1a[] now contains center position values | ||
538 : | // mpr[] not used any more | ||
539 : | |||
540 : | // Align ref0 left | ||
541 : | |||
// ref0a[i] was already shifted right by nob02 in the preceding bundles.
// For each of the nine rows:
//   mpr[i]   = ref0b[i] << (56 - nob02)    (saved for the right-hand candidate)
//   ref0a[i] = ref0a[i] | (ref0b[i] << (64 - nob02))
542 : | ;; | ||
543 : | shl mpr[0] = ref0b[0], nob56m02 | ||
544 : | shl mpr[1] = ref0b[1], nob56m02 | ||
545 : | shl mpr[2] = ref0b[2], nob56m02 | ||
546 : | shl mpr[3] = ref0b[3], nob56m02 | ||
547 : | shl mpr[4] = ref0b[4], nob56m02 | ||
548 : | shl mpr[5] = ref0b[5], nob56m02 | ||
549 : | shl mpr[6] = ref0b[6], nob56m02 | ||
550 : | shl mpr[7] = ref0b[7], nob56m02 | ||
551 : | shl mpr[8] = ref0b[8], nob56m02 | ||
552 : | |||
553 : | shl ref0b[0] = ref0b[0], nob64m02 | ||
554 : | shl ref0b[1] = ref0b[1], nob64m02 | ||
555 : | shl ref0b[2] = ref0b[2], nob64m02 | ||
556 : | shl ref0b[3] = ref0b[3], nob64m02 | ||
557 : | shl ref0b[4] = ref0b[4], nob64m02 | ||
558 : | shl ref0b[5] = ref0b[5], nob64m02 | ||
559 : | shl ref0b[6] = ref0b[6], nob64m02 | ||
560 : | shl ref0b[7] = ref0b[7], nob64m02 | ||
561 : | shl ref0b[8] = ref0b[8], nob64m02 | ||
562 : | ;; | ||
563 : | or ref0a[0] = ref0a[0], ref0b[0] | ||
564 : | or ref0a[1] = ref0a[1], ref0b[1] | ||
565 : | or ref0a[2] = ref0a[2], ref0b[2] | ||
566 : | or ref0a[3] = ref0a[3], ref0b[3] | ||
567 : | or ref0a[4] = ref0a[4], ref0b[4] | ||
568 : | or ref0a[5] = ref0a[5], ref0b[5] | ||
569 : | or ref0a[6] = ref0a[6], ref0b[6] | ||
570 : | or ref0a[7] = ref0a[7], ref0b[7] | ||
571 : | or ref0a[8] = ref0a[8], ref0b[8] | ||
572 : | ;; | ||
573 : | |||
574 : | // ref0a[] now contains left position values | ||
575 : | // mpr[] contains intermediate result for right position values (former ref0a << 56 - nob02) | ||
// NOTE(review): per the shl instructions above, mpr[i] is computed from
// ref0b[i] (the second aligned word), not from ref0a[i] as the line above says.
576 : | |||
577 : | // Align ref0 right | ||
578 : | |||
579 : | // Shift one byte more to the right (seen als big-endian) | ||
// Right-hand rows: ref0b[i] = (ref0a[i] >> 8) | mpr[i], i.e. the same row
// starting one byte further on in memory.
580 : | shr.u ref0b[0] = ref0a[0], 8 | ||
581 : | shr.u ref0b[1] = ref0a[1], 8 | ||
582 : | shr.u ref0b[2] = ref0a[2], 8 | ||
583 : | shr.u ref0b[3] = ref0a[3], 8 | ||
584 : | shr.u ref0b[4] = ref0a[4], 8 | ||
585 : | shr.u ref0b[5] = ref0a[5], 8 | ||
586 : | shr.u ref0b[6] = ref0a[6], 8 | ||
587 : | shr.u ref0b[7] = ref0a[7], 8 | ||
588 : | shr.u ref0b[8] = ref0a[8], 8 | ||
589 : | ;; | ||
// Hand-bundled: the or that completes each ref0b row is paired with the first
// step of the ref2 alignment (ref2a[i] >>= nob02, eight rows only).
590 : | .explicit | ||
591 : | {.mii | ||
592 : | or ref0b[0] = ref0b[0], mpr[0] | ||
593 : | shr.u ref2a[0] = ref2a[0], nob02 | ||
594 : | shr.u ref2a[1] = ref2a[1], nob02 | ||
595 : | } | ||
596 : | {.mmi | ||
597 : | or ref0b[1] = ref0b[1], mpr[1] | ||
598 : | or ref0b[2] = ref0b[2], mpr[2] | ||
599 : | shr.u ref2a[2] = ref2a[2], nob02 | ||
600 : | } | ||
601 : | {.mii | ||
602 : | or ref0b[3] = ref0b[3], mpr[3] | ||
603 : | shr.u ref2a[3] = ref2a[3], nob02 | ||
604 : | shr.u ref2a[4] = ref2a[4], nob02 | ||
605 : | } | ||
606 : | {.mmi | ||
607 : | or ref0b[4] = ref0b[4], mpr[4] | ||
608 : | or ref0b[5] = ref0b[5], mpr[5] | ||
609 : | shr.u ref2a[5] = ref2a[5], nob02 | ||
610 : | } | ||
611 : | {.mii | ||
612 : | or ref0b[6] = ref0b[6], mpr[6] | ||
613 : | shr.u ref2a[6] = ref2a[6], nob02 | ||
614 : | shr.u ref2a[7] = ref2a[7], nob02 | ||
615 : | } | ||
616 : | .default | ||
617 : | or ref0b[7] = ref0b[7], mpr[7] | ||
618 : | or ref0b[8] = ref0b[8], mpr[8] | ||
619 : | |||
620 : | // ref0b[] now contains right position values | ||
621 : | // mpr[] not needed any more | ||
622 : | |||
623 : | |||
624 : | // Align ref2 left | ||
625 : | |||
// Same scheme as for ref0, but only eight rows (0..7) and reusing nob02.
// ref2a[i] was already shifted right by nob02 in the preceding bundles.
//   mpr[i]   = ref2b[i] << (56 - nob02)    (saved for the right-hand candidate)
//   ref2a[i] = ref2a[i] | (ref2b[i] << (64 - nob02))
626 : | ;; | ||
627 : | shl mpr[0] = ref2b[0], nob56m02 | ||
628 : | shl mpr[1] = ref2b[1], nob56m02 | ||
629 : | shl mpr[2] = ref2b[2], nob56m02 | ||
630 : | shl mpr[3] = ref2b[3], nob56m02 | ||
631 : | shl mpr[4] = ref2b[4], nob56m02 | ||
632 : | shl mpr[5] = ref2b[5], nob56m02 | ||
633 : | shl mpr[6] = ref2b[6], nob56m02 | ||
634 : | shl mpr[7] = ref2b[7], nob56m02 | ||
635 : | |||
636 : | shl ref2b[0] = ref2b[0], nob64m02 | ||
637 : | shl ref2b[1] = ref2b[1], nob64m02 | ||
638 : | shl ref2b[2] = ref2b[2], nob64m02 | ||
639 : | shl ref2b[3] = ref2b[3], nob64m02 | ||
640 : | shl ref2b[4] = ref2b[4], nob64m02 | ||
641 : | shl ref2b[5] = ref2b[5], nob64m02 | ||
642 : | shl ref2b[6] = ref2b[6], nob64m02 | ||
643 : | shl ref2b[7] = ref2b[7], nob64m02 | ||
644 : | ;; | ||
645 : | or ref2a[0] = ref2a[0], ref2b[0] | ||
646 : | or ref2a[1] = ref2a[1], ref2b[1] | ||
647 : | or ref2a[2] = ref2a[2], ref2b[2] | ||
648 : | or ref2a[3] = ref2a[3], ref2b[3] | ||
649 : | or ref2a[4] = ref2a[4], ref2b[4] | ||
650 : | or ref2a[5] = ref2a[5], ref2b[5] | ||
651 : | or ref2a[6] = ref2a[6], ref2b[6] | ||
652 : | or ref2a[7] = ref2a[7], ref2b[7] | ||
653 : | ;; | ||
654 : | |||
655 : | // ref2a[] now contains left position values | ||
656 : | // mpr[] contains intermediate result for right position values (former ref2a << 56 - nob02) | ||
// NOTE(review): per the shl instructions above, mpr[i] is computed from
// ref2b[i] (the second aligned word), not from ref2a[i] as the line above says.
657 : | |||
658 : | // Align ref2 right | ||
659 : | |||
660 : | // Shift one byte more to the right (seen als big-endian) | ||
// Right-hand rows: ref2b[i] = (ref2a[i] >> 8) | mpr[i].
661 : | shr.u ref2b[0] = ref2a[0], 8 | ||
662 : | shr.u ref2b[1] = ref2a[1], 8 | ||
663 : | shr.u ref2b[2] = ref2a[2], 8 | ||
664 : | shr.u ref2b[3] = ref2a[3], 8 | ||
665 : | shr.u ref2b[4] = ref2a[4], 8 | ||
666 : | shr.u ref2b[5] = ref2a[5], 8 | ||
667 : | shr.u ref2b[6] = ref2a[6], 8 | ||
668 : | shr.u ref2b[7] = ref2a[7], 8 | ||
669 : | ;; | ||
670 : | or ref2b[0] = ref2b[0], mpr[0] | ||
671 : | or ref2b[1] = ref2b[1], mpr[1] | ||
672 : | or ref2b[2] = ref2b[2], mpr[2] | ||
673 : | or ref2b[3] = ref2b[3], mpr[3] | ||
674 : | or ref2b[4] = ref2b[4], mpr[4] | ||
675 : | or ref2b[5] = ref2b[5], mpr[5] | ||
676 : | or ref2b[6] = ref2b[6], mpr[6] | ||
677 : | or ref2b[7] = ref2b[7], mpr[7] | ||
678 : | |||
679 : | |||
680 : | // ref2b[] now contains right position values | ||
681 : | // mpr[] not needed any more | ||
682 : | |||
683 : | |||
684 : | |||
685 : | // Let's SAD | ||
// SAD phase. For each of the eight candidates, eight psad1 instructions (one
// per row) compare cur[0..7] with the aligned reference rows and leave the
// per-row sums of absolute byte differences in mpr[0..7].
//   left-top / right-top      : ref0a / ref0b rows 0..7
//   top                       : ref1a rows 0..7
//   left / right              : ref2a / ref2b rows 0..7
//   left-bottom / right-bottom: ref0a / ref0b rows 1..8
//   bottom                    : ref1a rows 1..8
// NOTE(review): the reduction of mpr[0..7] and the motion-vector cost are
// presumably done in the included calc_delta_{1,2,3}.s fragments, which are
// not visible here; mpr[8], iSAD and fmv are presumably produced there, and
// calc_delta_3.s presumably applies the fb update itself -- TODO confirm.
// Each predicated cmp.lt.unc is guarded by the bounds predicate of the
// candidate whose psad1 group was issued one section earlier (lt, t, rt, l,
// r, lb, b; rb follows after the last include), so the compare for one
// candidate is overlapped with the psad1 group of the next one.
// dx/dy and backupX/backupY are stepped by the add instructions mixed into the
// psad1 groups.
686 : | |||
687 : | // Left top corner | ||
688 : | |||
689 : | |||
// dx and dy become (candidate - dx argument) and (candidate - dy argument)
// for the left-top candidate.
690 : | sub dx = backupX, dx | ||
691 : | psad1 mpr[0] = cur[0], ref0a[0] | ||
692 : | psad1 mpr[1] = cur[1], ref0a[1] | ||
693 : | |||
694 : | sub dy = backupY, dy | ||
695 : | psad1 mpr[2] = cur[2], ref0a[2] | ||
696 : | psad1 mpr[3] = cur[3], ref0a[3] | ||
697 : | psad1 mpr[4] = cur[4], ref0a[4] | ||
698 : | psad1 mpr[5] = cur[5], ref0a[5] | ||
699 : | psad1 mpr[6] = cur[6], ref0a[6] | ||
700 : | psad1 mpr[7] = cur[7], ref0a[7] | ||
701 : | ;; | ||
702 : | .include "../../src/motion/ia64_asm/calc_delta_1.s" | ||
703 : | |||
704 : | // Top edge | ||
705 : | |||
706 : | psad1 mpr[0] = cur[0], ref1a[0] | ||
707 : | psad1 mpr[1] = cur[1], ref1a[1] | ||
708 : | psad1 mpr[2] = cur[2], ref1a[2] | ||
709 : | psad1 mpr[3] = cur[3], ref1a[3] | ||
710 : | psad1 mpr[4] = cur[4], ref1a[4] | ||
711 : | |||
712 : | add dx = 1, dx | ||
713 : | psad1 mpr[5] = cur[5], ref1a[5] | ||
714 : | psad1 mpr[6] = cur[6], ref1a[6] | ||
715 : | |||
716 : | psad1 mpr[7] = cur[7], ref1a[7] | ||
717 : | ;; | ||
718 : | |||
719 : | .include "../../src/motion/ia64_asm/calc_delta_2.s" | ||
// Left-top candidate is compared only when lt is set (.unc clears fb otherwise).
720 : | (lt) cmp.lt.unc fb, p0 = mpr[8], iMinSAD | ||
721 : | .include "../../src/motion/ia64_asm/calc_delta_3.s" | ||
722 : | |||
723 : | // Right top corner | ||
724 : | |||
725 : | |||
726 : | psad1 mpr[0] = cur[0], ref0b[0] | ||
727 : | psad1 mpr[1] = cur[1], ref0b[1] | ||
728 : | psad1 mpr[2] = cur[2], ref0b[2] | ||
729 : | psad1 mpr[3] = cur[3], ref0b[3] | ||
730 : | psad1 mpr[4] = cur[4], ref0b[4] | ||
731 : | |||
// backupX now names the top candidate (original currX).
732 : | add backupX = 1, backupX | ||
733 : | psad1 mpr[5] = cur[5], ref0b[5] | ||
734 : | psad1 mpr[6] = cur[6], ref0b[6] | ||
735 : | |||
736 : | add dx = 1, dx | ||
737 : | psad1 mpr[7] = cur[7], ref0b[7] | ||
738 : | ;; | ||
739 : | |||
740 : | .include "../../src/motion/ia64_asm/calc_delta_1.s" | ||
741 : | (t) cmp.lt.unc fb, p0 = iSAD, iMinSAD | ||
742 : | ;; | ||
743 : | |||
744 : | // Left edge | ||
745 : | |||
// The (fb) moves commit the top candidate. They read backupX before the add
// further down in this group advances it.
746 : | (fb) mov iMinSAD = iSAD | ||
747 : | psad1 mpr[0] = cur[0], ref2a[0] | ||
748 : | |||
749 : | (fb) mov currX = backupX | ||
750 : | psad1 mpr[1] = cur[1], ref2a[1] | ||
751 : | psad1 mpr[2] = cur[2], ref2a[2] | ||
752 : | |||
753 : | (fb) mov currY = backupY | ||
754 : | psad1 mpr[3] = cur[3], ref2a[3] | ||
755 : | psad1 mpr[4] = cur[4], ref2a[4] | ||
756 : | |||
757 : | add backupX = 1, backupX | ||
758 : | psad1 mpr[5] = cur[5], ref2a[5] | ||
759 : | psad1 mpr[6] = cur[6], ref2a[6] | ||
760 : | |||
761 : | psad1 mpr[7] = cur[7], ref2a[7] | ||
762 : | |||
// dx/dy move from the right-top candidate to the left candidate.
763 : | add dx = -2, dx | ||
764 : | add dy = 1, dy | ||
765 : | ;; | ||
766 : | |||
767 : | .include "../../src/motion/ia64_asm/calc_delta_2.s" | ||
768 : | (rt) cmp.lt.unc fb, p0 = mpr[8], iMinSAD | ||
769 : | .include "../../src/motion/ia64_asm/calc_delta_3.s" | ||
770 : | |||
771 : | // Right edge | ||
772 : | |||
773 : | |||
774 : | psad1 mpr[0] = cur[0], ref2b[0] | ||
775 : | psad1 mpr[1] = cur[1], ref2b[1] | ||
776 : | psad1 mpr[2] = cur[2], ref2b[2] | ||
777 : | psad1 mpr[3] = cur[3], ref2b[3] | ||
778 : | psad1 mpr[4] = cur[4], ref2b[4] | ||
779 : | |||
// backupX/backupY now name the left candidate (original currX - 1, currY).
780 : | add backupX = -2, backupX | ||
781 : | psad1 mpr[5] = cur[5], ref2b[5] | ||
782 : | psad1 mpr[6] = cur[6], ref2b[6] | ||
783 : | |||
784 : | add backupY = 1, backupY | ||
785 : | add dx = 2, dx | ||
786 : | psad1 mpr[7] = cur[7], ref2b[7] | ||
787 : | ;; | ||
788 : | |||
789 : | .include "../../src/motion/ia64_asm/calc_delta_1.s" | ||
790 : | (l) cmp.lt.unc fb, p0 = iSAD, iMinSAD | ||
791 : | ;; | ||
792 : | |||
793 : | // Left bottom corner | ||
794 : | |||
// The (fb) moves commit the left candidate; the psad1 group uses ref0a rows 1..8.
795 : | (fb) mov iMinSAD = iSAD | ||
796 : | psad1 mpr[0] = cur[0], ref0a[1] | ||
797 : | |||
798 : | (fb) mov currX = backupX | ||
799 : | psad1 mpr[1] = cur[1], ref0a[2] | ||
800 : | psad1 mpr[2] = cur[2], ref0a[3] | ||
801 : | |||
802 : | (fb) mov currY = backupY | ||
803 : | psad1 mpr[3] = cur[3], ref0a[4] | ||
804 : | psad1 mpr[4] = cur[4], ref0a[5] | ||
805 : | |||
806 : | add backupX = 2, backupX | ||
807 : | psad1 mpr[5] = cur[5], ref0a[6] | ||
808 : | psad1 mpr[6] = cur[6], ref0a[7] | ||
809 : | |||
810 : | psad1 mpr[7] = cur[7], ref0a[8] | ||
811 : | |||
812 : | add dx = -2, dx | ||
813 : | add dy = 1, dy | ||
814 : | ;; | ||
815 : | |||
816 : | .include "../../src/motion/ia64_asm/calc_delta_2.s" | ||
817 : | (r) cmp.lt.unc fb, p0 = mpr[8], iMinSAD | ||
818 : | .include "../../src/motion/ia64_asm/calc_delta_3.s" | ||
819 : | |||
820 : | // Bottom edge | ||
821 : | |||
822 : | psad1 mpr[0] = cur[0], ref1a[1] | ||
823 : | psad1 mpr[1] = cur[1], ref1a[2] | ||
824 : | psad1 mpr[2] = cur[2], ref1a[3] | ||
825 : | psad1 mpr[3] = cur[3], ref1a[4] | ||
826 : | psad1 mpr[4] = cur[4], ref1a[5] | ||
827 : | |||
// backupX/backupY now name the left-bottom candidate (original currX - 1, currY + 1).
828 : | add backupX = -2, backupX | ||
829 : | psad1 mpr[5] = cur[5], ref1a[6] | ||
830 : | psad1 mpr[6] = cur[6], ref1a[7] | ||
831 : | |||
832 : | add backupY = 1, backupY | ||
833 : | add dx = 1, dx | ||
834 : | psad1 mpr[7] = cur[7], ref1a[8] | ||
835 : | ;; | ||
836 : | |||
837 : | .include "../../src/motion/ia64_asm/calc_delta_1.s" | ||
838 : | (lb) cmp.lt.unc fb, p0 = iSAD, iMinSAD | ||
839 : | ;; | ||
840 : | // Right bottom corner | ||
841 : | |||
842 : | |||
// The (fb) moves commit the left-bottom candidate; the psad1 group uses ref0b rows 1..8.
843 : | (fb) mov iMinSAD = iSAD | ||
844 : | psad1 mpr[0] = cur[0], ref0b[1] | ||
845 : | |||
846 : | (fb) mov currX = backupX | ||
847 : | psad1 mpr[1] = cur[1], ref0b[2] | ||
848 : | psad1 mpr[2] = cur[2], ref0b[3] | ||
849 : | |||
850 : | (fb) mov currY = backupY | ||
851 : | psad1 mpr[3] = cur[3], ref0b[4] | ||
852 : | psad1 mpr[4] = cur[4], ref0b[5] | ||
853 : | |||
854 : | add backupX = 1, backupX | ||
855 : | psad1 mpr[5] = cur[5], ref0b[6] | ||
856 : | psad1 mpr[6] = cur[6], ref0b[7] | ||
857 : | |||
858 : | add dx = 1, dx | ||
859 : | psad1 mpr[7] = cur[7], ref0b[8] | ||
860 : | ;; | ||
861 : | |||
862 : | .include "../../src/motion/ia64_asm/calc_delta_2.s" | ||
863 : | (b) cmp.lt.unc fb, p0 = mpr[8], iMinSAD | ||
864 : | .include "../../src/motion/ia64_asm/calc_delta_3.s" | ||
865 : | |||
// Last candidate (right-bottom), handled inline instead of through another
// calc_delta_1 include: when rb is set, the integer in fmv is added to iSAD and
// the sum is compared with iMinSAD. ret0 is used as a temporary here before it
// receives the return value below.
// NOTE(review): fmv and iSAD are presumably left by the preceding
// calc_delta_*.s fragments, which are not visible here -- TODO confirm.
866 : | (rb) getf.sig ret0 = fmv | ||
// backupX now names the right-bottom candidate.
867 : | add backupX = 1, backupX | ||
868 : | ;; | ||
869 : | (rb) add iSAD = iSAD, ret0 | ||
870 : | ;; | ||
// With rb clear, the .unc compare clears fb, so nothing is committed.
871 : | (rb) cmp.lt.unc fb, p0 = iSAD, iMinSAD | ||
872 : | ;; | ||
873 : | (fb) mov iMinSAD = iSAD | ||
874 : | (fb) mov currX = backupX | ||
875 : | (fb) mov currY = backupY | ||
876 : | ;; | ||
877 : | |||
878 : | // Write back result | ||
879 : | |||
// Best vector goes back to [currMV] and [currMV + 4] as 32-bit words; the
// smallest cost is the return value.
880 : | st4 [currMV] = currX | ||
881 : | st4 [currYAddress] = currY | ||
882 : | mov ret0 = iMinSAD | ||
883 : | |||
884 : | // Restore important registers | ||
885 : | |||
886 : | ;; | ||
// Mask -1 restores every predicate register from the copy taken at entry.
887 : | mov pr = prsave, -1 | ||
888 : | mov ar.pfs = pfs | ||
889 : | br.ret.sptk.many b0 | ||
890 : | |||
891 : | .endp Halfpel8_Refine_ia64# |
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |