Parent Directory | Revision Log
Revision 1855 - (view) (download)
1 : | Isibaar | 1855 | // **************************************************************************** |
2 : | // * | ||
3 : | // * XVID MPEG-4 VIDEO CODEC | ||
4 : | // * - IA64 halfpel interpolation - | ||
5 : | // * | ||
6 : | // * Copyright(C) 2002 Kai Kühn, Alexander Viehl | ||
7 : | // * | ||
8 : | // * This program is free software; you can redistribute it and/or modify it | ||
9 : | // * under the terms of the GNU General Public License as published by | ||
10 : | // * the Free Software Foundation; either version 2 of the License, or | ||
11 : | // * (at your option) any later version. | ||
12 : | // * | ||
13 : | // * This program is distributed in the hope that it will be useful, | ||
14 : | // * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 : | // * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 : | // * GNU General Public License for more details. | ||
17 : | // * | ||
18 : | // * You should have received a copy of the GNU General Public License | ||
19 : | // * along with this program; if not, write to the Free Software | ||
20 : | // * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
21 : | // * | ||
22 : | // * $Id: interpolate8x8_ia64_exact.s,v 1.2 2009-02-19 17:07:29 Isibaar Exp $ | ||
23 : | // * | ||
24 : | // ***************************************************************************/ | ||
25 : | // | ||
26 : | // **************************************************************************** | ||
27 : | // * | ||
28 : | // * interpolate8x8_ia64_exact.s, IA-64 halfpel interpolation | ||
29 : | // * | ||
30 : | // * This version was implemented during an IA-64 practical training at | ||
31 : | // * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/) | ||
32 : | // * | ||
33 : | // **************************************************************************** | ||
34 : | |||
35 : | ia64p | 320 | // *********************************** |
36 : | // interpolate8x8_ia64.s | ||
37 : | // optimized for IA-64 | ||
38 : | // Authors : Kai Kühn | ||
39 : | // Alexander Viehl | ||
40 : | // last update : 13.7.2002 | ||
41 : | // *********************************** | ||
42 : | .file "interpolate8x8_ia64.s" | ||
43 : | .pred.safe_across_calls p1-p5,p16-p63 | ||
44 : | .text | ||
// -----------------------------------------------------------------------------
// interpolate8x8_halfpel_h_ia64 -- horizontal half-pel filter for an 8x8 block
//
// For each of 8 rows, every output byte is
//	(src[x] + src[x+1] + 1 - rounding) >> 1		x = 0..7
//
// In:	r32 = dst	one row is written per iteration with a single st8
//	r33 = src	any byte offset; split into an aligned base and a shift
//	r34 = stride	added to the load and store pointers after every row
//	r35 = rounding
// Scratch: r9, r14-r18, r20-r22, r24, r26, r27, rotating GRs, rotating PRs.
// ar.lc and the predicate file are saved on entry and restored before return.
//
// The bit offset of src is computed once and the aligned load pointers are
// advanced by stride, so stride is expected to be a multiple of 8.
// NOTE(review): the st8 presumably needs dst to be 8-byte aligned, and the
// shift/merge step looks little-endian only -- confirm against the callers.
// -----------------------------------------------------------------------------
	.align 16
	.global interpolate8x8_halfpel_h_ia64#
	.proc interpolate8x8_halfpel_h_ia64#
interpolate8x8_halfpel_h_ia64:
	// Stage latencies in loop iterations; they size the rotating register
	// and predicate groups declared in front of the loop.
	LL=3				// load
	SL=1				// shift
	SL2=1				// not referenced in this procedure
	OL=1				// or
	OL2=1				// not referenced in this procedure
	AVL=1				// average
	AL=1				// add
	PSH=1				// parallel shift
	ML=1				// mix
	STL=3				// store

	// The rotating region starts at r32, so it covers the four input
	// registers; they are copied into static registers before the loop.
	alloc r9=ar.pfs,4, 60,0,64

	mov r20 = ar.lc			// keep the caller's loop counter
	mov r21 = pr			// keep the caller's predicates

	dep.z r22 = r33,3,3		// r22 = (src & 7) * 8, bit offset inside the qword

	and r14 = -8,r33		// r14 = src rounded down to an 8-byte boundary
	mov r15 = r32			// r15 = dst
	mov r16 = r34			// r16 = stride
	sub r17 = 1,r35			// r17 = 1 - rounding

	;;

	add r18 = 8,r14			// r18 = the following aligned qword
	mux2 r17 = r17, 0x00		// copy lane 0 (1 - rounding) into all four 16-bit lanes

	sub r24 = 64,r22		// left shift that completes src from the upper qword
	add r26 = 8,r22			// right shift that yields src+1
	sub r27 = 56,r22		// left shift that completes src+1

	mov ar.lc = 7			// 8 rows
	// Evaluates to 14.  The store is issued by pipeline stage 11, so the
	// final epilog passes run with every stage predicate already clear.
	mov ar.ec = LL + SL + OL + 2*PSH + 2*AL + AVL + ML + STL
	mov pr.rot = 1 << 16		// p16 = 1, all other rotating predicates = 0

	;;
	.rotr lo_qw[LL+1],hi_qw[LL+1],cur_lo[SL+1],nxt_hi[SL+1],nxt_lo[SL+1],cur_hi[SL+1],cur[OL+1+PSH],nxt[OL+1+PSH],cur_evsh[PSH+1],nxt_evsh[PSH+1],cur_ev[PSH+1],nxt_ev[PSH+1+AL],cur_od[PSH+1],nxt_od[PSH+1+AL],ev_rnd[AL+1],od_rnd[AL+1],ev_sum[AL+1],od_sum[AL+1],ev_avg[AVL+1],od_avg[AVL+1],packed[ML+1]
	.rotp p_ld[LL], p_sh[SL], p_or[OL], p_evsh[PSH], p_split[PSH],p_rnd[AL],p_sum[AL],p_avg[AVL],p_mix[ML],p_st[STL]

.Lhalfpel_h_row:
	// load the two aligned qwords that straddle this row of src
	(p_ld[0])	ld8 lo_qw[0] = [r14],r16
	(p_ld[0])	ld8 hi_qw[0] = [r18],r16

	// shift both qwords into place for src (cur_*) and for src+1 (nxt_*)
	(p_sh[0])	shr.u cur_lo[0] = lo_qw[LL],r22
	(p_sh[0])	shl nxt_hi[0] = hi_qw[LL],r27
	(p_sh[0])	shr.u nxt_lo[0] = lo_qw[LL],r26
	(p_sh[0])	shl cur_hi[0] = hi_qw[LL],r24

	// cur = 8 bytes starting at src, nxt = 8 bytes starting at src+1
	(p_or[0])	or cur[0] = cur_lo[SL],cur_hi[SL]
	(p_or[0])	or nxt[0] = nxt_lo[SL],nxt_hi[SL]

	// move the even bytes to the top of each 16-bit lane ...
	(p_evsh[0])	pshl2 cur_evsh[0] = cur[OL],8
	(p_evsh[0])	pshl2 nxt_evsh[0] = nxt[OL],8

	// ... and back down, leaving even (_ev) and odd (_od) bytes zero-extended
	(p_split[0])	pshr2.u cur_ev[0] = cur_evsh[PSH],8
	(p_split[0])	pshr2.u nxt_ev[0] = nxt_evsh[PSH],8
	(p_split[0])	pshr2.u cur_od[0] = cur[PSH+OL],8
	(p_split[0])	pshr2.u nxt_od[0] = nxt[PSH+OL],8

	// src[x] + (1 - rounding)
	(p_rnd[0])	padd2.sss ev_rnd[0] = cur_ev[PSH],r17
	(p_rnd[0])	padd2.sss od_rnd[0] = cur_od[PSH],r17

	// ... + src[x+1]
	(p_sum[0])	padd2.uus ev_sum[0] = nxt_ev[AL+PSH],ev_rnd[AL]
	(p_sum[0])	padd2.uus od_sum[0] = nxt_od[AL+PSH],od_rnd[AL]

	// halve every lane
	(p_avg[0])	pshr2.u ev_avg[0] = ev_sum[AL],1
	(p_avg[0])	pshr2.u od_avg[0] = od_sum[AL],1

	// interleave the even and odd results back into 8 consecutive bytes
	(p_mix[0])	mix1.r packed[0] = od_avg[AVL],ev_avg[AVL]

	// write the finished row and step dst to the next one
	(p_st[0])	st8 [r15] = packed[ML]
	(p_st[0])	add r15 = r15,r16

	br.ctop.sptk.few .Lhalfpel_h_row
	;;
	mov ar.lc = r20			// give the caller its loop counter back
	mov pr = r21,-1			// and all of its predicates
	br.ret.sptk.many b0
	.endp interpolate8x8_halfpel_h_ia64#
132 : | |||
// -----------------------------------------------------------------------------
// interpolate8x8_halfpel_v_ia64 -- vertical half-pel filter for an 8x8 block
//
// For each of 8 rows, every output byte is
//	(src[x] + src[x + stride] + 1 - rounding) >> 1		x = 0..7
//
// In:	r32 = dst	one row is written per iteration with a single st8
//	r33 = src	any byte offset; split into an aligned base and a shift
//	r34 = stride	added to the load and store pointers after every row
//	r35 = rounding
// Scratch: r9, r14-r22, r24, r26, rotating GRs, rotating PRs.
// ar.lc and the predicate file are saved on entry and restored before return.
//
// Both source rows are reloaded on every iteration (four ld8 per row), and
// the row below the block is read as well.  The bit offset of src is computed
// once and the aligned load pointers are advanced by stride, so stride is
// expected to be a multiple of 8.
// NOTE(review): the st8 presumably needs dst to be 8-byte aligned, and the
// shift/merge step looks little-endian only -- confirm against the callers.
// -----------------------------------------------------------------------------
	.align 16
	.global interpolate8x8_halfpel_v_ia64#
	.proc interpolate8x8_halfpel_v_ia64#
interpolate8x8_halfpel_v_ia64:
	// Stage latencies in loop iterations; they size the rotating register
	// and predicate groups declared in front of the loop.
	LL=3				// load
	SL=1				// shift
	SL2=1				// not referenced in this procedure
	OL=1				// or
	OL2=1				// not referenced in this procedure
	AVL=1				// average
	AL=1				// add
	PSH=1				// parallel shift
	ML=1				// mix
	STL=3				// store

	// The rotating region starts at r32, so it covers the four input
	// registers; they are copied into static registers before the loop.
	alloc r9=ar.pfs,4, 60,0,64

	mov r20 = ar.lc			// keep the caller's loop counter
	mov r21 = pr			// keep the caller's predicates

	dep.z r22 = r33,3,3		// r22 = (src & 7) * 8, bit offset inside the qword

	and r14 = -8,r33		// r14 = src rounded down to an 8-byte boundary
	mov r15 = r32			// r15 = dst
	mov r16 = r34			// r16 = stride
	sub r17 = 1,r35			// r17 = 1 - rounding
	;;

	add r18 = 8,r14			// r18 = upper qword of the top row
	add r19 = r14,r16		// r19 = lower qword of the row below
	mux2 r17 = r17, 0x00		// copy lane 0 (1 - rounding) into all four 16-bit lanes

	sub r24 = 64,r22		// left shift that completes a row from its upper qword
	;;
	add r26 = 8,r19			// r26 = upper qword of the row below

	mov ar.lc = 7			// 8 rows
	// Evaluates to 14.  The store is issued by pipeline stage 11, so the
	// final epilog passes run with every stage predicate already clear.
	mov ar.ec = LL + SL + OL + 2*PSH + 2*AL + AVL + ML + STL
	mov pr.rot = 1 << 16		// p16 = 1, all other rotating predicates = 0

	;;
	.rotr top_lo[LL+1],top_hi[LL+1],bot_lo[LL+1],bot_hi[LL+1],top_lsb[SL+1],top_msb[SL+1],bot_lsb[SL+1],bot_msb[SL+1],top[OL+1+PSH],bot[OL+1+PSH],top_evsh[PSH+1],bot_evsh[PSH+1],top_ev[PSH+1],bot_ev[PSH+1+AL],top_od[PSH+1],bot_od[PSH+1+AL],ev_rnd[AL+1],od_rnd[AL+1],ev_sum[AL+1],od_sum[AL+1],ev_avg[AVL+1],od_avg[AVL+1],packed[ML+1]
	.rotp p_ld[LL], p_sh[SL], p_or[OL], p_evsh[PSH], p_split[PSH],p_rnd[AL],p_sum[AL],p_avg[AVL],p_mix[ML],p_st[STL]

.Lhalfpel_v_row:
	// load the aligned qword pairs that straddle this row and the row below
	(p_ld[0])	ld8 top_lo[0] = [r14],r16
	(p_ld[0])	ld8 top_hi[0] = [r18],r16
	(p_ld[0])	ld8 bot_lo[0] = [r19],r16
	(p_ld[0])	ld8 bot_hi[0] = [r26],r16

	// shift both halves of each row into place
	(p_sh[0])	shr.u top_lsb[0] = top_lo[LL],r22
	(p_sh[0])	shl top_msb[0] = top_hi[LL],r24
	(p_sh[0])	shr.u bot_lsb[0] = bot_lo[LL],r22
	(p_sh[0])	shl bot_msb[0] = bot_hi[LL],r24

	// top = 8 bytes starting at src, bot = 8 bytes starting at src + stride
	(p_or[0])	or top[0] = top_lsb[SL],top_msb[SL]
	(p_or[0])	or bot[0] = bot_lsb[SL],bot_msb[SL]

	// move the even bytes to the top of each 16-bit lane ...
	(p_evsh[0])	pshl2 top_evsh[0] = top[OL],8
	(p_evsh[0])	pshl2 bot_evsh[0] = bot[OL],8

	// ... and back down, leaving even (_ev) and odd (_od) bytes zero-extended
	(p_split[0])	pshr2.u top_ev[0] = top_evsh[PSH],8
	(p_split[0])	pshr2.u bot_ev[0] = bot_evsh[PSH],8
	(p_split[0])	pshr2.u top_od[0] = top[PSH+OL],8
	(p_split[0])	pshr2.u bot_od[0] = bot[PSH+OL],8

	// top[x] + (1 - rounding)
	(p_rnd[0])	padd2.sss ev_rnd[0] = top_ev[PSH],r17
	(p_rnd[0])	padd2.sss od_rnd[0] = top_od[PSH],r17

	// ... + bot[x]
	(p_sum[0])	padd2.uus ev_sum[0] = bot_ev[AL+PSH],ev_rnd[AL]
	(p_sum[0])	padd2.uus od_sum[0] = bot_od[AL+PSH],od_rnd[AL]

	// halve every lane
	(p_avg[0])	pshr2.u ev_avg[0] = ev_sum[AL],1
	(p_avg[0])	pshr2.u od_avg[0] = od_sum[AL],1

	// interleave the even and odd results back into 8 consecutive bytes
	(p_mix[0])	mix1.r packed[0] = od_avg[AVL],ev_avg[AVL]

	// write the finished row and step dst to the next one
	(p_st[0])	st8 [r15] = packed[ML]
	(p_st[0])	add r15 = r15,r16

	br.ctop.sptk.few .Lhalfpel_v_row
	;;
	mov ar.lc = r20			// give the caller its loop counter back
	mov pr = r21,-1			// and all of its predicates
	br.ret.sptk.many b0
	.endp interpolate8x8_halfpel_v_ia64#
224 : | |||
// -----------------------------------------------------------------------------
// interpolate8x8_halfpel_hv_ia64 -- diagonal half-pel filter for an 8x8 block
//
// For each of 8 rows, every output byte is
//	(src[x] + src[x+1] + src[x+stride] + src[x+stride+1] + 2 - rounding) >> 2
// with x = 0..7.
//
// In:	r32 = dst	one row is written per iteration with a single st8
//	r33 = src	any byte offset; split into an aligned base and a shift
//	r34 = stride	added to the load and store pointers after every row
//	r35 = rounding
// Scratch: r9, r14-r22, r24, r26-r28, rotating GRs, rotating PRs.
// ar.lc and the predicate file are saved on entry and restored before return.
//
// Both source rows are reloaded on every iteration (four ld8 per row), and
// the row below the block is read as well.  The bit offset of src is computed
// once and the aligned load pointers are advanced by stride, so stride is
// expected to be a multiple of 8.
// NOTE(review): the st8 presumably needs dst to be 8-byte aligned, and the
// shift/merge step looks little-endian only -- confirm against the callers.
// -----------------------------------------------------------------------------
	.align 16
	.global interpolate8x8_halfpel_hv_ia64#
	.proc interpolate8x8_halfpel_hv_ia64#
interpolate8x8_halfpel_hv_ia64:
	// Stage latencies in loop iterations; they size the rotating register
	// and predicate groups declared in front of the loop.
	LL=3				// load
	SL=1				// shift
	SL2=1				// not referenced in this procedure
	OL=1				// or
	OL2=1				// not referenced in this procedure
	AVL=1				// average
	AL=1				// add
	PSH=1				// parallel shift
	ML=1				// mix
	STL=3				// store

	// The rotating region starts at r32, so it covers the four input
	// registers; they are copied into static registers before the loop.
	alloc r9=ar.pfs,4, 92,0,96

	mov r20 = ar.lc			// keep the caller's loop counter
	mov r21 = pr			// keep the caller's predicates

	dep.z r22 = r33,3,3		// r22 = (src & 7) * 8, bit offset inside the qword

	and r14 = -8,r33		// r14 = src rounded down to an 8-byte boundary
	mov r15 = r32			// r15 = dst
	mov r16 = r34			// r16 = stride
	sub r17 = 2,r35			// r17 = 2 - rounding
	;;

	add r18 = 8,r14			// r18 = upper qword of the top row
	add r19 = r14,r16		// r19 = lower qword of the row below
	mux2 r17 = r17, 0x00		// copy lane 0 (2 - rounding) into all four 16-bit lanes

	add r27 = 8,r22			// right shift that yields src+1
	sub r28 = 56,r22		// left shift that completes src+1
	sub r24 = 64,r22		// left shift that completes src
	;;
	add r26 = 8,r19			// r26 = upper qword of the row below

	mov ar.lc = 7			// 8 rows
	// Evaluates to 14.  NOTE(review): the formula counts two add stages
	// although this loop has three (p_pair, p_quad, p_rnd); the store is
	// issued by stage 12, so 13 is the minimum and the STL term supplies
	// the slack.  Keep that in mind before shrinking STL.
	mov ar.ec = LL + SL + OL + 2*PSH + 2*AL + AVL + ML + STL

	mov pr.rot = 1 << 16		// p16 = 1, all other rotating predicates = 0

	;;
	.rotr top_lo[LL+1],top_hi[LL+1],bot_lo[LL+1],bot_hi[LL+1],tl_lsb[SL+1],tl_msb[SL+1],bl_lsb[SL+1],bl_msb[SL+1],tr_msb[SL+1],tr_lsb[SL+1],br_msb[SL+1],br_lsb[SL+1],q_tl[OL+1+PSH],q_bl[OL+1+PSH],q_tr[OL+1+PSH],q_br[OL+1+PSH],tl_evsh[PSH+1],bl_evsh[PSH+1],tr_evsh[PSH+1],br_evsh[PSH+1],tl_ev[PSH+1],bl_ev[PSH+1+AL],tl_od[PSH+1],bl_od[PSH+1+AL],tr_ev[PSH+1],br_ev[PSH+1+AL],tr_od[PSH+1],br_od[PSH+1+AL],l_ev[AL+1],l_od[AL+1],ev_sum[AL+1],od_sum[AL+1],r_ev[AL+1],r_od[AL+1],ev_tot[AL+1],od_tot[AL+1],ev_avg[AVL+1],od_avg[AVL+1],packed[ML+1]
	.rotp p_ld[LL], p_sh[SL],p_or[OL],p_evsh[PSH],p_split[PSH],p_pair[AL],p_quad[AL],p_rnd[AL],p_avg[AVL],p_mix[ML],p_st[STL]

.Lhalfpel_hv_row:
	// load the aligned qword pairs that straddle this row and the row below
	(p_ld[0])	ld8 top_lo[0] = [r14],r16
	(p_ld[0])	ld8 top_hi[0] = [r18],r16
	(p_ld[0])	ld8 bot_lo[0] = [r19],r16
	(p_ld[0])	ld8 bot_hi[0] = [r26],r16

	// shift every qword into place: tl/bl start at src, tr/br at src+1
	(p_sh[0])	shr.u tl_lsb[0] = top_lo[LL],r22
	(p_sh[0])	shl tl_msb[0] = top_hi[LL],r24
	(p_sh[0])	shr.u bl_lsb[0] = bot_lo[LL],r22
	(p_sh[0])	shl bl_msb[0] = bot_hi[LL],r24
	(p_sh[0])	shr.u tr_lsb[0] = top_lo[LL],r27
	(p_sh[0])	shl tr_msb[0] = top_hi[LL],r28
	(p_sh[0])	shr.u br_lsb[0] = bot_lo[LL],r27
	(p_sh[0])	shl br_msb[0] = bot_hi[LL],r28

	// merge the halves into the four 8-byte neighbour vectors
	(p_or[0])	or q_tl[0] = tl_lsb[SL],tl_msb[SL]
	(p_or[0])	or q_bl[0] = bl_lsb[SL],bl_msb[SL]
	(p_or[0])	or q_tr[0] = tr_lsb[SL],tr_msb[SL]
	(p_or[0])	or q_br[0] = br_lsb[SL],br_msb[SL]

	// move the even bytes to the top of each 16-bit lane ...
	(p_evsh[0])	pshl2 tl_evsh[0] = q_tl[OL],8
	(p_evsh[0])	pshl2 bl_evsh[0] = q_bl[OL],8
	(p_evsh[0])	pshl2 tr_evsh[0] = q_tr[OL],8
	(p_evsh[0])	pshl2 br_evsh[0] = q_br[OL],8

	// ... and back down, leaving even (_ev) and odd (_od) bytes zero-extended
	(p_split[0])	pshr2.u tl_ev[0] = tl_evsh[PSH],8
	(p_split[0])	pshr2.u bl_ev[0] = bl_evsh[PSH],8
	(p_split[0])	pshr2.u tl_od[0] = q_tl[PSH+OL],8
	(p_split[0])	pshr2.u bl_od[0] = q_bl[PSH+OL],8
	(p_split[0])	pshr2.u tr_ev[0] = tr_evsh[PSH],8
	(p_split[0])	pshr2.u br_ev[0] = br_evsh[PSH],8
	(p_split[0])	pshr2.u tr_od[0] = q_tr[PSH+OL],8
	(p_split[0])	pshr2.u br_od[0] = q_br[PSH+OL],8

	// vertical pair sums: top + bottom for the left and the right column
	(p_pair[0])	padd2.sss l_ev[0] = tl_ev[PSH],bl_ev[PSH]
	(p_pair[0])	padd2.sss l_od[0] = tl_od[PSH],bl_od[PSH]
	(p_pair[0])	padd2.sss r_ev[0] = tr_ev[PSH],br_ev[PSH]
	(p_pair[0])	padd2.sss r_od[0] = tr_od[PSH],br_od[PSH]

	// sum of all four neighbours
	(p_quad[0])	padd2.uus ev_sum[0] = l_ev[AL],r_ev[AL]
	(p_quad[0])	padd2.uus od_sum[0] = l_od[AL],r_od[AL]

	// ... + (2 - rounding)
	(p_rnd[0])	padd2.uus ev_tot[0] = ev_sum[AL],r17
	(p_rnd[0])	padd2.uus od_tot[0] = od_sum[AL],r17

	// divide every lane by four
	(p_avg[0])	pshr2.u ev_avg[0] = ev_tot[AL],2
	(p_avg[0])	pshr2.u od_avg[0] = od_tot[AL],2

	// interleave the even and odd results back into 8 consecutive bytes
	(p_mix[0])	mix1.r packed[0] = od_avg[AVL],ev_avg[AVL]

	// write the finished row and step dst to the next one
	(p_st[0])	st8 [r15] = packed[ML]
	(p_st[0])	add r15 = r15,r16

	br.ctop.sptk.few .Lhalfpel_hv_row
	;;
	mov ar.lc = r20			// give the caller its loop counter back
	mov pr = r21,-1			// and all of its predicates
	br.ret.sptk.many b0
	.endp interpolate8x8_halfpel_hv_ia64#

343 : | |||
344 : |
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |