[svn] / trunk / xvidcore / src / image / ia64_asm / interpolate8x8_ia64_exact.s Repository:
ViewVC logotype

Annotation of /trunk/xvidcore/src/image/ia64_asm/interpolate8x8_ia64_exact.s

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1855 - (view) (download)

1 : Isibaar 1855 // ****************************************************************************
2 :     // *
3 :     // * XVID MPEG-4 VIDEO CODEC
4 :     // * - IA64 halfpel interpolation -
5 :     // *
6 :     // * Copyright(C) 2002 Kai Kühn, Alexander Viehl
7 :     // *
8 :     // * This program is free software; you can redistribute it and/or modify it
9 :     // * under the terms of the GNU General Public License as published by
10 :     // * the Free Software Foundation; either version 2 of the License, or
11 :     // * (at your option) any later version.
12 :     // *
13 :     // * This program is distributed in the hope that it will be useful,
14 :     // * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 :     // * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 :     // * GNU General Public License for more details.
17 :     // *
18 :     // * You should have received a copy of the GNU General Public License
19 :     // * along with this program; if not, write to the Free Software
20 :     // * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 :     // *
22 :     // * $Id: interpolate8x8_ia64_exact.s,v 1.2 2009-02-19 17:07:29 Isibaar Exp $
23 :     // *
24 :     // ***************************************************************************/
25 :     //
26 :     // ****************************************************************************
27 :     // *
28 :     // * interpolate8x8_ia64_exact.s, IA-64 halfpel interpolation
29 :     // *
30 :     // * This version was implemented during an IA-64 practical training at
31 :     // * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/)
32 :     // *
33 :     // ****************************************************************************
34 :    
35 : ia64p 320 // ***********************************
36 :     // interpolate8x8_ia64.s
37 :     // optimized for IA-64
38 :     // Authors : Kai Kühn
39 :     // Alexander Viehl
40 :     // last update : 13.7.2002
41 :     // ***********************************
42 :     .file "interpolate8x8_ia64.s" // name recorded for the assembler (the Id line above calls this file interpolate8x8_ia64_exact.s)
43 :     .pred.safe_across_calls p1-p5,p16-p63 // NOTE(review): presumably declares which predicates are assumed preserved across calls - confirm in the assembler manual
44 :     .text // the procedures below are assembled into the text section
// ----------------------------------------------------------------------------
// interpolate8x8_halfpel_h_ia64
//
// Horizontal half-pel interpolation of 8 rows of 8 bytes. For every byte x
// of a row the loop computes, in 16-bit lanes,
//     dest[x] = (src[x] + src[x+1] + (1 - rounding)) >> 1
//
// Inputs as they are used below:
//     r32 = dest      (one st8 per row, then advanced by stride)
//     r33 = src       (low 3 address bits are handled with shifts)
//     r34 = stride    (post-increment for the loads and for dest)
//     r35 = rounding
// NOTE(review): dest is written with a plain st8, so it is presumably
// expected to be 8-byte aligned - confirm with the callers.
// NOTE(review): the shift-based realignment implies little-endian loads.
//
// Scratch registers: r9 (receives ar.pfs), r14 (aligned src), r15 (dest),
// r16 (stride), r17 (1-rounding in all four 16-bit lanes), r18 (aligned
// src + 8), r22/r24/r26/r27 (shift counts), r20/r21 (saved ar.lc / pr,
// restored before returning).
// ----------------------------------------------------------------------------
45 :     .align 16
46 :     .global interpolate8x8_halfpel_h_ia64#
47 :     .proc interpolate8x8_halfpel_h_ia64#
48 :     interpolate8x8_halfpel_h_ia64:
         // Stage lengths of the software pipeline, used to size the rotating
         // register and predicate groups declared by .rotr / .rotp below.
         // SL2 and OL2 are defined but not referenced in this procedure.
49 :     LL=3
50 :     SL=1
51 :     SL2=1
52 :     OL=1
53 :     OL2=1
54 :     AVL=1
55 :     AL=1
56 :     PSH=1
57 :     ML=1
58 :     STL=3
59 :    
         // Frame: 4 inputs, 60 locals, 0 outputs, 64 rotating registers.
         // The rotating region starts at r32, so the inputs are copied to
         // r14-r17/r22 before the loop starts writing rotating registers.
60 :     alloc r9=ar.pfs,4, 60,0,64
61 :    
62 :     mov r20 = ar.lc // save loop counter, restored after the loop
63 :     mov r21 = pr // save predicates, restored after the loop
64 :    
65 :     dep.z r22 = r33,3,3 // r22 = (src & 7) * 8: rshift of src in bits
66 :    
67 :     and r14 = -8,r33 // align src down to 8 bytes
68 :     mov r15 = r32 // get dest
69 :     mov r16 = r34 // stride
70 :     sub r17 = 1,r35 // 1-rounding
71 :    
72 :     ;;
73 :    
74 :     add r18 = 8,r14 // address of the following aligned 8-byte word
75 :     mux2 r17 = r17, 0x00 // broadcast 1-rounding to all four 16-bit lanes
76 :    
77 :     sub r24 = 64,r22 // lshift of src
78 :     add r26 = 8,r22 // rshift of src+1
79 :     sub r27 = 56,r22 // lshift of src+1
80 :    
81 :     mov ar.lc = 7 // loopcounter: 8 rows
82 :     mov ar.ec = LL + SL +OL + AVL + STL + 2*PSH + 2*AL + ML // sum of latencies
83 :     mov pr.rot = 1 << 16 // init pr regs for sw-pipelining: only p16 set
84 :    
85 :     ;;
86 :     .rotr ald1[LL+1],ald2[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1+PSH],or2[OL+1+PSH],pshb1[PSH+1],pshb2[PSH+1],pshe1[PSH+1],pshe2[PSH+1+AL],pshe3[PSH+1],pshe4[PSH+1+AL],add1[AL+1],add2[AL+1],add3[AL+1],add4[AL+1],avg1[AVL+1],avg2[AVL+1],pmix1[ML+1]
87 :     .rotp aldp[LL], sh1p[SL], or1p[OL], pshb[PSH], pshe[PSH],addp[AL],add2p[AL],pavg1p[AVL],mixp[ML],stp[STL]
88 :    
         // Loop body. Each predicate group enables one pipeline stage, and an
         // index such as ald1[LL] names the value written LL iterations ago.
89 :     .Lloop_interpolate:
90 :     (aldp[0]) ld8 ald1[0] = [r14],r16 // load aligned src
91 :     (aldp[0]) ld8 ald2[0] = [r18],r16 // and aligned src+8
92 :    
93 :     (sh1p[0]) shr.u shru1[0] = ald1[LL],r22 // get src
94 :     (sh1p[0]) shl shl1[0] = ald2[LL],r27 // upper part of src+1
95 :     (sh1p[0]) shr.u shru2[0] = ald1[LL],r26 // get src+1
96 :     (sh1p[0]) shl shl2[0] = ald2[LL],r24 // upper part of src
97 :    
98 :     (or1p[0]) or or1[0] = shru1[SL],shl2[SL] // merge things: or1 = 8 bytes at src
99 :     (or1p[0]) or or2[0] = shru2[SL],shl1[SL] // or2 = 8 bytes at src+1
100 :    
          // Split into 16-bit lanes: shifting left then right by 8 keeps the
          // low byte of each lane, shifting right by 8 keeps the high byte.
101 :     (pshb[0]) pshl2 pshb1[0] = or1[OL],8
102 :     (pshb[0]) pshl2 pshb2[0] = or2[OL],8
103 :    
104 :     (pshe[0]) pshr2.u pshe1[0] = pshb1[PSH],8 // low bytes of src
105 :     (pshe[0]) pshr2.u pshe2[0] = pshb2[PSH],8 // low bytes of src+1
106 :     (pshe[0]) pshr2.u pshe3[0] = or1[PSH+OL],8 // high bytes of src
107 :     (pshe[0]) pshr2.u pshe4[0] = or2[PSH+OL],8 // high bytes of src+1
108 :    
109 :     (addp[0]) padd2.sss add1[0] = pshe1[PSH],r17 // add 1-rounding
110 :     (addp[0]) padd2.sss add2[0] = pshe3[PSH],r17 // add 1-rounding
111 :    
112 :     (add2p[0]) padd2.uus add3[0] = pshe2[AL+PSH],add1[AL] // + src+1, low bytes
113 :     (add2p[0]) padd2.uus add4[0] = pshe4[AL+PSH],add2[AL] // + src+1, high bytes
114 :    
115 :     (pavg1p[0]) pshr2.u avg1[0] = add3[AL],1 // parallel average
116 :     (pavg1p[0]) pshr2.u avg2[0] = add4[AL],1 // parallel average
117 :    
118 :     (mixp[0]) mix1.r pmix1[0] = avg2[AVL],avg1[AVL] // recombine both halves into 8 packed bytes
119 :    
120 :     (stp[0]) st8 [r15] = pmix1[ML] // store results
121 :     (stp[0]) add r15 = r15,r16 // next dest row
122 :    
123 :    
124 :    
125 :    
126 :     br.ctop.sptk.few .Lloop_interpolate
127 :     ;;
128 :     mov ar.lc = r20 // restore loop counter
129 :     mov pr = r21,-1 // restore all predicates
130 :     br.ret.sptk.many b0
131 :     .endp interpolate8x8_halfpel_h_ia64#
132 :    
// ----------------------------------------------------------------------------
// interpolate8x8_halfpel_v_ia64
//
// Vertical half-pel interpolation of 8 rows of 8 bytes. For every byte x of
// a row the loop computes, in 16-bit lanes,
//     dest[x] = (src[x] + src[x + stride] + (1 - rounding)) >> 1
//
// Inputs as they are used below:
//     r32 = dest      (one st8 per row, then advanced by stride)
//     r33 = src       (low 3 address bits are handled with shifts)
//     r34 = stride    (post-increment for the loads and for dest)
//     r35 = rounding
// NOTE(review): dest is written with a plain st8, so it is presumably
// expected to be 8-byte aligned - confirm with the callers.
// NOTE(review): the shift-based realignment implies little-endian loads.
//
// Scratch registers: r9 (receives ar.pfs), r14/r18 (aligned src and
// src + 8), r19/r26 (the same two addresses one row further down), r15 (dest),
// r16 (stride), r17 (1-rounding in all four 16-bit lanes), r22/r24 (shift
// counts), r20/r21 (saved ar.lc / pr, restored before returning).
// ----------------------------------------------------------------------------
133 :     .align 16
134 :     .global interpolate8x8_halfpel_v_ia64#
135 :     .proc interpolate8x8_halfpel_v_ia64#
136 :     interpolate8x8_halfpel_v_ia64:
          // Stage lengths of the software pipeline, used to size the rotating
          // register and predicate groups declared by .rotr / .rotp below.
          // SL2 and OL2 are defined but not referenced in this procedure.
137 :     LL=3
138 :     SL=1
139 :     SL2=1
140 :     OL=1
141 :     OL2=1
142 :     AVL=1
143 :     AL=1
144 :     PSH=1
145 :     ML=1
146 :     STL=3
147 :    
          // Frame: 4 inputs, 60 locals, 0 outputs, 64 rotating registers.
          // The rotating region starts at r32, so the inputs are copied out
          // before the loop starts writing rotating registers.
148 :     alloc r9=ar.pfs,4, 60,0,64
149 :    
150 :     mov r20 = ar.lc // save loop counter, restored after the loop
151 :     mov r21 = pr // save predicates, restored after the loop
152 :    
153 :     dep.z r22 = r33,3,3 // r22 = (src & 7) * 8: right-shift count in bits
154 :    
155 :     and r14 = -8,r33 // align src down to 8 bytes
156 :     mov r15 = r32 // dest
157 :     mov r16 = r34 // stride
158 :     sub r17 = 1,r35 // 1-rounding
159 :     ;;
160 :    
161 :     add r18 = 8,r14 // address of the following aligned 8-byte word
162 :     add r19 = r14,r16 // src + stride
163 :     mux2 r17 = r17, 0x00 // broadcast 1-rounding to all four 16-bit lanes
164 :    
165 :     sub r24 = 64,r22 // matching left-shift count
166 :     ;;
167 :     add r26 = 8,r19 // src + stride + 8
168 :    
169 :     mov ar.lc = 7 // 8 rows
170 :     mov ar.ec = LL + SL +OL + AVL + STL + 2*PSH + 2*AL + ML // sum of latencies
171 :     mov pr.rot = 1 << 16 // start the pipeline with only p16 set
172 :    
173 :     ;;
174 :     .rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1+PSH],or2[OL+1+PSH],pshb1[PSH+1],pshb2[PSH+1],pshe1[PSH+1],pshe2[PSH+1+AL],pshe3[PSH+1],pshe4[PSH+1+AL],add1[AL+1],add2[AL+1],add3[AL+1],add4[AL+1],avg1[AVL+1],avg2[AVL+1],pmix1[ML+1]
175 :     .rotp aldp[LL], sh1p[SL], or1p[OL], pshb[PSH], pshe[PSH],addp[AL],add2p[AL],pavg1p[AVL],mixp[ML],stp[STL]
176 :    
177 :    
178 :    
          // Loop body. Each predicate group enables one pipeline stage, and an
          // index such as ald1[LL] names the value written LL iterations ago.
179 :     .Lloop_interpolate2:
180 :     (aldp[0]) ld8 ald1[0] = [r14],r16 // current row, aligned word
181 :     (aldp[0]) ld8 ald2[0] = [r18],r16 // current row, following word
182 :     (aldp[0]) ld8 ald3[0] = [r19],r16 // row below, aligned word
183 :     (aldp[0]) ld8 ald4[0] = [r26],r16 // row below, following word
184 :    
185 :     (sh1p[0]) shr.u shru1[0] = ald1[LL],r22
186 :     (sh1p[0]) shl shl1[0] = ald2[LL],r24
187 :     (sh1p[0]) shr.u shru2[0] = ald3[LL],r22
188 :     (sh1p[0]) shl shl2[0] = ald4[LL],r24
189 :    
190 :     (or1p[0]) or or1[0] = shru1[SL],shl1[SL] // or1 = 8 bytes of the current row
191 :     (or1p[0]) or or2[0] = shru2[SL],shl2[SL] // or2 = 8 bytes of the row below
192 :    
          // Split into 16-bit lanes: shifting left then right by 8 keeps the
          // low byte of each lane, shifting right by 8 keeps the high byte.
193 :     (pshb[0]) pshl2 pshb1[0] = or1[OL],8
194 :     (pshb[0]) pshl2 pshb2[0] = or2[OL],8
195 :    
196 :     (pshe[0]) pshr2.u pshe1[0] = pshb1[PSH],8 // low bytes, current row
197 :     (pshe[0]) pshr2.u pshe2[0] = pshb2[PSH],8 // low bytes, row below
198 :     (pshe[0]) pshr2.u pshe3[0] = or1[PSH+OL],8 // high bytes, current row
199 :     (pshe[0]) pshr2.u pshe4[0] = or2[PSH+OL],8 // high bytes, row below
200 :    
201 :     (addp[0]) padd2.sss add1[0] = pshe1[PSH],r17 // add 1-rounding
202 :     (addp[0]) padd2.sss add2[0] = pshe3[PSH],r17 // add 1-rounding
203 :    
204 :     (add2p[0]) padd2.uus add3[0] = pshe2[AL+PSH],add1[AL] // + row below, low bytes
205 :     (add2p[0]) padd2.uus add4[0] = pshe4[AL+PSH],add2[AL] // + row below, high bytes
206 :    
207 :     (pavg1p[0]) pshr2.u avg1[0] = add3[AL],1 // parallel average
208 :     (pavg1p[0]) pshr2.u avg2[0] = add4[AL],1 // parallel average
209 :    
210 :     (mixp[0]) mix1.r pmix1[0] = avg2[AVL],avg1[AVL] // recombine both halves into 8 packed bytes
211 :    
212 :     (stp[0]) st8 [r15] = pmix1[ML] // store one result row
213 :     (stp[0]) add r15 = r15,r16 // next dest row
214 :    
215 :    
216 :    
217 :    
218 :     br.ctop.sptk.few .Lloop_interpolate2
219 :     ;;
220 :     mov ar.lc = r20 // restore loop counter
221 :     mov pr = r21,-1 // restore all predicates
222 :     br.ret.sptk.many b0
223 :     .endp interpolate8x8_halfpel_v_ia64#
224 :    
// ----------------------------------------------------------------------------
// interpolate8x8_halfpel_hv_ia64
//
// Diagonal half-pel interpolation of 8 rows of 8 bytes. For every byte x of
// a row the loop computes, in 16-bit lanes,
//     dest[x] = (src[x] + src[x+1] + src[x + stride] + src[x+1 + stride]
//                + (2 - rounding)) >> 2
//
// Inputs as they are used below:
//     r32 = dest      (one st8 per row, then advanced by stride)
//     r33 = src       (low 3 address bits are handled with shifts)
//     r34 = stride    (post-increment for the loads and for dest)
//     r35 = rounding
// NOTE(review): dest is written with a plain st8, so it is presumably
// expected to be 8-byte aligned - confirm with the callers.
// NOTE(review): the shift-based realignment implies little-endian loads.
//
// Scratch registers: r9 (receives ar.pfs), r14/r18 (aligned src and
// src + 8), r19/r26 (the same two addresses one row further down), r15 (dest),
// r16 (stride), r17 (2-rounding in all four 16-bit lanes), r22/r24 (shift
// counts for src), r27/r28 (shift counts for src+1), r20/r21 (saved
// ar.lc / pr, restored before returning).
// ----------------------------------------------------------------------------
225 :     .align 16
226 :     .global interpolate8x8_halfpel_hv_ia64#
227 :     .proc interpolate8x8_halfpel_hv_ia64#
228 :     interpolate8x8_halfpel_hv_ia64:
          // Stage lengths of the software pipeline, used to size the rotating
          // register and predicate groups declared by .rotr / .rotp below.
          // SL2 and OL2 are defined but not referenced in this procedure.
229 :     LL=3
230 :     SL=1
231 :     SL2=1
232 :     OL=1
233 :     OL2=1
234 :     AVL=1
235 :     AL=1
236 :     PSH=1
237 :     ML=1
238 :     STL=3
239 :    
          // Frame: 4 inputs, 92 locals, 0 outputs, 96 rotating registers.
          // The rotating region starts at r32, so the inputs are copied out
          // before the loop starts writing rotating registers.
240 :     alloc r9=ar.pfs,4, 92,0,96
241 :    
242 :     mov r20 = ar.lc // save loop counter, restored after the loop
243 :     mov r21 = pr // save predicates, restored after the loop
244 :    
245 :     dep.z r22 = r33,3,3 // r22 = (src & 7) * 8: right-shift count for src
246 :    
247 :     and r14 = -8,r33 // align src down to 8 bytes
248 :     mov r15 = r32 // dest
249 :     mov r16 = r34 // stride
250 :     sub r17 = 2,r35 // 2-rounding
251 :     ;;
252 :    
253 :     add r18 = 8,r14 // address of the following aligned 8-byte word
254 :     add r19 = r14,r16 // src + stride
255 :     mux2 r17 = r17, 0x00 // broadcast 2-rounding to all four 16-bit lanes
256 :    
257 :     add r27 = 8,r22 // right-shift count for src+1
258 :     sub r28 = 56,r22 // left-shift count for src+1
259 :     sub r24 = 64,r22 // left-shift count for src
260 :     ;;
261 :     add r26 = 8,r19 // src + stride + 8
262 :    
263 :     mov ar.lc = 7 // 8 rows
          // NOTE(review): this formula has no term for the extra add3p stage
          // used below. Only stp[0] of the STL store predicates is used, so
          // the count appears to be large enough anyway - confirm.
264 :     mov ar.ec = LL + SL +OL + AVL + STL + 2*PSH + 2*AL + ML // sum of latencies
265 :    
266 :     mov pr.rot = 1 << 16 // start the pipeline with only p16 set
267 :    
268 :     ;;
269 :     .rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],shl3[SL+1],shru3[SL+1],shl4[SL+1],shru4[SL+1],or1[OL+1+PSH],or2[OL+1+PSH],or3[OL+1+PSH],or4[OL+1+PSH],pshb1[PSH+1],pshb2[PSH+1],pshb3[PSH+1],pshb4[PSH+1],pshe1[PSH+1],pshe2[PSH+1+AL],pshe3[PSH+1],pshe4[PSH+1+AL],pshe5[PSH+1],pshe6[PSH+1+AL],pshe7[PSH+1],pshe8[PSH+1+AL],add1[AL+1],add2[AL+1],add3[AL+1],add4[AL+1],add5[AL+1],add6[AL+1],add7[AL+1],add8[AL+1],avg1[AVL+1],avg2[AVL+1],pmix1[ML+1]
270 :     .rotp aldp[LL], sh1p[SL],or1p[OL],pshb[PSH],pshe[PSH],addp[AL],add2p[AL],add3p[AL],pavg1p[AVL],mixp[ML],stp[STL]
271 :    
272 :    
273 :    
          // Loop body. Each predicate group enables one pipeline stage, and an
          // index such as ald1[LL] names the value written LL iterations ago.
274 :     .Lloop_interpolate3:
275 :     (aldp[0]) ld8 ald1[0] = [r14],r16 // current row, aligned word
276 :     (aldp[0]) ld8 ald2[0] = [r18],r16 // current row, following word
277 :     (aldp[0]) ld8 ald3[0] = [r19],r16 // row below, aligned word
278 :     (aldp[0]) ld8 ald4[0] = [r26],r16 // row below, following word
279 :    
280 :     (sh1p[0]) shr.u shru1[0] = ald1[LL],r22
281 :     (sh1p[0]) shl shl1[0] = ald2[LL],r24
282 :     (sh1p[0]) shr.u shru2[0] = ald3[LL],r22
283 :     (sh1p[0]) shl shl2[0] = ald4[LL],r24
284 :     (sh1p[0]) shr.u shru3[0] = ald1[LL],r27
285 :     (sh1p[0]) shl shl3[0] = ald2[LL],r28
286 :     (sh1p[0]) shr.u shru4[0] = ald3[LL],r27
287 :     (sh1p[0]) shl shl4[0] = ald4[LL],r28
288 :    
289 :    
290 :     (or1p[0]) or or1[0] = shru1[SL],shl1[SL] // or1 = current row at src
291 :     (or1p[0]) or or2[0] = shru2[SL],shl2[SL] // or2 = row below at src
292 :     (or1p[0]) or or3[0] = shru3[SL],shl3[SL] // or3 = current row at src+1
293 :     (or1p[0]) or or4[0] = shru4[SL],shl4[SL] // or4 = row below at src+1
294 :    
          // Split into 16-bit lanes: shifting left then right by 8 keeps the
          // low byte of each lane, shifting right by 8 keeps the high byte.
295 :     (pshb[0]) pshl2 pshb1[0] = or1[OL],8
296 :     (pshb[0]) pshl2 pshb2[0] = or2[OL],8
297 :     (pshb[0]) pshl2 pshb3[0] = or3[OL],8
298 :     (pshb[0]) pshl2 pshb4[0] = or4[OL],8
299 :    
300 :    
301 :     (pshe[0]) pshr2.u pshe1[0] = pshb1[PSH],8 // low bytes of or1
302 :     (pshe[0]) pshr2.u pshe2[0] = pshb2[PSH],8 // low bytes of or2
303 :     (pshe[0]) pshr2.u pshe3[0] = or1[PSH+OL],8 // high bytes of or1
304 :     (pshe[0]) pshr2.u pshe4[0] = or2[PSH+OL],8 // high bytes of or2
305 :     (pshe[0]) pshr2.u pshe5[0] = pshb3[PSH],8 // low bytes of or3
306 :     (pshe[0]) pshr2.u pshe6[0] = pshb4[PSH],8 // low bytes of or4
307 :     (pshe[0]) pshr2.u pshe7[0] = or3[PSH+OL],8 // high bytes of or3
308 :     (pshe[0]) pshr2.u pshe8[0] = or4[PSH+OL],8 // high bytes of or4
309 :    
310 :    
311 :    
312 :     (addp[0]) padd2.sss add1[0] = pshe1[PSH],pshe2[PSH] // or1 + or2, low bytes
313 :     (addp[0]) padd2.sss add2[0] = pshe3[PSH],pshe4[PSH] // or1 + or2, high bytes
314 :     (addp[0]) padd2.sss add5[0] = pshe5[PSH],pshe6[PSH] // or3 + or4, low bytes
315 :     (addp[0]) padd2.sss add6[0] = pshe7[PSH],pshe8[PSH] // or3 + or4, high bytes
316 :    
317 :    
318 :     (add2p[0]) padd2.uus add3[0] = add1[AL],add5[AL] // sum of all four, low bytes
319 :     (add2p[0]) padd2.uus add4[0] = add2[AL],add6[AL] // sum of all four, high bytes
320 :    
321 :     (add3p[0]) padd2.uus add7[0] = add3[AL],r17 // add 2-rounding
322 :     (add3p[0]) padd2.uus add8[0] = add4[AL],r17 // add 2-rounding
323 :    
324 :    
325 :     (pavg1p[0]) pshr2.u avg1[0] = add7[AL],2 // parallel average
326 :     (pavg1p[0]) pshr2.u avg2[0] = add8[AL],2 // parallel average
327 :    
328 :     (mixp[0]) mix1.r pmix1[0] = avg2[AVL],avg1[AVL] // recombine both halves into 8 packed bytes
329 :    
330 :    
331 :     (stp[0]) st8 [r15] = pmix1[ML] // store one result row
332 :     (stp[0]) add r15 = r15,r16 // next dest row
333 :    
334 :    
335 :    
336 :    
337 :     br.ctop.sptk.few .Lloop_interpolate3
338 :     ;;
339 :     mov ar.lc = r20 // restore loop counter
340 :     mov pr = r21,-1 // restore all predicates
341 :     br.ret.sptk.many b0
342 :     .endp interpolate8x8_halfpel_hv_ia64#
343 :    
344 :    

No admin address has been configured
ViewVC Help
Powered by ViewVC 1.0.4