[svn] / trunk / xvidcore / src / image / ia64_asm / interpolate8x8_ia64.s Repository:
ViewVC logotype

Annotation of /trunk/xvidcore/src/image/ia64_asm/interpolate8x8_ia64.s

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1855 - (view) (download)

1 : Isibaar 1855 // ****************************************************************************
2 :     // *
3 :     // * XVID MPEG-4 VIDEO CODEC
4 :     // * - IA64 halfpel interpolation -
5 :     // *
6 :     // * Copyright(C) 2002 Kai Kühn, Alexander Viehl
7 :     // *
8 :     // * This program is free software; you can redistribute it and/or modify it
9 :     // * under the terms of the GNU General Public License as published by
10 :     // * the Free Software Foundation; either version 2 of the License, or
11 :     // * (at your option) any later version.
12 :     // *
13 :     // * This program is distributed in the hope that it will be useful,
14 :     // * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 :     // * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 :     // * GNU General Public License for more details.
17 :     // *
18 :     // * You should have received a copy of the GNU General Public License
19 :     // * along with this program; if not, write to the Free Software
20 :     // * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 :     // *
22 :     // * $Id: interpolate8x8_ia64.s,v 1.6 2009-02-19 17:07:29 Isibaar Exp $
23 :     // *
24 :     // ***************************************************************************/
25 :     //
26 :     // ****************************************************************************
27 :     // *
28 :     // * interpolate8x8_ia64.s, IA-64 halfpel interpolation
29 :     // *
30 :     // * This version was implemented during an IA-64 practical training at
31 :     // * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/)
32 :     // *
33 :     // ****************************************************************************
34 : ia64p 244
35 :     .file "interpolate8x8_ia64.s"
         // Assembler annotation: predicates p1-p5 and p16-p63 are declared as
         // safe (assumed unchanged) across calls.
36 :     .pred.safe_across_calls p1-p5,p16-p63
         // Everything below is emitted into the code section.
37 :     .text
38 :     .align 16
         // ------------------------------------------------------------------
         // interpolate8x8_halfpel_h_ia64
         //
         // Horizontal half-pel interpolation. For each of 8 rows the routine
         // stores the byte-wise pavg1 of the 8 bytes starting at src and the
         // 8 bytes starting at src+1.
         //
         // Inputs (alloc declares 4 input registers):
         //   r32  destination pointer (copied to r15)
         //   r33  source pointer; any byte alignment, handled by shift/merge
         //   r34  stride, added to the source and destination pointers per row
         //   r35  never read by this routine
         //        NOTE(review): presumably the rounding flag; the rounding
         //        code below is commented out -- confirm against the caller.
         //
         // Registers written: r9, r14-r16, r18, r20-r22, r24, r26, r27 and the
         // rotating registers named by .rotr/.rotp. ar.lc and pr are saved in
         // r20/r21 and restored before returning.
         //
         // NOTE(review): the shift directions used to extract unaligned bytes
         // assume the lowest address sits in the low byte of a loaded word
         // (little-endian data) -- confirm.
         // NOTE(review): rows are written with st8, so dest is presumably
         // expected to be 8-byte aligned -- confirm.
         // ------------------------------------------------------------------
39 :     .global interpolate8x8_halfpel_h_ia64#
40 :     .proc interpolate8x8_halfpel_h_ia64#
41 :     interpolate8x8_halfpel_h_ia64:
         // Per-stage latencies of the software pipeline. They size the rotating
         // register/predicate groups and ar.ec below. SL2 and OL2 are defined
         // but not referenced in this procedure.
42 :     LL=3
43 :     SL=1
44 :     SL2=1
45 :     OL=1
46 :     OL2=1
47 :     AVL=1
48 :     AL=1
49 :     STL=3
50 :    
         // Frame: 4 inputs, 60 locals, 0 outputs, 64 rotating registers.
51 :     alloc r9=ar.pfs,4, 60,0,64
52 :    
         // Save the loop counter and all predicates; both are clobbered by the
         // counted loop and restored in the exit sequence.
53 :     mov r20 = ar.lc
54 :     mov r21 = pr
55 :    
         // r22 = (src & 7) << 3: bit offset of src inside its aligned 8-byte word.
56 :     dep.z r22 = r33,3,3 // rshift of src
57 :    
58 :     and r14 = -8,r33 // align src
59 :     mov r15 = r32 // get dest
60 :     mov r16 = r34 // stride
61 : ia64p 291 // sub r17 = 0,r0 // 1-rounding
62 :    
63 : ia64p 244 ;;
64 :    
         // r18 = address of the aligned word following r14.
65 :     add r18 = 8,r14
66 : ia64p 291 // mux1 r17 = r17, @brcst // broadcast 1-rounding
67 : ia64p 244
         // Shift counts: (r22, r24) extract the 8 bytes at src,
         // (r26, r27) extract the 8 bytes at src+1.
68 :     sub r24 = 64,r22 // lshift of src
69 :     add r26 = 8,r22 // rshift of src+1
70 :     sub r27 = 56,r22 // lshift of src+1
71 :    
         // ar.lc = 7 gives 8 row iterations; ar.ec (= 9 here) adds the epilogue
         // passes that drain the pipeline. pr.rot = 1<<16 sets p16 and clears
         // the remaining rotating predicates.
72 :     mov ar.lc = 7 // loopcounter
73 : ia64p 291 mov ar.ec = LL + SL +OL + AVL + STL // sum of latencies
74 : ia64p 244 mov pr.rot = 1 << 16 // init pr regs for sw-pipeling
75 :    
76 :     ;;
         // Name the rotating registers and the stage predicates. or2 is given AL
         // extra registers; only or2[0] and or2[OL] are referenced below.
77 : ia64p 291 .rotr ald1[LL+1],ald2[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1]
78 :     .rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL]
79 : ia64p 244
80 :    
         // Software-pipelined loop body. name[k] refers to the value written to
         // name[0] k rotations (iterations) earlier.
81 : ia64p 246 .Lloop_interpolate:
         // Load stage: two adjacent aligned words of the current row; both
         // pointers post-increment by the stride.
82 : ia64p 244 (aldp[0]) ld8 ald1[0] = [r14],r16 // load aligned src
83 :     (aldp[0]) ld8 ald2[0] = [r18],r16 // and aligned src+8
84 :    
         // Shift stage: split each loaded word into the parts belonging to the
         // src window (r22/r24) and the src+1 window (r26/r27).
85 :     (sh1p[0]) shr.u shru1[0] = ald1[LL],r22 // get src
86 :     (sh1p[0]) shl shl1[0] = ald2[LL],r27
87 :     (sh1p[0]) shr.u shru2[0] = ald1[LL],r26 // get src+1
88 :     (sh1p[0]) shl shl2[0] = ald2[LL],r24
89 :    
         // Merge stage: or1 = 8 bytes at src, or2 = 8 bytes at src+1.
90 :     (or1p[0]) or or1[0] = shru1[SL],shl2[SL] // merge things
91 :     (or1p[0]) or or2[0] = shru2[SL],shl1[SL]
92 :    
93 : ia64p 291 // (addp[0]) padd1.uus add1[0] = or1[OL],r17 // add 1-rounding
94 : ia64p 244
95 : ia64p 291 (pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] // parallel average
96 : ia64p 244
         // Store stage: write one 8-byte row, then advance dest by the stride.
97 :     (stp[0]) st8 [r15] = avg[AVL] // store results
98 :     (stp[0]) add r15 = r15,r16
99 :    
100 :    
101 :    
102 :    
103 : ia64p 246 br.ctop.sptk.few .Lloop_interpolate
104 : ia64p 244 ;;
         // Exit: restore the saved loop counter and predicates, then return.
105 :     mov ar.lc = r20
106 :     mov pr = r21,-1
107 :     br.ret.sptk.many b0
108 :     .endp interpolate8x8_halfpel_h_ia64#
109 :    
110 :     .align 16
          // ------------------------------------------------------------------
          // interpolate8x8_halfpel_v_ia64
          //
          // Vertical half-pel interpolation. For each of 8 rows the routine
          // stores the byte-wise pavg1 of the 8 bytes starting at src and the
          // 8 bytes starting at src+stride.
          //
          // Inputs (alloc declares 4 input registers):
          //   r32  destination pointer (copied to r15)
          //   r33  source pointer; any byte alignment, handled by shift/merge
          //   r34  stride, added to the source and destination pointers per row
          //   r35  never read by this routine
          //        NOTE(review): presumably the rounding flag; the rounding
          //        code below is commented out -- confirm against the caller.
          //
          // Registers written: r9, r14-r16, r18-r22, r24, r26 and the rotating
          // registers named by .rotr/.rotp. ar.lc and pr are saved in r20/r21
          // and restored before returning.
          //
          // NOTE(review): the shift directions used to extract unaligned bytes
          // assume the lowest address sits in the low byte of a loaded word
          // (little-endian data) -- confirm.
          // ------------------------------------------------------------------
111 :     .global interpolate8x8_halfpel_v_ia64#
112 :     .proc interpolate8x8_halfpel_v_ia64#
113 :     interpolate8x8_halfpel_v_ia64:
          // Per-stage latencies of the software pipeline. They size the rotating
          // register/predicate groups and ar.ec below. SL2 and OL2 are defined
          // but not referenced in this procedure.
114 :     LL=3
115 :     SL=1
116 :     SL2=1
117 :     OL=1
118 :     OL2=1
119 :     AVL=1
120 :     AL=1
121 :     STL=3
122 :    
          // Frame: 4 inputs, 60 locals, 0 outputs, 64 rotating registers.
123 :     alloc r9=ar.pfs,4, 60,0,64
124 :    
          // Save the loop counter and all predicates for the exit sequence.
125 :     mov r20 = ar.lc
126 :     mov r21 = pr
127 :    
          // r22 = (src & 7) << 3: bit offset of src inside its aligned 8-byte word.
128 :     dep.z r22 = r33,3,3
129 :    
          // r14 = src rounded down to 8 bytes, r15 = dest, r16 = stride.
130 :     and r14 = -8,r33
131 :     mov r15 = r32
132 :     mov r16 = r34
133 : ia64p 291 // sub r17 = 0,r0
134 : ia64p 244 ;;
135 :    
          // r18 = second aligned word of the current row.
136 :     add r18 = 8,r14
137 :     add r19 = r14,r16 // src + stride
138 : ia64p 291 // mux1 r17 = r17, @brcst
139 : ia64p 244
          // r24 = 64 - r22: left-shift count that pairs with the right shift r22.
140 :     sub r24 = 64,r22
141 :     ;;
142 :     add r26 = 8,r19 // src + stride + 8
143 :    
          // ar.lc = 7 gives 8 row iterations; ar.ec (= 9 here) adds the epilogue
          // passes that drain the pipeline. pr.rot = 1<<16 sets p16 and clears
          // the remaining rotating predicates.
144 :     mov ar.lc = 7
145 : ia64p 291 mov ar.ec = LL + SL +OL + AVL + STL
146 : ia64p 244 mov pr.rot = 1 << 16
147 :    
148 :     ;;
          // Name the rotating registers and the stage predicates. or2 is given AL
          // extra registers; only or2[0] and or2[OL] are referenced below.
149 : ia64p 291 .rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1]
150 :     .rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL]
151 : ia64p 244
152 :    
          // Software-pipelined loop body. name[k] refers to the value written to
          // name[0] k rotations (iterations) earlier.
153 : ia64p 246 .Lloop_interpolate2:
          // Load stage: ald1/ald2 cover the current row, ald3/ald4 the row one
          // stride below. All four pointers advance by the stride, so each
          // lower row is loaded again as the upper row of the next iteration.
154 : ia64p 244 (aldp[0]) ld8 ald1[0] = [r14],r16
155 :     (aldp[0]) ld8 ald2[0] = [r18],r16
156 :     (aldp[0]) ld8 ald3[0] = [r19],r16
157 :     (aldp[0]) ld8 ald4[0] = [r26],r16
158 :    
          // Shift stage: the same shift pair (r22, r24) is applied to both rows.
159 :     (sh1p[0]) shr.u shru1[0] = ald1[LL],r22
160 :     (sh1p[0]) shl shl1[0] = ald2[LL],r24
161 :     (sh1p[0]) shr.u shru2[0] = ald3[LL],r22
162 :     (sh1p[0]) shl shl2[0] = ald4[LL],r24
163 :    
          // Merge stage: or1 = 8 bytes at src, or2 = 8 bytes at src+stride.
164 :     (or1p[0]) or or1[0] = shru1[SL],shl1[SL]
165 :     (or1p[0]) or or2[0] = shru2[SL],shl2[SL]
166 :    
167 : ia64p 291 // (addp[0]) padd1.uus add1[0] = or1[OL],r17
168 : ia64p 244
          // Average stage: byte-wise parallel average of the two rows.
169 : ia64p 291 (pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL]
170 : ia64p 244
          // Store stage: write one 8-byte row, then advance dest by the stride.
171 :     (stp[0]) st8 [r15] = avg[AVL]
172 :     (stp[0]) add r15 = r15,r16
173 :    
174 :    
175 :    
176 :    
177 : ia64p 246 br.ctop.sptk.few .Lloop_interpolate2
178 : ia64p 244 ;;
          // Exit: restore the saved loop counter and predicates, then return.
179 :     mov ar.lc = r20
180 :     mov pr = r21,-1
181 :     br.ret.sptk.many b0
182 :     .endp interpolate8x8_halfpel_v_ia64#
183 :    
184 :     .align 16
          // ------------------------------------------------------------------
          // interpolate8x8_halfpel_hv_ia64
          //
          // Diagonal (horizontal + vertical) half-pel interpolation. For each
          // of 8 rows the routine combines four 8-byte windows -- src, src+1,
          // src+stride and src+stride+1 -- with three byte-wise pavg1
          // operations: pavg1(pavg1(src, src+stride),
          //                   pavg1(src+1, src+stride+1)).
          // NOTE(review): an average of two averages can differ from an exact
          // rounded four-sample mean -- confirm this is acceptable to callers.
          //
          // Inputs (alloc declares 4 input registers):
          //   r32  destination pointer (copied to r15)
          //   r33  source pointer; any byte alignment, handled by shift/merge
          //   r34  stride, added to the source and destination pointers per row
          //   r35  never read by this routine
          //        NOTE(review): presumably the rounding flag; the rounding
          //        code below is commented out -- confirm against the caller.
          //
          // Registers written: r9, r14-r16, r18-r22, r24, r26-r28 and the
          // rotating registers named by .rotr/.rotp. ar.lc and pr are saved in
          // r20/r21 and restored before returning.
          //
          // NOTE(review): the shift directions used to extract unaligned bytes
          // assume the lowest address sits in the low byte of a loaded word
          // (little-endian data) -- confirm.
          // ------------------------------------------------------------------
185 :     .global interpolate8x8_halfpel_hv_ia64#
186 :     .proc interpolate8x8_halfpel_hv_ia64#
187 :     interpolate8x8_halfpel_hv_ia64:
          // Per-stage latencies of the software pipeline. They size the rotating
          // register/predicate groups and ar.ec below. SL2 and OL2 are defined
          // but not referenced in this procedure.
188 :     LL=3
189 :     SL=1
190 :     SL2=1
191 :     OL=1
192 :     OL2=1
193 :     AVL=1
194 :     AL=1
195 :     STL=3
196 :    
          // Frame: 4 inputs, 60 locals, 0 outputs, 64 rotating registers.
197 :     alloc r9=ar.pfs,4, 60,0,64
198 :    
          // Save the loop counter and all predicates for the exit sequence.
199 :     mov r20 = ar.lc
200 :     mov r21 = pr
201 :    
          // r22 = (src & 7) << 3: bit offset of src inside its aligned 8-byte word.
202 :     dep.z r22 = r33,3,3
203 :    
          // r14 = src rounded down to 8 bytes, r15 = dest, r16 = stride.
204 :     and r14 = -8,r33
205 :     mov r15 = r32
206 :     mov r16 = r34
207 : ia64p 291 // sub r17 = 0,r0
208 : ia64p 244 ;;
209 :    
          // r18 = second aligned word of the current row,
          // r19 = first aligned word of the row one stride below.
210 :     add r18 = 8,r14
211 :     add r19 = r14,r16
212 : ia64p 291 // mux1 r17 = r17, @brcst
213 : ia64p 244
          // Shift counts: (r22, r24) extract the window at the src offset,
          // (r27, r28) extract the window one byte further on.
214 :     add r27 = 8,r22
215 :     sub r28 = 56,r22
216 :     sub r24 = 64,r22
217 :     ;;
          // r26 = second aligned word of the row one stride below.
218 :     add r26 = 8,r19
219 :    
          // ar.lc = 7 gives 8 row iterations. ar.ec counts AVL twice (= 10 here)
          // because this pipeline has two averaging stages.
220 :     mov ar.lc = 7
221 : ia64p 291 mov ar.ec = LL + SL +OL + 2*AVL + STL
222 : ia64p 244 mov pr.rot = 1 << 16
223 :    
224 :     ;;
          // Name the rotating registers and the stage predicates. or2/or3/or4
          // are given AL extra registers; only index 0 and index OL are
          // referenced below.
225 : ia64p 291 .rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],shl3[SL+1],shru3[SL+1],shl4[SL+1],shru4[SL+1],or1[OL+1],or2[OL+1+AL],or3[OL+AL+1],or4[OL+AL+1],avg[AVL+1],avg1[AVL+1],avg2[AVL+1]
226 :     .rotp aldp[LL], sh1p[SL], or1p[OL],pavg1p[AVL],pavg2p[AVL],stp[STL]
227 : ia64p 244
228 :    
          // Software-pipelined loop body. name[k] refers to the value written to
          // name[0] k rotations (iterations) earlier.
229 : ia64p 246 .Lloop_interpolate3:
          // Load stage: ald1/ald2 cover the current row, ald3/ald4 the row one
          // stride below; all four pointers post-increment by the stride.
230 : ia64p 244 (aldp[0]) ld8 ald1[0] = [r14],r16
231 :     (aldp[0]) ld8 ald2[0] = [r18],r16
232 :     (aldp[0]) ld8 ald3[0] = [r19],r16
233 :     (aldp[0]) ld8 ald4[0] = [r26],r16
234 :    
          // Shift stage: pairs 1/2 use (r22, r24) for the src-offset windows,
          // pairs 3/4 use (r27, r28) for the windows one byte further on.
235 :     (sh1p[0]) shr.u shru1[0] = ald1[LL],r22
236 :     (sh1p[0]) shl shl1[0] = ald2[LL],r24
237 :     (sh1p[0]) shr.u shru2[0] = ald3[LL],r22
238 :     (sh1p[0]) shl shl2[0] = ald4[LL],r24
239 :     (sh1p[0]) shr.u shru3[0] = ald1[LL],r27
240 :     (sh1p[0]) shl shl3[0] = ald2[LL],r28
241 :     (sh1p[0]) shr.u shru4[0] = ald3[LL],r27
242 :     (sh1p[0]) shl shl4[0] = ald4[LL],r28
243 :    
244 :    
          // Merge stage: or1 = src, or2 = src+stride, or3 = src+1,
          // or4 = src+stride+1 (8 bytes each).
245 :     (or1p[0]) or or1[0] = shru1[SL],shl1[SL]
246 :     (or1p[0]) or or2[0] = shru2[SL],shl2[SL]
247 :     (or1p[0]) or or3[0] = shru3[SL],shl3[SL]
248 :     (or1p[0]) or or4[0] = shru4[SL],shl4[SL]
249 :    
250 : ia64p 291 // (addp[0]) padd1.uus add1[0] = or1[OL],r17
251 : ia64p 244
          // First averaging stage: vertical average of each column pair.
252 : ia64p 291 (pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL]
253 :     (pavg1p[0]) pavg1 avg1[0] = or3[OL],or4[OL]
254 : ia64p 244
          // Second averaging stage: average the two vertical averages.
255 :     (pavg2p[0]) pavg1 avg2[0] = avg[AVL],avg1[AVL]
256 :    
          // Store stage: write one 8-byte row, then advance dest by the stride.
257 :     (stp[0]) st8 [r15] = avg2[AVL]
258 :     (stp[0]) add r15 = r15,r16
259 :    
260 :    
261 :    
262 :    
263 : ia64p 246 br.ctop.sptk.few .Lloop_interpolate3
264 : ia64p 244 ;;
          // Exit: restore the saved loop counter and predicates, then return.
265 :     mov ar.lc = r20
266 :     mov pr = r21,-1
267 :     br.ret.sptk.many b0
268 :     .endp interpolate8x8_halfpel_hv_ia64#
269 :    
270 :    

No admin address has been configured
ViewVC Help
Powered by ViewVC 1.0.4