[svn] / branches / release-1_2-branch / xvidcore / src / image / ia64_asm / interpolate8x8_ia64.s Repository:
ViewVC logotype

Annotation of /branches/release-1_2-branch/xvidcore/src/image/ia64_asm/interpolate8x8_ia64.s

Parent Directory Parent Directory | Revision Log Revision Log


Revision 291 - (view) (download)
Original Path: trunk/xvidcore/src/image/ia64_asm/interpolate8x8_ia64.s

1 : ia64p 244
2 :     .file "interpolate8x8_ia64.s"
3 :     .pred.safe_across_calls p1-p5,p16-p63
4 :     .text
// interpolate8x8_halfpel_h_ia64
//
// Software-pipelined loop. Each pass loads the two aligned 8-byte words at
// r14 and r14+8, rebuilds from them the 8 bytes at the (possibly unaligned)
// source address and the 8 bytes one byte further on ("src" and "src+1" in
// the comments below), and stores the pavg1 of the two to the destination.
// ar.lc = 7 gives eight load passes, hence eight 8-byte stores; all pointers
// advance by the stride after each access.
//
// Inputs read:  r32 = dest, r33 = src, r34 = stride.
// NOTE(review): alloc declares four inputs but the fourth is never read; the
//   commented-out revision-291 lines suggest it was a rounding flag - confirm
//   against the C prototype.
// Scratch:      r9, r14-r16, r18, r20-r22, r24, r26, r27 and the rotating
//               registers named by .rotr. ar.lc and pr are saved in r20/r21
//               and restored before returning; ar.ec is overwritten.
5 :     .align 16
6 :     .global interpolate8x8_halfpel_h_ia64#
7 :     .proc interpolate8x8_halfpel_h_ia64#
8 :     interpolate8x8_halfpel_h_ia64:
9 :     LL=3 // stages between a load and the shifts that read ald*[LL]
10 :     SL=1 // stages between a shift and the or that reads sh*[SL]
11 :     SL2=1 // not referenced in this procedure
12 :     OL=1 // stages between an or and the pavg1 that reads or*[OL]
13 :     OL2=1 // not referenced in this procedure
14 :     AVL=1 // stages between pavg1 and the st8 that reads avg[AVL]
15 :     AL=1 // only used to size or2 in .rotr
16 :     STL=3 // number of stp predicates; only stp[0] is used below
17 :    
18 :     alloc r9=ar.pfs,4, 60,0,64 // 4 inputs, 60 locals, 0 outputs, 64 rotating
19 :    
20 :     mov r20 = ar.lc // saved, restored after the loop
21 :     mov r21 = pr // saved, restored after the loop
22 :    
23 :     dep.z r22 = r33,3,3 // rshift of src: (src & 7) << 3, a bit count
24 :    
25 :     and r14 = -8,r33 // align src (round down to 8 bytes)
26 :     mov r15 = r32 // get dest
27 :     mov r16 = r34 // stride
28 : ia64p 291 // sub r17 = 0,r0 // 1-rounding
29 :    
30 : ia64p 244 ;;
31 :    
32 :     add r18 = 8,r14 // aligned src + 8
33 : ia64p 291 // mux1 r17 = r17, @brcst // broadcast 1-rounding
34 : ia64p 244
35 :     sub r24 = 64,r22 // lshift of src
36 :     add r26 = 8,r22 // rshift of src+1
37 :     sub r27 = 56,r22 // lshift of src+1
38 :    
39 :     mov ar.lc = 7 // loopcounter
40 : ia64p 291 mov ar.ec = LL + SL +OL + AVL + STL // sum of latencies (9 with the values above); NOTE(review): stp[1]/stp[2] are never referenced, so the passes counted by the last two units of STL appear to execute nothing - confirm before changing
41 : ia64p 244 mov pr.rot = 1 << 16 // init pr regs for sw-pipelining: only p16 set
42 :    
43 :     ;;
44 : ia64p 291 .rotr ald1[LL+1],ald2[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1]
45 :     .rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL]
46 : ia64p 244
47 :    
48 : ia64p 246 .Lloop_interpolate:
49 : ia64p 244 (aldp[0]) ld8 ald1[0] = [r14],r16 // load aligned src
50 :     (aldp[0]) ld8 ald2[0] = [r18],r16 // and aligned src+8
51 :    
52 :     (sh1p[0]) shr.u shru1[0] = ald1[LL],r22 // get src
53 :     (sh1p[0]) shl shl1[0] = ald2[LL],r27 // upper part of src+1
54 :     (sh1p[0]) shr.u shru2[0] = ald1[LL],r26 // get src+1
55 :     (sh1p[0]) shl shl2[0] = ald2[LL],r24 // upper part of src
56 :    
57 :     (or1p[0]) or or1[0] = shru1[SL],shl2[SL] // merge things: or1 = src
58 :     (or1p[0]) or or2[0] = shru2[SL],shl1[SL] // or2 = src+1
59 :    
60 : ia64p 291 // (addp[0]) padd1.uus add1[0] = or1[OL],r17 // add 1-rounding
61 : ia64p 244
62 : ia64p 291 (pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] // parallel average
63 : ia64p 244
64 :     (stp[0]) st8 [r15] = avg[AVL] // store results
65 :     (stp[0]) add r15 = r15,r16 // advance dest by stride
66 :    
67 :    
68 :    
69 :    
70 : ia64p 246 br.ctop.sptk.few .Lloop_interpolate // rotate registers/predicates and loop
71 : ia64p 244 ;;
72 :     mov ar.lc = r20 // restore loop counter
73 :     mov pr = r21,-1 // restore all predicates (mask -1)
74 :     br.ret.sptk.many b0
75 :     .endp interpolate8x8_halfpel_h_ia64#
76 :    
// interpolate8x8_halfpel_v_ia64
//
// Software-pipelined loop. Each pass loads four aligned 8-byte words: two at
// r14 / r14+8 and two one stride further on at r19 / r19+8. Each pair is
// realigned with the same shift counts (r22 right, r24 left) into the 8
// bytes at the source address and the 8 bytes at source + stride; the pavg1
// of the two is stored to the destination. ar.lc = 7 gives eight load
// passes, hence eight 8-byte stores; all pointers advance by r16 after each
// access.
//
// Inputs read:  r32 (copied to r15, the store pointer), r33 (source
//               address), r34 (copied to r16, the pointer increment).
// NOTE(review): alloc declares four inputs but the fourth is never read; the
//   commented-out revision-291 lines suggest it was a rounding flag - confirm
//   against the C prototype.
// Scratch:      r9, r14-r16, r18-r22, r24, r26 and the rotating registers
//               named by .rotr. ar.lc and pr are saved in r20/r21 and
//               restored before returning; ar.ec is overwritten.
77 :     .align 16
78 :     .global interpolate8x8_halfpel_v_ia64#
79 :     .proc interpolate8x8_halfpel_v_ia64#
80 :     interpolate8x8_halfpel_v_ia64:
81 :     LL=3 // stages between a load and the shifts that read ald*[LL]
82 :     SL=1 // stages between a shift and the or that reads sh*[SL]
83 :     SL2=1 // not referenced in this procedure
84 :     OL=1 // stages between an or and the pavg1 that reads or*[OL]
85 :     OL2=1 // not referenced in this procedure
86 :     AVL=1 // stages between pavg1 and the st8 that reads avg[AVL]
87 :     AL=1 // only used to size or2 in .rotr
88 :     STL=3 // number of stp predicates; only stp[0] is used below
89 :    
90 :     alloc r9=ar.pfs,4, 60,0,64 // 4 inputs, 60 locals, 0 outputs, 64 rotating
91 :    
92 :     mov r20 = ar.lc // saved, restored after the loop
93 :     mov r21 = pr // saved, restored after the loop
94 :    
95 :     dep.z r22 = r33,3,3 // (src & 7) << 3: right-shift count in bits
96 :    
97 :     and r14 = -8,r33 // src rounded down to 8 bytes
98 :     mov r15 = r32 // store pointer
99 :     mov r16 = r34 // increment applied to every pointer
100 : ia64p 291 // sub r17 = 0,r0
101 : ia64p 244 ;;
102 :    
103 :     add r18 = 8,r14 // aligned src + 8
104 :     add r19 = r14,r16 // src + stride
105 : ia64p 291 // mux1 r17 = r17, @brcst
106 : ia64p 244
107 :     sub r24 = 64,r22 // complementary left-shift count
108 :     ;;
109 :     add r26 = 8,r19 // src + stride + 8
110 :    
111 :     mov ar.lc = 7 // eight load passes
112 : ia64p 291 mov ar.ec = LL + SL +OL + AVL + STL // 9 with the values above; NOTE(review): stp[1]/stp[2] are never referenced, so the passes counted by the last two units of STL appear to execute nothing - confirm before changing
113 : ia64p 244 mov pr.rot = 1 << 16 // only p16 set: first pipeline stage enabled
114 :    
115 :     ;;
116 : ia64p 291 .rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1]
117 :     .rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL]
118 : ia64p 244
119 :    
120 : ia64p 246 .Lloop_interpolate2:
121 : ia64p 244 (aldp[0]) ld8 ald1[0] = [r14],r16 // aligned src
122 :     (aldp[0]) ld8 ald2[0] = [r18],r16 // aligned src + 8
123 :     (aldp[0]) ld8 ald3[0] = [r19],r16 // aligned src + stride
124 :     (aldp[0]) ld8 ald4[0] = [r26],r16 // aligned src + stride + 8
125 :    
126 :     (sh1p[0]) shr.u shru1[0] = ald1[LL],r22 // lower part of src
127 :     (sh1p[0]) shl shl1[0] = ald2[LL],r24 // upper part of src
128 :     (sh1p[0]) shr.u shru2[0] = ald3[LL],r22 // lower part of src + stride
129 :     (sh1p[0]) shl shl2[0] = ald4[LL],r24 // upper part of src + stride
130 :    
131 :     (or1p[0]) or or1[0] = shru1[SL],shl1[SL] // or1 = 8 bytes at src
132 :     (or1p[0]) or or2[0] = shru2[SL],shl2[SL] // or2 = 8 bytes at src + stride
133 :    
134 : ia64p 291 // (addp[0]) padd1.uus add1[0] = or1[OL],r17
135 : ia64p 244
136 : ia64p 291 (pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] // byte-wise average of the two
137 : ia64p 244
138 :     (stp[0]) st8 [r15] = avg[AVL] // store result
139 :     (stp[0]) add r15 = r15,r16 // advance store pointer
140 :    
141 :    
142 :    
143 :    
144 : ia64p 246 br.ctop.sptk.few .Lloop_interpolate2 // rotate registers/predicates and loop
145 : ia64p 244 ;;
146 :     mov ar.lc = r20 // restore loop counter
147 :     mov pr = r21,-1 // restore all predicates (mask -1)
148 :     br.ret.sptk.many b0
149 :     .endp interpolate8x8_halfpel_v_ia64#
150 :    
// interpolate8x8_halfpel_hv_ia64
//
// Software-pipelined loop. Each pass loads four aligned 8-byte words: two at
// r14 / r14+8 and two one stride further on at r19 / r19+8. From them it
// rebuilds four 8-byte values:
//   or1 = bytes at src              or3 = bytes at src, shifted 8 bits more
//   or2 = bytes at src + stride     or4 = bytes at src + stride, likewise
// and stores pavg1(pavg1(or1,or2), pavg1(or3,or4)) to the destination, i.e.
// the result is an average of two averages rather than a single four-way
// sum. ar.lc = 7 gives eight load passes, hence eight 8-byte stores; all
// pointers advance by r16 after each access.
//
// Inputs read:  r32 (copied to r15, the store pointer), r33 (source
//               address), r34 (copied to r16, the pointer increment).
// NOTE(review): alloc declares four inputs but the fourth is never read; the
//   commented-out revision-291 lines suggest it was a rounding flag - confirm
//   against the C prototype.
// Scratch:      r9, r14-r16, r18-r22, r24, r26-r28 and the rotating
//               registers named by .rotr. ar.lc and pr are saved in r20/r21
//               and restored before returning; ar.ec is overwritten.
151 :     .align 16
152 :     .global interpolate8x8_halfpel_hv_ia64#
153 :     .proc interpolate8x8_halfpel_hv_ia64#
154 :     interpolate8x8_halfpel_hv_ia64:
155 :     LL=3 // stages between a load and the shifts that read ald*[LL]
156 :     SL=1 // stages between a shift and the or that reads sh*[SL]
157 :     SL2=1 // not referenced in this procedure
158 :     OL=1 // stages between an or and the pavg1 that reads or*[OL]
159 :     OL2=1 // not referenced in this procedure
160 :     AVL=1 // stages between one pavg1 and its consumer (two levels here)
161 :     AL=1 // only used to size or2/or3/or4 in .rotr
162 :     STL=3 // number of stp predicates; only stp[0] is used below
163 :    
164 :     alloc r9=ar.pfs,4, 60,0,64 // 4 inputs, 60 locals, 0 outputs, 64 rotating
165 :    
166 :     mov r20 = ar.lc // saved, restored after the loop
167 :     mov r21 = pr // saved, restored after the loop
168 :    
169 :     dep.z r22 = r33,3,3 // (src & 7) << 3: right-shift count in bits
170 :    
171 :     and r14 = -8,r33 // src rounded down to 8 bytes
172 :     mov r15 = r32 // store pointer
173 :     mov r16 = r34 // increment applied to every pointer
174 : ia64p 291 // sub r17 = 0,r0
175 : ia64p 244 ;;
176 :    
177 :     add r18 = 8,r14 // aligned src + 8
178 :     add r19 = r14,r16 // aligned src + stride
179 : ia64p 291 // mux1 r17 = r17, @brcst
180 : ia64p 244
181 :     add r27 = 8,r22 // right-shift count one byte further (r22 + 8)
182 :     sub r28 = 56,r22 // left-shift count paired with r27 (64 - r27)
183 :     sub r24 = 64,r22 // left-shift count paired with r22
184 :     ;;
185 :     add r26 = 8,r19 // aligned src + stride + 8
186 :    
187 :     mov ar.lc = 7 // eight load passes
188 : ia64p 291 mov ar.ec = LL + SL +OL + 2*AVL + STL // 10 with the values above; NOTE(review): stp[1]/stp[2] are never referenced, so the passes counted by the last two units of STL appear to execute nothing - confirm before changing
189 : ia64p 244 mov pr.rot = 1 << 16 // only p16 set: first pipeline stage enabled
190 :    
191 :     ;;
192 : ia64p 291 .rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],shl3[SL+1],shru3[SL+1],shl4[SL+1],shru4[SL+1],or1[OL+1],or2[OL+1+AL],or3[OL+AL+1],or4[OL+AL+1],avg[AVL+1],avg1[AVL+1],avg2[AVL+1]
193 :     .rotp aldp[LL], sh1p[SL], or1p[OL],pavg1p[AVL],pavg2p[AVL],stp[STL]
194 : ia64p 244
195 :    
196 : ia64p 246 .Lloop_interpolate3:
197 : ia64p 244 (aldp[0]) ld8 ald1[0] = [r14],r16 // aligned src
198 :     (aldp[0]) ld8 ald2[0] = [r18],r16 // aligned src + 8
199 :     (aldp[0]) ld8 ald3[0] = [r19],r16 // aligned src + stride
200 :     (aldp[0]) ld8 ald4[0] = [r26],r16 // aligned src + stride + 8
201 :    
202 :     (sh1p[0]) shr.u shru1[0] = ald1[LL],r22 // lower part of or1
203 :     (sh1p[0]) shl shl1[0] = ald2[LL],r24 // upper part of or1
204 :     (sh1p[0]) shr.u shru2[0] = ald3[LL],r22 // lower part of or2
205 :     (sh1p[0]) shl shl2[0] = ald4[LL],r24 // upper part of or2
206 :     (sh1p[0]) shr.u shru3[0] = ald1[LL],r27 // lower part of or3
207 :     (sh1p[0]) shl shl3[0] = ald2[LL],r28 // upper part of or3
208 :     (sh1p[0]) shr.u shru4[0] = ald3[LL],r27 // lower part of or4
209 :     (sh1p[0]) shl shl4[0] = ald4[LL],r28 // upper part of or4
210 :    
211 :    
212 :     (or1p[0]) or or1[0] = shru1[SL],shl1[SL] // 8 bytes at src
213 :     (or1p[0]) or or2[0] = shru2[SL],shl2[SL] // 8 bytes at src + stride
214 :     (or1p[0]) or or3[0] = shru3[SL],shl3[SL] // src words shifted 8 bits further
215 :     (or1p[0]) or or4[0] = shru4[SL],shl4[SL] // src + stride words shifted 8 bits further
216 :    
217 : ia64p 291 // (addp[0]) padd1.uus add1[0] = or1[OL],r17
218 : ia64p 244
219 : ia64p 291 (pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] // first level: or1 with or2
220 :     (pavg1p[0]) pavg1 avg1[0] = or3[OL],or4[OL] // first level: or3 with or4
221 : ia64p 244
222 :     (pavg2p[0]) pavg1 avg2[0] = avg[AVL],avg1[AVL] // second level: average of the two averages
223 :    
224 :     (stp[0]) st8 [r15] = avg2[AVL] // store result
225 :     (stp[0]) add r15 = r15,r16 // advance store pointer
226 :    
227 :    
228 :    
229 :    
230 : ia64p 246 br.ctop.sptk.few .Lloop_interpolate3 // rotate registers/predicates and loop
231 : ia64p 244 ;;
232 :     mov ar.lc = r20 // restore loop counter
233 :     mov pr = r21,-1 // restore all predicates (mask -1)
234 :     br.ret.sptk.many b0
235 :     .endp interpolate8x8_halfpel_hv_ia64#
236 :    
237 :    

No admin address has been configured
ViewVC Help
Powered by ViewVC 1.0.4