Annotation of /trunk/xvidcore/src/image/ia64_asm/interpolate8x8_ia64.s

Revision 1855 - (view) (download)

1 :	Isibaar	1855	// ****************************************************************************
2 :			// *
3 :			// * XVID MPEG-4 VIDEO CODEC
4 :			// * - IA64 halfpel interpolation -
5 :			// *
6 :			// * Copyright(C) 2002 Kai Kühn, Alexander Viehl
7 :			// *
8 :			// * This program is free software; you can redistribute it and/or modify it
9 :			// * under the terms of the GNU General Public License as published by
10 :			// * the Free Software Foundation; either version 2 of the License, or
11 :			// * (at your option) any later version.
12 :			// *
13 :			// * This program is distributed in the hope that it will be useful,
14 :			// * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 :			// * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 :			// * GNU General Public License for more details.
17 :			// *
18 :			// * You should have received a copy of the GNU General Public License
19 :			// * along with this program; if not, write to the Free Software
20 :			// * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 :			// *
22 :			// * $Id: interpolate8x8_ia64.s,v 1.6 2009-02-19 17:07:29 Isibaar Exp $
23 :			// *
24 :			// ***************************************************************************/
25 :			//
26 :			// ****************************************************************************
27 :			// *
28 :			// * interpolate8x8_ia64.s, IA-64 halfpel interpolation
29 :			// *
30 :			// * This version was implemented during an IA-64 practical training at
31 :			// * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/)
32 :			// *
33 :			// ****************************************************************************
34 :	ia64p	244
35 :			.file "interpolate8x8_ia64.s"
36 :			.pred.safe_across_calls p1-p5,p16-p63
37 :			.text
38 :			.align 16
			// ----------------------------------------------------------------
			// interpolate8x8_halfpel_h_ia64
			//
			// Horizontal half-pel interpolation: for each of 8 rows, forms the
			// quadword starting at src and the quadword starting at src+1,
			// combines them with pavg1 and stores the 8-byte result to dest.
			//
			// In:   r32 = dest, r33 = src, r34 = stride (per the comments below).
			//       alloc declares a 4th input (r35) that is never read here.
			//       NOTE(review): r35 is presumably the rounding flag; the
			//       "1-rounding" code that would use it is commented out, so the
			//       result does not depend on it -- confirm this is intended.
			// Uses: r9 (old ar.pfs, unused afterwards), r14-r16, r18, r20-r22,
			//       r24, r26, r27, plus the rotating registers/predicates.
			//       ar.lc and pr are saved in r20/r21 and restored before return.
			//
			// src may be unaligned: two aligned quadwords are loaded per row and
			// merged with shifts. The shift counts are computed once, and all
			// addresses advance by stride each row.
			// NOTE(review): this assumes stride is a multiple of 8 (so every row
			// has the same byte offset) and that dest is suitably aligned for
			// st8 -- confirm against callers.
			// ----------------------------------------------------------------
39 :			.global interpolate8x8_halfpel_h_ia64#
40 :			.proc interpolate8x8_halfpel_h_ia64#
41 :			interpolate8x8_halfpel_h_ia64:
42 :			LL=3 // stages reserved for the loads (aldp[], ald*[] depth)
43 :			SL=1 // stages for the shifts
44 :			SL2=1 // not referenced in this procedure
45 :			OL=1 // stages for the or-merge
46 :			OL2=1 // not referenced in this procedure
47 :			AVL=1 // stages for pavg1
48 :			AL=1 // only enlarges or2[]; the add stage it belonged to is commented out
49 :			STL=3 // stages reserved for the store (stp[])
50 :
51 :			alloc r9=ar.pfs,4, 60,0,64 // 4 inputs, 60 locals, 0 outputs, 64 rotating; the rotating region covers the inputs, hence the copies below
52 :
53 :			mov r20 = ar.lc // save loop counter, restored before return
54 :			mov r21 = pr // save all predicates, restored before return
55 :
56 :			dep.z r22 = r33,3,3 // rshift of src: r22 = (src & 7) * 8, bit offset within the aligned quadword
57 :
58 :			and r14 = -8,r33 // align src
59 :			mov r15 = r32 // get dest
60 :			mov r16 = r34 // stride
61 :	ia64p	291	// sub r17 = 0,r0 // 1-rounding
62 :
63 :	ia64p	244	;;
64 :
65 :			add r18 = 8,r14 // address of the following aligned quadword
66 :	ia64p	291	// mux1 r17 = r17, @brcst // broadcast 1-rounding
67 :	ia64p	244
68 :			sub r24 = 64,r22 // lshift of src
69 :			add r26 = 8,r22 // rshift of src+1
70 :			sub r27 = 56,r22 // lshift of src+1
71 :
72 :			mov ar.lc = 7 // loopcounter: 7 => 8 rows
73 :	ia64p	291	mov ar.ec = LL + SL +OL + AVL + STL // sum of latencies (= number of stage predicates declared by .rotp)
74 :	ia64p	244	mov pr.rot = 1 << 16 // init pr regs for sw-pipelining: only p16 set among the rotating predicates
75 :
76 :			;;
			// Rotating register arrays (one per intermediate value) and the
			// rotating stage predicates; name[k] refers to the value written
			// to name[0] k loop rotations earlier.
77 :	ia64p	291	.rotr ald1[LL+1],ald2[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1]
78 :			.rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL]
79 :	ia64p	244
80 :
81 :	ia64p	246	.Lloop_interpolate: // whole loop body is a single instruction group
82 :	ia64p	244	(aldp[0]) ld8 ald1[0] = [r14],r16 // load aligned src
83 :			(aldp[0]) ld8 ald2[0] = [r18],r16 // and aligned src+8
84 :
85 :			(sh1p[0]) shr.u shru1[0] = ald1[LL],r22 // get src
86 :			(sh1p[0]) shl shl1[0] = ald2[LL],r27 // high part of src+1 (pairs with shru2)
87 :			(sh1p[0]) shr.u shru2[0] = ald1[LL],r26 // get src+1
88 :			(sh1p[0]) shl shl2[0] = ald2[LL],r24 // high part of src (pairs with shru1)
89 :
90 :			(or1p[0]) or or1[0] = shru1[SL],shl2[SL] // merge things: or1 = quadword at src
91 :			(or1p[0]) or or2[0] = shru2[SL],shl1[SL] // or2 = quadword at src+1
92 :
93 :	ia64p	291	// (addp[0]) padd1.uus add1[0] = or1[OL],r17 // add 1-rounding
94 :	ia64p	244
95 :	ia64p	291	(pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] // parallel average
96 :	ia64p	244
97 :			(stp[0]) st8 [r15] = avg[AVL] // store results
98 :			(stp[0]) add r15 = r15,r16 // advance dest by stride
99 :
100 :
101 :
102 :
103 :	ia64p	246	br.ctop.sptk.few .Lloop_interpolate // rotate registers/predicates and loop
104 :	ia64p	244	;;
105 :			mov ar.lc = r20 // restore caller's loop counter
106 :			mov pr = r21,-1 // restore all predicates
107 :			br.ret.sptk.many b0
108 :			.endp interpolate8x8_halfpel_h_ia64#
109 :
110 :			.align 16
			// ----------------------------------------------------------------
			// interpolate8x8_halfpel_v_ia64
			//
			// Vertical half-pel interpolation: for each of 8 rows, forms the
			// quadword starting at src and the quadword starting at
			// src + stride, combines them with pavg1 and stores the 8-byte
			// result to dest.
			//
			// In:   r32 = dest, r33 = src, r34 = stride (same register use as
			//       the commented horizontal routine in this file).
			//       alloc declares a 4th input (r35) that is never read here.
			//       NOTE(review): r35 is presumably the rounding flag; the code
			//       that would use it is commented out -- confirm this is
			//       intended.
			// Uses: r9 (old ar.pfs, unused afterwards), r14-r16, r18-r22, r24,
			//       r26, plus the rotating registers/predicates.
			//       ar.lc and pr are saved in r20/r21 and restored before return.
			//
			// NOTE(review): one shift count is used for both rows and for every
			// iteration, so this assumes stride is a multiple of 8; st8 assumes
			// dest is suitably aligned -- confirm against callers.
			// NOTE(review): the quadword at aligned src+8 is loaded even when
			// src is already aligned (r22 == 0, left shift count 64) -- confirm
			// that reading past the row is acceptable to callers.
			// ----------------------------------------------------------------
111 :			.global interpolate8x8_halfpel_v_ia64#
112 :			.proc interpolate8x8_halfpel_v_ia64#
113 :			interpolate8x8_halfpel_v_ia64:
114 :			LL=3 // stages reserved for the loads (aldp[], ald*[] depth)
115 :			SL=1 // stages for the shifts
116 :			SL2=1 // not referenced in this procedure
117 :			OL=1 // stages for the or-merge
118 :			OL2=1 // not referenced in this procedure
119 :			AVL=1 // stages for pavg1
120 :			AL=1 // only enlarges or2[]; the add stage it belonged to is commented out
121 :			STL=3 // stages reserved for the store (stp[])
122 :
123 :			alloc r9=ar.pfs,4, 60,0,64 // 4 inputs, 60 locals, 0 outputs, 64 rotating; the rotating region covers the inputs, hence the copies below
124 :
125 :			mov r20 = ar.lc // save loop counter, restored before return
126 :			mov r21 = pr // save all predicates, restored before return
127 :
128 :			dep.z r22 = r33,3,3 // r22 = (src & 7) * 8: right-shift count for the low quadword
129 :
130 :			and r14 = -8,r33 // aligned src
131 :			mov r15 = r32 // dest
132 :			mov r16 = r34 // stride
133 :	ia64p	291	// sub r17 = 0,r0
134 :	ia64p	244	;;
135 :
136 :			add r18 = 8,r14 // aligned src + 8
137 :			add r19 = r14,r16 // aligned src + stride (next row)
138 :	ia64p	291	// mux1 r17 = r17, @brcst
139 :	ia64p	244
140 :			sub r24 = 64,r22 // left-shift count for the high quadword
141 :			;;
142 :			add r26 = 8,r19 // aligned src + stride + 8
143 :
144 :			mov ar.lc = 7 // loop counter: 7 => 8 rows
145 :	ia64p	291	mov ar.ec = LL + SL +OL + AVL + STL // epilog count = number of stage predicates declared by .rotp
146 :	ia64p	244	mov pr.rot = 1 << 16 // start the pipeline: only p16 set among the rotating predicates
147 :
148 :			;;
			// Rotating register arrays (one per intermediate value) and the
			// rotating stage predicates; name[k] refers to the value written
			// to name[0] k loop rotations earlier.
149 :	ia64p	291	.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1]
150 :			.rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL]
151 :	ia64p	244
152 :
153 :	ia64p	246	.Lloop_interpolate2: // whole loop body is a single instruction group
154 :	ia64p	244	(aldp[0]) ld8 ald1[0] = [r14],r16 // current row, low aligned quadword
155 :			(aldp[0]) ld8 ald2[0] = [r18],r16 // current row, high aligned quadword
156 :			(aldp[0]) ld8 ald3[0] = [r19],r16 // next row, low aligned quadword
157 :			(aldp[0]) ld8 ald4[0] = [r26],r16 // next row, high aligned quadword
158 :
159 :			(sh1p[0]) shr.u shru1[0] = ald1[LL],r22 // current row: low part
160 :			(sh1p[0]) shl shl1[0] = ald2[LL],r24 // current row: high part
161 :			(sh1p[0]) shr.u shru2[0] = ald3[LL],r22 // next row: low part
162 :			(sh1p[0]) shl shl2[0] = ald4[LL],r24 // next row: high part
163 :
164 :			(or1p[0]) or or1[0] = shru1[SL],shl1[SL] // or1 = quadword at src (current row)
165 :			(or1p[0]) or or2[0] = shru2[SL],shl2[SL] // or2 = quadword at src + stride (next row)
166 :
167 :	ia64p	291	// (addp[0]) padd1.uus add1[0] = or1[OL],r17
168 :	ia64p	244
169 :	ia64p	291	(pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] // byte-wise average of the two rows
170 :	ia64p	244
171 :			(stp[0]) st8 [r15] = avg[AVL] // store the 8 result bytes
172 :			(stp[0]) add r15 = r15,r16 // advance dest by stride
173 :
174 :
175 :
176 :
177 :	ia64p	246	br.ctop.sptk.few .Lloop_interpolate2 // rotate registers/predicates and loop
178 :	ia64p	244	;;
179 :			mov ar.lc = r20 // restore caller's loop counter
180 :			mov pr = r21,-1 // restore all predicates
181 :			br.ret.sptk.many b0
182 :			.endp interpolate8x8_halfpel_v_ia64#
183 :
184 :			.align 16
			// ----------------------------------------------------------------
			// interpolate8x8_halfpel_hv_ia64
			//
			// Diagonal half-pel interpolation: for each of 8 rows, forms four
			// quadwords (src, src + stride, src + 1, src + stride + 1),
			// averages them pairwise with pavg1, averages the two results
			// with pavg1 again, and stores the 8-byte result to dest.
			//
			// In:   r32 = dest, r33 = src, r34 = stride (same register use as
			//       the commented horizontal routine in this file).
			//       alloc declares a 4th input (r35) that is never read here.
			//       NOTE(review): r35 is presumably the rounding flag; the code
			//       that would use it is commented out -- confirm this is
			//       intended.
			// Uses: r9 (old ar.pfs, unused afterwards), r14-r16, r18-r22, r24,
			//       r26-r28, plus the rotating registers/predicates.
			//       ar.lc and pr are saved in r20/r21 and restored before return.
			//
			// NOTE(review): an average of two pavg1 results is not necessarily
			// bit-identical to a single rounded 4-sample average -- confirm this
			// matches the reference implementation's expectations.
			// NOTE(review): shift counts are computed once, so this assumes
			// stride is a multiple of 8; st8 assumes dest is suitably aligned --
			// confirm against callers.
			// ----------------------------------------------------------------
185 :			.global interpolate8x8_halfpel_hv_ia64#
186 :			.proc interpolate8x8_halfpel_hv_ia64#
187 :			interpolate8x8_halfpel_hv_ia64:
188 :			LL=3 // stages reserved for the loads (aldp[], ald*[] depth)
189 :			SL=1 // stages for the shifts
190 :			SL2=1 // not referenced in this procedure
191 :			OL=1 // stages for the or-merge
192 :			OL2=1 // not referenced in this procedure
193 :			AVL=1 // stages for each pavg1 step (used twice)
194 :			AL=1 // only enlarges or2[]..or4[]; the add stage it belonged to is commented out
195 :			STL=3 // stages reserved for the store (stp[])
196 :
197 :			alloc r9=ar.pfs,4, 60,0,64 // 4 inputs, 60 locals, 0 outputs, 64 rotating; the rotating region covers the inputs, hence the copies below
198 :
199 :			mov r20 = ar.lc // save loop counter, restored before return
200 :			mov r21 = pr // save all predicates, restored before return
201 :
202 :			dep.z r22 = r33,3,3 // r22 = (src & 7) * 8: right-shift count for src
203 :
204 :			and r14 = -8,r33 // aligned src
205 :			mov r15 = r32 // dest
206 :			mov r16 = r34 // stride
207 :	ia64p	291	// sub r17 = 0,r0
208 :	ia64p	244	;;
209 :
210 :			add r18 = 8,r14 // aligned src + 8
211 :			add r19 = r14,r16 // aligned src + stride (next row)
212 :	ia64p	291	// mux1 r17 = r17, @brcst
213 :	ia64p	244
214 :			add r27 = 8,r22 // right-shift count for src+1
215 :			sub r28 = 56,r22 // left-shift count for src+1
216 :			sub r24 = 64,r22 // left-shift count for src
217 :			;;
218 :			add r26 = 8,r19 // aligned src + stride + 8
219 :
220 :			mov ar.lc = 7 // loop counter: 7 => 8 rows
221 :	ia64p	291	mov ar.ec = LL + SL +OL + 2*AVL + STL // epilog count = number of stage predicates declared by .rotp (two pavg1 stages)
222 :	ia64p	244	mov pr.rot = 1 << 16 // start the pipeline: only p16 set among the rotating predicates
223 :
224 :			;;
			// Rotating register arrays (one per intermediate value) and the
			// rotating stage predicates; name[k] refers to the value written
			// to name[0] k loop rotations earlier.
225 :	ia64p	291	.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],shl3[SL+1],shru3[SL+1],shl4[SL+1],shru4[SL+1],or1[OL+1],or2[OL+1+AL],or3[OL+AL+1],or4[OL+AL+1],avg[AVL+1],avg1[AVL+1],avg2[AVL+1]
226 :			.rotp aldp[LL], sh1p[SL], or1p[OL],pavg1p[AVL],pavg2p[AVL],stp[STL]
227 :	ia64p	244
228 :
229 :	ia64p	246	.Lloop_interpolate3: // whole loop body is a single instruction group
230 :	ia64p	244	(aldp[0]) ld8 ald1[0] = [r14],r16 // current row, low aligned quadword
231 :			(aldp[0]) ld8 ald2[0] = [r18],r16 // current row, high aligned quadword
232 :			(aldp[0]) ld8 ald3[0] = [r19],r16 // next row, low aligned quadword
233 :			(aldp[0]) ld8 ald4[0] = [r26],r16 // next row, high aligned quadword
234 :
235 :			(sh1p[0]) shr.u shru1[0] = ald1[LL],r22 // current row, src: low part
236 :			(sh1p[0]) shl shl1[0] = ald2[LL],r24 // current row, src: high part
237 :			(sh1p[0]) shr.u shru2[0] = ald3[LL],r22 // next row, src: low part
238 :			(sh1p[0]) shl shl2[0] = ald4[LL],r24 // next row, src: high part
239 :			(sh1p[0]) shr.u shru3[0] = ald1[LL],r27 // current row, src+1: low part
240 :			(sh1p[0]) shl shl3[0] = ald2[LL],r28 // current row, src+1: high part
241 :			(sh1p[0]) shr.u shru4[0] = ald3[LL],r27 // next row, src+1: low part
242 :			(sh1p[0]) shl shl4[0] = ald4[LL],r28 // next row, src+1: high part
243 :
244 :
245 :			(or1p[0]) or or1[0] = shru1[SL],shl1[SL] // or1 = quadword at src
246 :			(or1p[0]) or or2[0] = shru2[SL],shl2[SL] // or2 = quadword at src + stride
247 :			(or1p[0]) or or3[0] = shru3[SL],shl3[SL] // or3 = quadword at src + 1
248 :			(or1p[0]) or or4[0] = shru4[SL],shl4[SL] // or4 = quadword at src + stride + 1
249 :
250 :	ia64p	291	// (addp[0]) padd1.uus add1[0] = or1[OL],r17
251 :	ia64p	244
252 :	ia64p	291	(pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] // average of src and src + stride
253 :			(pavg1p[0]) pavg1 avg1[0] = or3[OL],or4[OL] // average of src + 1 and src + stride + 1
254 :	ia64p	244
255 :			(pavg2p[0]) pavg1 avg2[0] = avg[AVL],avg1[AVL] // average of the two averages
256 :
257 :			(stp[0]) st8 [r15] = avg2[AVL] // store the 8 result bytes
258 :			(stp[0]) add r15 = r15,r16 // advance dest by stride
259 :
260 :
261 :
262 :
263 :	ia64p	246	br.ctop.sptk.few .Lloop_interpolate3 // rotate registers/predicates and loop
264 :	ia64p	244	;;
265 :			mov ar.lc = r20 // restore caller's loop counter
266 :			mov pr = r21,-1 // restore all predicates
267 :			br.ret.sptk.many b0
268 :			.endp interpolate8x8_halfpel_hv_ia64#
269 :
270 :

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4