ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/svn/trunk/xvidcore/src/dct/ia64_asm/fdct_ia64.s
Revision: 1855
Committed: Thu Feb 19 17:07:29 2009 UTC (15 years, 7 months ago) by Isibaar
File size: 41115 byte(s)
Error occurred while calculating annotation data.
Log Message:
added proper license headers to the IA64 asm files

File Contents

# Content
1 // ****************************************************************************
2 // *
3 // * XVID MPEG-4 VIDEO CODEC
4 // * - IA64 forward discrete cosine transform -
5 // *
6 // * Copyright(C) 2002 Stephan Krause, Ingo-Marc Weber, Daniel Kallfass
7 // *
8 // * This program is free software; you can redistribute it and/or modify it
9 // * under the terms of the GNU General Public License as published by
10 // * the Free Software Foundation; either version 2 of the License, or
11 // * (at your option) any later version.
12 // *
13 // * This program is distributed in the hope that it will be useful,
14 // * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 // * GNU General Public License for more details.
17 // *
18 // * You should have received a copy of the GNU General Public License
19 // * along with this program; if not, write to the Free Software
20 // * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 // *
22 // * $Id: fdct_ia64.s,v 1.6 2009-02-19 17:07:29 Isibaar Exp $
23 // *
24 // ***************************************************************************/
25 //
26 // ****************************************************************************
27 // *
28 // * fdct_ia64.s, IA-64 optimized forward DCT
29 // *
30 // * Completed version provided by Intel at AppNote AP-922
31 // * http://developer.intel.com/software/products/college/ia32/strmsimd/
32 // * Copyright (C) 1999 Intel Corporation,
33 // *
34 // * This version was implemented during an IA-64 practical training at
35 // * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/)
36 // *
37 // *****************************************************************************
38 //
39 // *****************************************************************************
40 // *
41 // * Revision history:
42 // *
43 // * 24.07.2002 Initial Version
44 // *
45 // *****************************************************************************
46
47
48 // This is a fast precise implementation of 8x8 Discrete Cosine Transform
49 // published in Intel Application Note 922 from 1999 and optimized for IA-64.
50 //
51 // An unoptimized "straight forward" version can be found at the end of this file.
52 // fdct_ia64: in-place forward DCT of the 8x8 block of 16-bit values addressed by r32 (the only input register).
53 // Registers: loc0-loc15 = addresses of the sixteen 8-byte half rows, loc16-loc31 = data, loc32-loc55 = temporaries and pass results, r23-r31 = constants, r14 = saved ar.pfs.
54 .pred.safe_across_calls p1-p5,p16-p63
55 .text
56 .align 16
57 .global fdct_ia64#
58 .proc fdct_ia64#
59 fdct_ia64:
60 .prologue
61 alloc r14 = ar.pfs, 1, 56, 0, 0 // 1 input (r32), 56 locals (loc0-loc55); previous ar.pfs is kept in r14
62 // Load constants (scaled by 2^16; replicated to all four 16-bit lanes by the mux2 instructions below)
63 mov r31 = 0x32ec // c0 = tan(1pi/16)
64 mov r30 = 0x6a0a // c1 = tan(2pi/16)
65 mov r29 = 0xab0e // c2 = tan(3pi/16)
66 mov r28 = 0xb505 // g4 = cos(4pi/16)
67 mov r27 = 0xd4db // g3 = cos(3pi/16)
68 mov r26 = 0xec83 // g2 = cos(2pi/16)
69 mov r25 = 0xfb15 // g1 = cos(1pi/16)
70 mov r24 = 0x0002 // rounding term added before the ">> 2" that follows the column pass
71 mov r23 = 0x0004 // rounding term added before the final ">> 3"
72
73 // Load Matrix into registers
74 // loc0, loc2, ..., loc14 address bytes 0-7 of each 16-byte row; loc1, loc3, ..., loc15 address bytes 8-15
75 add loc0 = r0, r32
76 add loc2 = 16, r32
77 add loc4 = 32, r32
78 add loc6 = 48, r32
79 add loc8 = 64, r32
80 add loc10 = 80, r32
81 add loc12 = 96, r32
82 add loc14 = 112, r32
83 add loc1 = 8, r32
84 add loc3 = 24, r32
85 add loc5 = 40, r32
86 add loc7 = 56, r32
87 add loc9 = 72, r32
88 add loc11 = 88, r32
89 add loc13 = 104, r32
90 add loc15 = 120, r32
91 ;;
92 ld8 loc16 = [loc0]
93 ld8 loc17 = [loc2]
94 ld8 loc18 = [loc4]
95 ld8 loc19 = [loc6]
96 ld8 loc20 = [loc8]
97 ld8 loc21 = [loc10]
98 ld8 loc22 = [loc12]
99 ld8 loc23 = [loc14]
100 ld8 loc24 = [loc1]
101 ld8 loc25 = [loc3]
102 ld8 loc26 = [loc5]
103 ld8 loc27 = [loc7]
104 mux2 r26 = r26, 0x00 // copy the low 16 bits into all four lanes (same for every mux2 below)
105 ld8 loc28 = [loc9]
106 mux2 r31 = r31, 0x00
107 mux2 r25 = r25, 0x00
108 ld8 loc29 = [loc11]
109 mux2 r30 = r30, 0x00
110 mux2 r29 = r29, 0x00
111 ld8 loc30 = [loc13]
112 mux2 r28 = r28, 0x00
113 mux2 r27 = r27, 0x00
114 ld8 loc31 = [loc15]
115 mux2 r24 = r24, 0x00
116 mux2 r23 = r23, 0x00
117 ;;
118 pshl2 loc16 = loc16, 3 // scale every input value by 8 (all pshl2 ..., 3 below)
119 pshl2 loc17 = loc17, 3
120 pshl2 loc18 = loc18, 3
121 pshl2 loc19 = loc19, 3
122 pshl2 loc20 = loc20, 3
123 pshl2 loc21 = loc21, 3
124 pshl2 loc22 = loc22, 3
125 pshl2 loc23 = loc23, 3
126 ;;
127 pshl2 loc24 = loc24, 3
128
129 // *******************
130 // column-DCT 1st half (loc16-loc23; scaling of loc24-loc31 is interleaved)
131 // *******************
132 // pmpyshr2 multiplies signed: constants >= 0x8000 (c2, g1-g4) act as (c - 1), so the multiplicand is added back after those multiplies
133 psub2 loc37 = loc17, loc22 // t5 = x1 - x6
134 pshl2 loc25 = loc25, 3
135 pshl2 loc26 = loc26, 3
136 psub2 loc38 = loc18, loc21 // t6 = x2 - x5
137 pshl2 loc27 = loc27, 3
138 pshl2 loc28 = loc28, 3
139 ;;
140 padd2 loc32 = loc16, loc23 // t0 = x0 + x7
141 pshl2 loc29 = loc29, 3
142 pshl2 loc30 = loc30, 3
143 padd2 loc33 = loc17, loc22 // t1 = x1 + x6
144 padd2 loc40 = loc37, loc38 // buf0 = t5 + t6
145 psub2 loc41 = loc37, loc38 // buf1 = t5 - t6
146 ;;
147 padd2 loc34 = loc18, loc21 // t2 = x2 + x5
148 pshl2 loc31 = loc31, 3
149 padd2 loc35 = loc19, loc20 // t3 = x3 + x4
150 psub2 loc36 = loc16, loc23 // t4 = x0 - x7
151 pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4
152 pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4
153 ;;
154 psub2 loc39 = loc19, loc20 // t7 = x3 - x4
155 padd2 loc37 = loc37, loc40 // t5 = t5 + buf0
156 padd2 loc38 = loc38, loc41 // t6 = t6 + buf1
157
158 padd2 loc16 = loc32, loc35 // x0 = t0 + t3
159 padd2 loc17 = loc33, loc34 // x1 = t1 + t2
160 psub2 loc18 = loc32, loc35 // x2 = t0 - t3
161 ;;
162 psub2 loc19 = loc33, loc34 // x3 = t1 - t2
163 padd2 loc20 = loc36, loc37 // x4 = t4 + t5
164 padd2 loc21 = loc38, loc39 // x5 = t6 + t7
165 psub2 loc22 = loc36, loc37 // x6 = t4 - t5
166 psub2 loc23 = loc38, loc39 // x7 = t6 - t7
167 ;;
168 pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1
169 padd2 loc32 = loc16, loc17 // t0 = x0 + x1
170 pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1
171 pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0
172 psub2 loc33 = loc16, loc17 // t1 = x0 - x1
173 pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2
174 pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2
175 ;;
176 padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6
177 pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0
178 padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7
179 padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1)
180 psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3
181 psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5
182 ;;
183 padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0)
184 padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 (buf7 = x7 * c2)
185 pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4
186 pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4
187 psub2 loc39 = loc46, loc23 // t7 = buf6 - x7 (buf6 = x6 * c2)
188 ;;
189 padd2 loc48 = loc16, loc32 // y0 = x0 + t0
190 pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2
191 pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2
192 padd2 loc52 = loc17, loc33 // y4 = x1 + t1
193 pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1
194 pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1
195 ;;
196 padd2 loc50 = loc18, loc34 // y2 = x2 + t2
197 pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3
198 pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3
199 padd2 loc55 = loc21, loc37 // y7 = x5 + t5
200 padd2 loc49 = loc20, loc36 // y1 = x4 + t4
201 padd2 loc54 = loc19, loc35 // y6 = x3 + t3
202 ;;
203 padd2 loc51 = loc22, loc38 // y3 = x6 + t6
204 padd2 loc53 = loc23, loc39 // y5 = x7 + t7
205
206 // divide by 4 with rounding: y = (y + 2) >> 2
207
208 padd2 loc48 = loc48, r24
209 padd2 loc49 = loc49, r24
210 padd2 loc50 = loc50, r24
211 padd2 loc52 = loc52, r24
212 ;;
213 padd2 loc51 = loc51, r24
214 pshr2 loc48 = loc48, 2
215 padd2 loc53 = loc53, r24
216 pshr2 loc49 = loc49, 2
217 padd2 loc54 = loc54, r24
218 pshr2 loc50 = loc50, 2
219 padd2 loc55 = loc55, r24
220 pshr2 loc52 = loc52, 2
221 ;;
222 pshr2 loc51 = loc51, 2
223 pshr2 loc53 = loc53, 2
224 pshr2 loc54 = loc54, 2
225 pshr2 loc55 = loc55, 2
226
227
228 // *******************
229 // column-DCT 2nd half (loc24-loc31)
230 // *******************
231
232 psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2
233 psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2
234 padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2
235 padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2
236 ;;
237 padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2
238 psub2 loc41 = loc37, loc38 // buf1 = t5 - t6
239 padd2 loc40 = loc37, loc38 // buf0 = t5 + t6
240 padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2
241 ;;
242 psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2
243 pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4
244 pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4
245 ;;
246 psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2
247 padd2 loc37 = loc37, loc40 // t5 = t5 + buf0
248 padd2 loc38 = loc38, loc41 // t6 = t6 + buf1
249
250 padd2 loc16 = loc32, loc35 // x0 = t0 + t3
251 padd2 loc17 = loc33, loc34 // x1 = t1 + t2
252 psub2 loc18 = loc32, loc35 // x2 = t0 - t3
253 ;;
254 psub2 loc19 = loc33, loc34 // x3 = t1 - t2
255 padd2 loc20 = loc36, loc37 // x4 = t4 + t5
256 padd2 loc21 = loc38, loc39 // x5 = t6 + t7
257 psub2 loc22 = loc36, loc37 // x6 = t4 - t5
258 psub2 loc23 = loc38, loc39 // x7 = t6 - t7
259 ;;
260 pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1
261 padd2 loc32 = loc16, loc17 // t0 = x0 + x1
262 pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1
263 pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0
264 psub2 loc33 = loc16, loc17 // t1 = x0 - x1
265 pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0
266 pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2
267 pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2
268 ;;
269 padd2 loc34 = loc18, loc43 // t2 = x2 + buf3
270 padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6
271 padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7
272 psub2 loc35 = loc42, loc19 // t3 = buf2 - x3
273 padd2 loc36 = loc20, loc45 // t4 = x4 + buf5
274 pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4
275 ;;
276 psub2 loc37 = loc44, loc21 // t5 = buf4 - x5
277 padd2 loc38 = loc22, loc47 // t6 = x6 + buf7
278 psub2 loc39 = loc46, loc23 // t7 = buf6 - x7
279 pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4
280 ;;
281 padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0
282 pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2
283 pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2
284 padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1
285 pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1
286 pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1
287 ;;
288 padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2
289 pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3
290 pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3
291 padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5
292 padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4
293 padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3
294 ;;
295 padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6
296
297 // *******************
298 // transpose matrix (interleaved with the rounding/shift of the 2nd-half results)
299 // *******************
300
301 mix2.r loc32 = loc48, loc49 // tmp0 = mixr y0, y1
302 mix2.l loc33 = loc48, loc49 // tmp1 = mixl y0, y1
303 padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7
304 mix2.r loc34 = loc50, loc51 // tmp2 = mixr y2, y3
305 mix2.l loc35 = loc50, loc51 // tmp3 = mixl y2, y3
306 ;;
307
308 // divide by 4 with rounding: y.2 = (y.2 + 2) >> 2
309
310 padd2 loc40 = loc40, r24
311 padd2 loc41 = loc41, r24
312 mix4.r loc16 = loc32, loc34 // x0 = mixr tmp0, tmp2
313 padd2 loc42 = loc42, r24
314 padd2 loc43 = loc43, r24
315 mix4.r loc17 = loc33, loc35 // x1 = mixr tmp1, tmp3
316 padd2 loc44 = loc44, r24
317 padd2 loc45 = loc45, r24
318 mix4.l loc18 = loc32, loc34 // x2 = mixl tmp0, tmp2
319 padd2 loc46 = loc46, r24
320 padd2 loc47 = loc47, r24
321 mix4.l loc19 = loc33, loc35 // x3 = mixl tmp1, tmp3
322 ;;
323 pshr2 loc40 = loc40, 2
324 pshr2 loc41 = loc41, 2
325 pshr2 loc42 = loc42, 2
326 pshr2 loc43 = loc43, 2
327 mix2.r loc32 = loc52, loc53 // tmp0 = mixr y4, y5
328 mix2.l loc33 = loc52, loc53 // tmp1 = mixl y4, y5
329 mix2.r loc34 = loc54, loc55 // tmp2 = mixr y6, y7
330 mix2.l loc35 = loc54, loc55 // tmp3 = mixl y6, y7
331 ;;
332 pshr2 loc44 = loc44, 2
333 pshr2 loc45 = loc45, 2
334 pshr2 loc46 = loc46, 2
335 pshr2 loc47 = loc47, 2
336 mix4.r loc24 = loc32, loc34 // x0.2 = mixr tmp0, tmp2
337 mix4.r loc25 = loc33, loc35 // x1.2 = mixr tmp1, tmp3
338 mix4.l loc26 = loc32, loc34 // x2.2 = mixl tmp0, tmp2
339 mix4.l loc27 = loc33, loc35 // x3.2 = mixl tmp1, tmp3
340 ;;
341 mix2.r loc32 = loc40, loc41 // tmp0 = mixr y0.2, y1.2
342 mix2.l loc33 = loc40, loc41 // tmp1 = mixl y0.2, y1.2
343 mix2.r loc34 = loc42, loc43 // tmp2 = mixr y2.2, y3.2
344 mix2.l loc35 = loc42, loc43 // tmp3 = mixl y2.2, y3.2
345 ;;
346 mix4.r loc20 = loc32, loc34 // x4 = mixr tmp0, tmp2
347 mix4.r loc21 = loc33, loc35 // x5 = mixr tmp1, tmp3
348 mix4.l loc22 = loc32, loc34 // x6 = mixl tmp0, tmp2
349 mix4.l loc23 = loc33, loc35 // x7 = mixl tmp1, tmp3
350 ;;
351 mix2.r loc32 = loc44, loc45 // tmp0 = mixr y4.2, y5.2
352 mix2.l loc33 = loc44, loc45 // tmp1 = mixl y4.2, y5.2
353 mix2.r loc34 = loc46, loc47 // tmp2 = mixr y6.2, y7.2
354 mix2.l loc35 = loc46, loc47 // tmp3 = mixl y6.2, y7.2
355 ;;
356 mix4.r loc28 = loc32, loc34 // x4.2 = mixr tmp0, tmp2
357 mix4.r loc29 = loc33, loc35 // x5.2 = mixr tmp1, tmp3
358 mix4.l loc30 = loc32, loc34 // x6.2 = mixl tmp0, tmp2
359 mix4.l loc31 = loc33, loc35 // x7.2 = mixl tmp1, tmp3
360
361 // *******************
362 // row-DCT 1st half (loc16-loc23)
363 // *******************
364
365 psub2 loc37 = loc17, loc22 // t5 = x1 - x6
366 psub2 loc38 = loc18, loc21 // t6 = x2 - x5
367 ;;
368 padd2 loc32 = loc16, loc23 // t0 = x0 + x7
369 padd2 loc33 = loc17, loc22 // t1 = x1 + x6
370 padd2 loc34 = loc18, loc21 // t2 = x2 + x5
371 psub2 loc41 = loc37, loc38 // buf1 = t5 - t6
372 padd2 loc40 = loc37, loc38 // buf0 = t5 + t6
373 padd2 loc35 = loc19, loc20 // t3 = x3 + x4
374 ;;
375 psub2 loc36 = loc16, loc23 // t4 = x0 - x7
376 pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4
377 pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4
378 ;;
379 psub2 loc39 = loc19, loc20 // t7 = x3 - x4
380 padd2 loc37 = loc37, loc40 // t5 = t5 + buf0
381 padd2 loc38 = loc38, loc41 // t6 = t6 + buf1
382
383 padd2 loc16 = loc32, loc35 // x0 = t0 + t3
384 padd2 loc17 = loc33, loc34 // x1 = t1 + t2
385 psub2 loc18 = loc32, loc35 // x2 = t0 - t3
386 ;;
387 psub2 loc19 = loc33, loc34 // x3 = t1 - t2
388 padd2 loc20 = loc36, loc37 // x4 = t4 + t5
389 padd2 loc21 = loc38, loc39 // x5 = t6 + t7
390 psub2 loc22 = loc36, loc37 // x6 = t4 - t5
391 psub2 loc23 = loc38, loc39 // x7 = t6 - t7
392 ;;
393 pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1
394 padd2 loc32 = loc16, loc17 // t0 = x0 + x1
395 pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1
396 pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0
397 psub2 loc33 = loc16, loc17 // t1 = x0 - x1
398 pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0
399 pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2
400 pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2
401 ;;
402 padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6
403 padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7
404 padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1)
405 ;;
406 psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3
407 padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0)
408 psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5
409 padd2 loc38 = loc22, loc47 // t6 = x6 + buf7
410 psub2 loc39 = loc46, loc23 // t7 = buf6 - x7
411 pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4
412 pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4
413 ;;
414 padd2 loc48 = loc16, loc32 // y0 = x0 + t0
415 pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2
416 pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2
417 padd2 loc52 = loc17, loc33 // y4 = x1 + t1
418 pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1
419 pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1
420 ;;
421 padd2 loc50 = loc18, loc34 // y2 = x2 + t2
422 pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3
423 pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3
424 padd2 loc55 = loc21, loc37 // y7 = x5 + t5
425 padd2 loc49 = loc20, loc36 // y1 = x4 + t4
426 padd2 loc54 = loc19, loc35 // y6 = x3 + t3
427 ;;
428 padd2 loc51 = loc22, loc38 // y3 = x6 + t6
429 padd2 loc53 = loc23, loc39 // y5 = x7 + t7
430
431 // *******************
432 // row-DCT 2nd half (loc24-loc31)
433 // *******************
434
435 psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2
436 psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2
437 padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2
438 padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2
439 ;;
440 padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2
441 psub2 loc41 = loc37, loc38 // buf1 = t5 - t6
442 padd2 loc40 = loc37, loc38 // buf0 = t5 + t6
443 padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2
444 ;;
445 psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2
446 pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4
447 pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4
448 ;;
449 psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2
450 padd2 loc37 = loc37, loc40 // t5 = t5 + buf0
451 padd2 loc38 = loc38, loc41 // t6 = t6 + buf1
452
453 padd2 loc16 = loc32, loc35 // x0 = t0 + t3
454 padd2 loc17 = loc33, loc34 // x1 = t1 + t2
455 psub2 loc18 = loc32, loc35 // x2 = t0 - t3
456 ;;
457 psub2 loc19 = loc33, loc34 // x3 = t1 - t2
458 padd2 loc20 = loc36, loc37 // x4 = t4 + t5
459 padd2 loc21 = loc38, loc39 // x5 = t6 + t7
460 psub2 loc22 = loc36, loc37 // x6 = t4 - t5
461 psub2 loc23 = loc38, loc39 // x7 = t6 - t7
462 ;;
463 pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1
464 padd2 loc32 = loc16, loc17 // t0 = x0 + x1
465 pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1
466 pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0
467 psub2 loc33 = loc16, loc17 // t1 = x0 - x1
468 pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0
469 pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2
470 pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2
471 ;;
472 padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6
473 padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7
474 padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1)
475 ;;
476 psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3
477 padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0)
478 psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5
479 padd2 loc38 = loc22, loc47 // t6 = x6 + buf7
480 psub2 loc39 = loc46, loc23 // t7 = buf6 - x7
481 pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4
482 pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4
483 ;;
484 padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0
485 pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2
486 pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2
487 padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1
488 pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1
489 pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1
490 ;;
491 padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2
492 pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3
493 pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3
494 padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3
495 nop.i 0x0
496 nop.i 0x0
497 ;;
498
499 // *******************
500 // Transpose matrix (operands swapped relative to the first transpose; the remaining y.2 sums and the "+ r23" rounding adds are interleaved)
501 // *******************
502 padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4
503 mix2.l loc32 = loc49, loc48 // tmp0 = mixl y1, y0
504 mix2.r loc33 = loc49, loc48 // tmp1 = mixr y1, y0
505 padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5
506 mix2.l loc34 = loc51, loc50 // tmp2 = mixl y3, y2
507 mix2.r loc35 = loc51, loc50 // tmp3 = mixr y3, y2
508 ;;
509 padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6
510 mix4.l loc16 = loc34, loc32 // x0 = mixl tmp2, tmp0
511 mix4.l loc17 = loc35, loc33 // x1 = mixl tmp3, tmp1
512 padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7
513 mix4.r loc18 = loc34, loc32 // x2 = mixr tmp2, tmp0
514 mix4.r loc19 = loc35, loc33 // x3 = mixr tmp3, tmp1
515 ;;
516 padd2 loc16 = loc16, r23 // + 4: rounding for the final >> 3 (same for every "+ r23" below)
517 mix2.l loc32 = loc41, loc40 // tmp0 = mixl y1.2, y0.2
518 mix2.r loc33 = loc41, loc40 // tmp1 = mixr y1.2, y0.2
519 padd2 loc17 = loc17, r23
520 mix2.l loc34 = loc43, loc42 // tmp2 = mixl y3.2, y2.2
521 mix2.r loc35 = loc43, loc42 // tmp3 = mixr y3.2, y2.2
522 ;;
523 padd2 loc18 = loc18, r23
524 mix4.l loc20 = loc34, loc32 // x4 = mixl tmp2, tmp0
525 mix4.l loc21 = loc35, loc33 // x5 = mixl tmp3, tmp1
526 padd2 loc19 = loc19, r23
527 mix4.r loc22 = loc34, loc32 // x6 = mixr tmp2, tmp0
528 mix4.r loc23 = loc35, loc33 // x7 = mixr tmp3, tmp1
529 ;;
530 padd2 loc20 = loc20, r23
531 mix2.l loc32 = loc53, loc52 // tmp0 = mixl y5, y4
532 mix2.r loc33 = loc53, loc52 // tmp1 = mixr y5, y4
533 padd2 loc21 = loc21, r23
534 mix2.l loc34 = loc55, loc54 // tmp2 = mixl y7, y6
535 mix2.r loc35 = loc55, loc54 // tmp3 = mixr y7, y6
536 ;;
537 padd2 loc22 = loc22, r23
538 mix4.l loc24 = loc34, loc32 // x0.2 = mixl tmp2, tmp0
539 mix4.l loc25 = loc35, loc33 // x1.2 = mixl tmp3, tmp1
540 padd2 loc23 = loc23, r23
541 mix4.r loc26 = loc34, loc32 // x2.2 = mixr tmp2, tmp0
542 mix4.r loc27 = loc35, loc33 // x3.2 = mixr tmp3, tmp1
543 ;;
544 padd2 loc24 = loc24, r23
545 mix2.l loc32 = loc45, loc44 // tmp0 = mixl y5.2, y4.2
546 mix2.r loc33 = loc45, loc44 // tmp1 = mixr y5.2, y4.2
547 padd2 loc25 = loc25, r23
548 mix2.l loc34 = loc47, loc46 // tmp2 = mixl y7.2, y6.2
549 mix2.r loc35 = loc47, loc46 // tmp3 = mixr y7.2, y6.2
550 ;;
551 padd2 loc26 = loc26, r23
552 mix4.l loc28 = loc34, loc32 // x4.2 = mixl tmp2, tmp0
553 mix4.l loc29 = loc35, loc33 // x5.2 = mixl tmp3, tmp1
554 padd2 loc27 = loc27, r23
555 mix4.r loc30 = loc34, loc32 // x6.2 = mixr tmp2, tmp0
556 mix4.r loc31 = loc35, loc33 // x7.2 = mixr tmp3, tmp1
557 ;;
558 // *******************
559 // Descale: finish the "+ r23" rounding adds (loc28-loc31), then shift every register right by 3
560 // *******************
561 padd2 loc28 = loc28, r23
562 pshr2 loc16 = loc16, 3
563 pshr2 loc17 = loc17, 3
564 padd2 loc29 = loc29, r23
565 pshr2 loc18 = loc18, 3
566 pshr2 loc19 = loc19, 3
567 padd2 loc30 = loc30, r23
568 pshr2 loc20 = loc20, 3
569 pshr2 loc21 = loc21, 3
570 padd2 loc31 = loc31, r23
571 pshr2 loc22 = loc22, 3
572 pshr2 loc23 = loc23, 3
573 ;;
574 pshr2 loc24 = loc24, 3
575 pshr2 loc25 = loc25, 3
576 pshr2 loc26 = loc26, 3
577 pshr2 loc27 = loc27, 3
578 pshr2 loc28 = loc28, 3
579 pshr2 loc29 = loc29, 3
580 pshr2 loc30 = loc30, 3
581 pshr2 loc31 = loc31, 3
582 ;;
583 // *******************
584 // Store matrix (back to the addresses it was loaded from)
585 // *******************
586 st8 [loc0] = loc16
587 st8 [loc1] = loc24
588 st8 [loc2] = loc17
589 st8 [loc3] = loc25
590 st8 [loc4] = loc18
591 st8 [loc5] = loc26
592 st8 [loc6] = loc19
593 st8 [loc7] = loc27
594 st8 [loc8] = loc20
595 st8 [loc9] = loc28
596 st8 [loc10] = loc21
597 st8 [loc11] = loc29
598 st8 [loc12] = loc22
599 st8 [loc13] = loc30
600 st8 [loc14] = loc23
601 st8 [loc15] = loc31
602
603 mov ar.pfs = r14 // restore the ar.pfs value saved by alloc
604 br.ret.sptk.many b0
605 .endp fdct_ia64#
606 .common fdct#,8,8
607
608
609
610
611
612
613
614
615 //***********************************************
616 //* Here is a version of the DCT implementation *
617 //* unoptimized in terms of command ordering. *
618 //* This version is about 30% slower but *
619 //* easier to understand. *
620 //***********************************************
621 //
622 // .pred.safe_across_calls p1-p5,p16-p63
623 //.text
624 // .align 16
625 // .global fdct_ia64#
626 // .proc fdct_ia64#
627 //fdct_ia64:
628 // .prologue
629 // alloc r14 = ar.pfs, 1, 56, 0, 0
630 //
631 // // *******************
632 // // Save constants
633 // // *******************
634 // mov r31 = 0x32ec // c0 = tan(1pi/16)
635 // mov r30 = 0x6a0a // c1 = tan(2pi/16)
636 // mov r29 = 0xab0e // c2 = tan(3pi/16)
637 // mov r28 = 0xb505 // g4 = cos(4pi/16)
638 // mov r27 = 0xd4db // g3 = cos(3pi/16)
639 // mov r26 = 0xec83 // g2 = cos(2pi/16)
640 // mov r25 = 0xfb15 // g1 = cos(1pi/16)
641 // mov r24 = 0x0002 // correction bit for descaling
642 // mov r23 = 0x0004 // correction bit for descaling
643 //
644 // // **************************
645 // // Load Matrix into registers
646 // // **************************
647 //
648 // add loc0 = r0, r32
649 // ;;
650 // mux2 r31 = r31, 0x00
651 // mux2 r30 = r30, 0x00
652 // mux2 r29 = r29, 0x00
653 // mux2 r28 = r28, 0x00
654 // mux2 r27 = r27, 0x00
655 // mux2 r26 = r26, 0x00
656 // mux2 r25 = r25, 0x00
657 // mux2 r24 = r24, 0x00
658 // mux2 r23 = r23, 0x00
659 // ld8 loc16 = [loc0]
660 // add loc2 = 16, r32
661 // add loc4 = 32, r32
662 // add loc6 = 48, r32
663 // add loc8 = 64, r32
664 // add loc10 = 80, r32
665 // ;;
666 // ld8 loc17 = [loc2]
667 // ld8 loc18 = [loc4]
668 // add loc12 = 96, r32
669 // ld8 loc19 = [loc6]
670 // ld8 loc20 = [loc8]
671 // add loc14 = 112, r32
672 // ;;
673 // ld8 loc21 = [loc10]
674 // ld8 loc22 = [loc12]
675 // add loc1 = 8, r32
676 // ld8 loc23 = [loc14]
677 // add loc3 = 24, r32
678 // add loc5 = 40, r32
679 // ;;
680 // ld8 loc24 = [loc1]
681 // ld8 loc25 = [loc3]
682 // add loc7 = 56, r32
683 // ld8 loc26 = [loc5]
684 // add loc9 = 72, r32
685 // add loc11 = 88, r32
686 // ;;
687 // ld8 loc27 = [loc7]
688 // ld8 loc28 = [loc9]
689 // add loc13 = 104, r32
690 // ld8 loc29 = [loc11]
691 // add loc15 = 120, r32
692 // ;;
693 // ld8 loc30 = [loc13]
694 // ld8 loc31 = [loc15]
695 // ;;
696 // // ******
697 // // Scale
698 // // ******
699 // pshl2 loc16 = loc16, 3
700 // pshl2 loc17 = loc17, 3
701 // pshl2 loc18 = loc18, 3
702 // pshl2 loc19 = loc19, 3
703 // pshl2 loc20 = loc20, 3
704 // pshl2 loc21 = loc21, 3
705 // pshl2 loc22 = loc22, 3
706 // pshl2 loc23 = loc23, 3
707 // pshl2 loc24 = loc24, 3
708 // pshl2 loc25 = loc25, 3
709 // pshl2 loc26 = loc26, 3
710 // pshl2 loc27 = loc27, 3
711 // pshl2 loc28 = loc28, 3
712 // pshl2 loc29 = loc29, 3
713 // pshl2 loc30 = loc30, 3
714 // pshl2 loc31 = loc31, 3
715 // ;;
716 //
717 // // *******************
718 // // column-DCT 1st half
719 // // *******************
720 //
721 // padd2 loc32 = loc16, loc23 // t0 = x0 + x7
722 // padd2 loc33 = loc17, loc22 // t1 = x1 + x6
723 // padd2 loc34 = loc18, loc21 // t2 = x2 + x5
724 // padd2 loc35 = loc19, loc20 // t3 = x3 + x4
725 // psub2 loc36 = loc16, loc23 // t4 = x0 - x7
726 // psub2 loc37 = loc17, loc22 // t5 = x1 - x6
727 // psub2 loc38 = loc18, loc21 // t6 = x2 - x5
728 // psub2 loc39 = loc19, loc20 // t7 = x3 - x4
729 // ;;
730 // padd2 loc40 = loc37, loc38 // buf0 = t5 + t6
731 // psub2 loc41 = loc37, loc38 // buf1 = t5 - t6
732 // ;;
733 // pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4
734 // pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4
735 // ;;
736 // padd2 loc37 = loc37, loc40 // t5 = t5 + buf0
737 // padd2 loc38 = loc38, loc41 // t6 = t6 + buf1
738 // ;;
739 // padd2 loc16 = loc32, loc35 // x0 = t0 + t3
740 // padd2 loc17 = loc33, loc34 // x1 = t1 + t2
741 // psub2 loc18 = loc32, loc35 // x2 = t0 - t3
742 // psub2 loc19 = loc33, loc34 // x3 = t1 - t2
743 // padd2 loc20 = loc36, loc37 // x4 = t4 + t5
744 // padd2 loc21 = loc38, loc39 // x5 = t6 + t7
745 // psub2 loc22 = loc36, loc37 // x6 = t4 - t5
746 // psub2 loc23 = loc38, loc39 // x7 = t6 - t7
747 // ;;
748 //
749 // padd2 loc32 = loc16, loc17 // t0 = x0 + x1
750 // psub2 loc33 = loc16, loc17 // t1 = x0 - x1
751 // pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1
752 // pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1
753 // pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0
754 // pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0
755 // pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2
756 // pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2
757 // ;;
758 // padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6
759 // padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7
760 // ;;
761 // padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1)
762 // psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3
763 // padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0)
764 // psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5
765 // padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 (buf7 = x7 * c2)
766 // psub2 loc39 = loc46, loc23 // t7 = buf6 - x7 (buf6 = x6 * c2)
767 // ;;
768 // pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4
769 // pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4
770 // pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2
771 // pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2
772 // pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1
773 // pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1
774 // pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3
775 // pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3
776 // ;;
777 // padd2 loc48 = loc16, loc32 // y0 = x0 + t0
778 // padd2 loc49 = loc20, loc36 // y1 = x4 + t4
779 // padd2 loc50 = loc18, loc34 // y2 = x2 + t2
780 // padd2 loc51 = loc22, loc38 // y3 = x6 + t6
781 // padd2 loc52 = loc17, loc33 // y4 = x1 + t1
782 // padd2 loc53 = loc23, loc39 // y5 = x7 + t7
783 // padd2 loc54 = loc19, loc35 // y6 = x3 + t3
784 // padd2 loc55 = loc21, loc37 // y7 = x5 + t5
785 // ;;
786 //
787 // // *******************
788 // // column-DCT 2nd half
789 // // *******************
790 //
791 // padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2
792 // padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2
793 // padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2
794 // padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2
795 // psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2
796 // psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2
797 // psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2
798 // psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2
799 // ;;
800 // padd2 loc40 = loc37, loc38 // buf0 = t5 + t6
801 // psub2 loc41 = loc37, loc38 // buf1 = t5 - t6
802 // ;;
803 // pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4
804 // pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4
805 // ;;
806 // padd2 loc37 = loc37, loc40 // t5 = t5 + buf0
807 // padd2 loc38 = loc38, loc41 // t6 = t6 + buf1
808 // ;;
809 // padd2 loc16 = loc32, loc35 // x0 = t0 + t3
810 // padd2 loc17 = loc33, loc34 // x1 = t1 + t2
811 // psub2 loc18 = loc32, loc35 // x2 = t0 - t3
812 // psub2 loc19 = loc33, loc34 // x3 = t1 - t2
813 // padd2 loc20 = loc36, loc37 // x4 = t4 + t5
814 // padd2 loc21 = loc38, loc39 // x5 = t6 + t7
815 // psub2 loc22 = loc36, loc37 // x6 = t4 - t5
816 // psub2 loc23 = loc38, loc39 // x7 = t6 - t7
817 // ;;
818 // padd2 loc32 = loc16, loc17 // t0 = x0 + x1
819 // psub2 loc33 = loc16, loc17 // t1 = x0 - x1
820 // pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1
821 // pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1
822 // pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0
823 // pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0
824 // pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2
825 // pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2
826 // ;;
827 // padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6
828 // padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7
829 // ;;
830 // padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1)
831 // psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3
832 // padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0)
833 // psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5
834 // padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 (buf7 = x7 * c2)
835 // psub2 loc39 = loc46, loc23 // t7 = buf6 - x7 (buf6 = x6 * c2)
836 // ;;
837 // pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4
838 // pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4
839 // pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2
840 // pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2
841 // pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1
842 // pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1
843 // pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3
844 // pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3
845 // ;;
846 // padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0
847 // padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4
848 // padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2
849 // padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6
850 // padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1
851 // padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7
852 // padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3
853 // padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5
854 // ;;
855 // padd2 loc40 = loc40, r24 // add r24 to correct rounding
856 // padd2 loc41 = loc41, r24
857 // padd2 loc42 = loc42, r24
858 // padd2 loc43 = loc43, r24
859 // padd2 loc44 = loc44, r24
860 // padd2 loc45 = loc45, r24
861 // padd2 loc46 = loc46, r24
862 // padd2 loc47 = loc47, r24
863 // padd2 loc48 = loc48, r24
864 // padd2 loc49 = loc49, r24
865 // padd2 loc50 = loc50, r24
866 // padd2 loc51 = loc51, r24
867 // padd2 loc52 = loc52, r24
868 // padd2 loc53 = loc53, r24
869 // padd2 loc54 = loc54, r24
870 // padd2 loc55 = loc55, r24
871 // ;;
872 // pshr2 loc40 = loc40, 2 // Divide all matrix elements by 4
873 // pshr2 loc41 = loc41, 2
874 // pshr2 loc42 = loc42, 2
875 // pshr2 loc43 = loc43, 2
876 // pshr2 loc44 = loc44, 2
877 // pshr2 loc45 = loc45, 2
878 // pshr2 loc46 = loc46, 2
879 // pshr2 loc47 = loc47, 2
880 // pshr2 loc48 = loc48, 2
881 // pshr2 loc49 = loc49, 2
882 // pshr2 loc50 = loc50, 2
883 // pshr2 loc51 = loc51, 2
884 // pshr2 loc52 = loc52, 2
885 // pshr2 loc53 = loc53, 2
886 // pshr2 loc54 = loc54, 2
887 // pshr2 loc55 = loc55, 2
888 // ;;
889 //
890 // // *****************
891 // // Transpose matrix
892 // // *****************
893 //
894 // mix2.r loc32 = loc48, loc49 // tmp0 = mixr y0, y1
895 // mix2.l loc33 = loc48, loc49 // tmp1 = mixl y0, y1
896 // mix2.r loc34 = loc50, loc51 // tmp2 = mixr y2, y3
897 // mix2.l loc35 = loc50, loc51 // tmp3 = mixl y2, y3
898 // ;;
899 // mix4.r loc16 = loc32, loc34 // x0 = mixr tmp0, tmp2
900 // mix4.r loc17 = loc33, loc35 // x1 = mixr tmp1, tmp3
901 // mix4.l loc18 = loc32, loc34 // x2 = mixl tmp0, tmp2
902 // mix4.l loc19 = loc33, loc35 // x3 = mixl tmp1, tmp3
903 // ;;
904 // mix2.r loc32 = loc40, loc41 // tmp0 = mixr y0.2, y1.2
905 // mix2.l loc33 = loc40, loc41 // tmp1 = mixl y0.2, y1.2
906 // mix2.r loc34 = loc42, loc43 // tmp2 = mixr y2.2, y3.2
907 // mix2.l loc35 = loc42, loc43 // tmp3 = mixl y2.2, y3.2
908 // ;;
909 // mix4.r loc20 = loc32, loc34 // x4 = mixr tmp0, tmp2
910 // mix4.r loc21 = loc33, loc35 // x5 = mixr tmp1, tmp3
911 // mix4.l loc22 = loc32, loc34 // x6 = mixl tmp0, tmp2
912 // mix4.l loc23 = loc33, loc35 // x7 = mixl tmp1, tmp3
913 // ;;
914 // mix2.r loc32 = loc52, loc53 // tmp0 = mixr y4, y5
915 // mix2.l loc33 = loc52, loc53 // tmp1 = mixl y4, y5
916 // mix2.r loc34 = loc54, loc55 // tmp2 = mixr y6, y7
917 // mix2.l loc35 = loc54, loc55 // tmp3 = mixl y6, y7
918 // ;;
919 // mix4.r loc24 = loc32, loc34 // x0.2 = mixr tmp0, tmp2
920 // mix4.r loc25 = loc33, loc35 // x1.2 = mixr tmp1, tmp3
921 // mix4.l loc26 = loc32, loc34 // x2.2 = mixl tmp0, tmp2
922 // mix4.l loc27 = loc33, loc35 // x3.2 = mixl tmp1, tmp3
923 // ;;
924 // mix2.r loc32 = loc44, loc45 // tmp0 = mixr y4.2, y5.2
925 // mix2.l loc33 = loc44, loc45 // tmp1 = mixl y4.2, y5.2
926 // mix2.r loc34 = loc46, loc47 // tmp2 = mixr y6.2, y7.2
927 // mix2.l loc35 = loc46, loc47 // tmp3 = mixl y6.2, y7.2
928 // ;;
929 // mix4.r loc28 = loc32, loc34 // x4.2 = mixr tmp0, tmp2
930 // mix4.r loc29 = loc33, loc35 // x5.2 = mixr tmp1, tmp3
931 // mix4.l loc30 = loc32, loc34 // x6.2 = mixl tmp0, tmp2
932 // mix4.l loc31 = loc33, loc35 // x7.2 = mixl tmp1, tmp3
933 // ;;
934 //
935 // // *******************
936 // // row-DCT 1st half
937 // // *******************
938 //
939 // padd2 loc32 = loc16, loc23 // t0 = x0 + x7
940 // padd2 loc33 = loc17, loc22 // t1 = x1 + x6
941 // padd2 loc34 = loc18, loc21 // t2 = x2 + x5
942 // padd2 loc35 = loc19, loc20 // t3 = x3 + x4
943 // psub2 loc36 = loc16, loc23 // t4 = x0 - x7
944 // psub2 loc37 = loc17, loc22 // t5 = x1 - x6
945 // psub2 loc38 = loc18, loc21 // t6 = x2 - x5
946 // psub2 loc39 = loc19, loc20 // t7 = x3 - x4
947 // ;;
948 // padd2 loc40 = loc37, loc38 // buf0 = t5 + t6
949 // psub2 loc41 = loc37, loc38 // buf1 = t5 - t6
950 // ;;
951 // pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4
952 // pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4
953 // ;;
954 // padd2 loc37 = loc37, loc40 // t5 = t5 + buf0
955 // padd2 loc38 = loc38, loc41 // t6 = t6 + buf1
956 // ;;
957 // padd2 loc16 = loc32, loc35 // x0 = t0 + t3
958 // padd2 loc17 = loc33, loc34 // x1 = t1 + t2
959 // psub2 loc18 = loc32, loc35 // x2 = t0 - t3
960 // psub2 loc19 = loc33, loc34 // x3 = t1 - t2
961 // padd2 loc20 = loc36, loc37 // x4 = t4 + t5
962 // padd2 loc21 = loc38, loc39 // x5 = t6 + t7
963 // psub2 loc22 = loc36, loc37 // x6 = t4 - t5
964 // psub2 loc23 = loc38, loc39 // x7 = t6 - t7
965 // ;;
966 // padd2 loc32 = loc16, loc17 // t0 = x0 + x1
967 // psub2 loc33 = loc16, loc17 // t1 = x0 - x1
968 // pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1
969 // pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1
970 // pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0
971 // pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0
972 // pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2
973 // pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2
974 // ;;
975 // padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6
976 // padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7
977 // ;;
978 // padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1)
979 // psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3
980 // padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0)
981 // psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5
982 // padd2 loc38 = loc22, loc47 // t6 = x6 + (x7 * c2 + x7)
983 // psub2 loc39 = loc46, loc23 // t7 = (c2 * x6 + x6) - x7
984 // ;;
985 // pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4
986 // pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4
987 // pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2
988 // pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2
989 // pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1
990 // pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1
991 // pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3
992 // pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3
993 // ;;
994 // padd2 loc48 = loc16, loc32 // y0 = x0 + t0
995 // padd2 loc49 = loc20, loc36 // y1 = x4 + t4
996 // padd2 loc50 = loc18, loc34 // y2 = x2 + t2
997 // padd2 loc51 = loc22, loc38 // y3 = x6 + t6
998 // padd2 loc52 = loc17, loc33 // y4 = x1 + t1
999 // padd2 loc53 = loc23, loc39 // y5 = x7 + t7
1000 // padd2 loc54 = loc19, loc35 // y6 = x3 + t3
1001 // padd2 loc55 = loc21, loc37 // y7 = x5 + t5
1002 // ;;
1003 //
1004 // // *******************
1005 // // row-DCT 2nd half
1006 // // *******************
1007 //
1008 // padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2
1009 // padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2
1010 // padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2
1011 // padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2
1012 // psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2
1013 // psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2
1014 // psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2
1015 // psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2
1016 // ;;
1017 // padd2 loc40 = loc37, loc38 // buf0 = t5 + t6
1018 // psub2 loc41 = loc37, loc38 // buf1 = t5 - t6
1019 // ;;
1020 // pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4
1021 // pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4
1022 // ;;
1023 // padd2 loc37 = loc37, loc40 // t5 = t5 + buf0
1024 // padd2 loc38 = loc38, loc41 // t6 = t6 + buf1
1025 // ;;
1026 // padd2 loc16 = loc32, loc35 // x0 = t0 + t3
1027 // padd2 loc17 = loc33, loc34 // x1 = t1 + t2
1028 // psub2 loc18 = loc32, loc35 // x2 = t0 - t3
1029 // psub2 loc19 = loc33, loc34 // x3 = t1 - t2
1030 // padd2 loc20 = loc36, loc37 // x4 = t4 + t5
1031 // padd2 loc21 = loc38, loc39 // x5 = t6 + t7
1032 // psub2 loc22 = loc36, loc37 // x6 = t4 - t5
1033 // psub2 loc23 = loc38, loc39 // x7 = t6 - t7
1034 // ;;
1035 // padd2 loc32 = loc16, loc17 // t0 = x0 + x1
1036 // psub2 loc33 = loc16, loc17 // t1 = x0 - x1
1037 // pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1
1038 // pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1
1039 // pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0
1040 // pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0
1041 // pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2
1042 // pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2
1043 // ;;
1044 // padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6
1045 // padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7
1046 // ;;
1047 // padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1)
1048 // psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3
1049 // padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0)
1050 // psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5
1051 // padd2 loc38 = loc22, loc47 // t6 = x6 + (x7 * c2 + x7)
1052 // psub2 loc39 = loc46, loc23 // t7 = (c2 * x6 + x6) - x7
1053 // ;;
1054 // pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4
1055 // pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4
1056 // pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2
1057 // pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2
1058 // pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1
1059 // pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1
1060 // pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3
1061 // pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3
1062 // ;;
1063 // padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0
1064 // padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4
1065 // padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2
1066 // padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6
1067 // padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1
1068 // padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7
1069 // padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3
1070 // padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5
1071 // ;;
1072 // // *******************
1073 // // Transpose matrix
1074 // // *******************
1075 //
1076 // mix2.l loc32 = loc49, loc48 // tmp0 = mixr y1, y0
1077 // mix2.r loc33 = loc49, loc48 // tmp1 = mixl y1, y0
1078 // mix2.l loc34 = loc51, loc50 // tmp2 = mixr y3, y2
1079 // mix2.r loc35 = loc51, loc50 // tmp3 = mixl y3, y2
1080 // ;;
1081 // mix4.l loc16 = loc34, loc32 // x0 = mixr tmp2, tmp0
1082 // mix4.l loc17 = loc35, loc33 // x1 = mixr tmp3, tmp1
1083 // mix4.r loc18 = loc34, loc32 // x2 = mixl tmp2, tmp0
1084 // mix4.r loc19 = loc35, loc33 // x3 = mixl tmp3, tmp1
1085 // ;;
1086 // mix2.l loc32 = loc41, loc40 // tmp0 = mixr y0.2, y1.2
1087 // mix2.r loc33 = loc41, loc40 // tmp1 = mixl y0.2, y1.2
1088 // mix2.l loc34 = loc43, loc42 // tmp2 = mixr y2.2, y3.2
1089 // mix2.r loc35 = loc43, loc42 // tmp3 = mixl y2.2, y3.2
1090 // ;;
1091 // mix4.l loc20 = loc34, loc32 // x4 = mixr tmp2, tmp0
1092 // mix4.l loc21 = loc35, loc33 // x5 = mixr tmp3, tmp1
1093 // mix4.r loc22 = loc34, loc32 // x6 = mixl tmp2, tmp0
1094 // mix4.r loc23 = loc35, loc33 // x7 = mixl tmp3, tmp1
1095 // ;;
1096 // mix2.l loc32 = loc53, loc52 // tmp0 = mixr y5, y4
1097 // mix2.r loc33 = loc53, loc52 // tmp1 = mixl y5, y4
1098 // mix2.l loc34 = loc55, loc54 // tmp2 = mixr y7, y6
1099 // mix2.r loc35 = loc55, loc54 // tmp3 = mixl y7, y6
1100 // ;;
1101 // mix4.l loc24 = loc34, loc32 // x0.2 = mixr tmp2, tmp0
1102 // mix4.l loc25 = loc35, loc33 // x1.2 = mixr tmp3, tmp1
1103 // mix4.r loc26 = loc34, loc32 // x2.2 = mixl tmp2, tmp0
1104 // mix4.r loc27 = loc35, loc33 // x3.2 = mixl tmp3, tmp1
1105 // ;;
1106 // mix2.l loc32 = loc45, loc44 // tmp0 = mixr y4.2, y5.2
1107 // mix2.r loc33 = loc45, loc44 // tmp1 = mixl y4.2, y5.2
1108 // mix2.l loc34 = loc47, loc46 // tmp2 = mixr y7.2, y6.2
1109 // mix2.r loc35 = loc47, loc46 // tmp3 = mixl y7.2, y6.2
1110 // ;;
1111 // mix4.l loc28 = loc34, loc32 // x4.2 = mixr tmp2, tmp0
1112 // mix4.l loc29 = loc35, loc33 // x5.2 = mixr tmp3, tmp1
1113 // mix4.r loc30 = loc34, loc32 // x6.2 = mixl tmp2, tmp0
1114 // mix4.r loc31 = loc35, loc33 // x7.2 = mixl tmp3, tmp1
1115 // ;;
1116 //
1117 // // ********
1118 // // descale
1119 // // ********
1120 //
1121 // padd2 loc16 = loc16, r23
1122 // padd2 loc17 = loc17, r23
1123 // padd2 loc18 = loc18, r23
1124 // padd2 loc19 = loc19, r23
1125 // padd2 loc20 = loc20, r23
1126 // padd2 loc21 = loc21, r23
1127 // padd2 loc22 = loc22, r23
1128 // padd2 loc23 = loc23, r23
1129 // padd2 loc24 = loc24, r23
1130 // padd2 loc25 = loc25, r23
1131 // padd2 loc26 = loc26, r23
1132 // padd2 loc27 = loc27, r23
1133 // padd2 loc28 = loc28, r23
1134 // padd2 loc29 = loc29, r23
1135 // padd2 loc30 = loc30, r23
1136 // padd2 loc31 = loc31, r23
1137 // ;;
1138 // pshr2 loc16 = loc16, 3
1139 // pshr2 loc17 = loc17, 3
1140 // pshr2 loc18 = loc18, 3
1141 // pshr2 loc19 = loc19, 3
1142 // pshr2 loc20 = loc20, 3
1143 // pshr2 loc21 = loc21, 3
1144 // pshr2 loc22 = loc22, 3
1145 // pshr2 loc23 = loc23, 3
1146 // pshr2 loc24 = loc24, 3
1147 // pshr2 loc25 = loc25, 3
1148 // pshr2 loc26 = loc26, 3
1149 // pshr2 loc27 = loc27, 3
1150 // pshr2 loc28 = loc28, 3
1151 // pshr2 loc29 = loc29, 3
1152 // pshr2 loc30 = loc30, 3
1153 // pshr2 loc31 = loc31, 3
1154 // ;;
1155 // // ************
1156 // // Store Matrix
1157 // // ************
1158 // st8 [loc0] = loc16
1159 // st8 [loc1] = loc24
1160 // st8 [loc2] = loc17
1161 // st8 [loc3] = loc25
1162 // st8 [loc4] = loc18
1163 // st8 [loc5] = loc26
1164 // st8 [loc6] = loc19
1165 // st8 [loc7] = loc27
1166 // st8 [loc8] = loc20
1167 // st8 [loc9] = loc28
1168 // st8 [loc10] = loc21
1169 // st8 [loc11] = loc29
1170 // st8 [loc12] = loc22
1171 // st8 [loc13] = loc30
1172 // st8 [loc14] = loc23
1173 // st8 [loc15] = loc31
1174 //
1175 // mov ar.pfs = r14
1176 // br.ret.sptk.many b0
1177 // .endp fdct_ia64#
1178 // .common fdct#,8,8
1179 //