Parent Directory
|
Revision Log
Revision 1855 - (view) (download)
1 : | Isibaar | 1855 | // **************************************************************************** |
2 : | // * | ||
3 : | // * XVID MPEG-4 VIDEO CODEC | ||
4 : | // * - IA64 forward discrete cosine transform - | ||
5 : | // * | ||
6 : | // * Copyright(C) 2002 Stephan Krause, Ingo-Marc Weber, Daniel Kallfass | ||
7 : | // * | ||
8 : | // * This program is free software; you can redistribute it and/or modify it | ||
9 : | // * under the terms of the GNU General Public License as published by | ||
10 : | // * the Free Software Foundation; either version 2 of the License, or | ||
11 : | // * (at your option) any later version. | ||
12 : | // * | ||
13 : | // * This program is distributed in the hope that it will be useful, | ||
14 : | // * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 : | // * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 : | // * GNU General Public License for more details. | ||
17 : | // * | ||
18 : | // * You should have received a copy of the GNU General Public License | ||
19 : | // * along with this program; if not, write to the Free Software | ||
20 : | // * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
21 : | // * | ||
22 : | // * $Id: fdct_ia64.s,v 1.6 2009-02-19 17:07:29 Isibaar Exp $ | ||
23 : | // * | ||
24 : | // ***************************************************************************/ | ||
25 : | ia64p | 302 | // |
26 : | Isibaar | 1855 | // **************************************************************************** |
27 : | // * | ||
28 : | // * fdct_ia64.s, IA-64 optimized forward DCT | ||
29 : | // * | ||
30 : | // * Completed version provided by Intel at AppNote AP-922 | ||
31 : | // * http://developer.intel.com/software/products/college/ia32/strmsimd/ | ||
32 : | // * Copyright (C) 1999 Intel Corporation, | ||
33 : | // * | ||
34 : | // * This version was implemented during an IA-64 practical training at | ||
35 : | // * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/) | ||
36 : | // * | ||
37 : | // ***************************************************************************** | ||
38 : | edgomez | 851 | // |
39 : | Isibaar | 1855 | // ***************************************************************************** |
40 : | // * | ||
41 : | // * Revision history: | ||
42 : | // * | ||
43 : | // * 24.07.2002 Initial Version | ||
44 : | // * | ||
45 : | // ***************************************************************************** | ||
46 : | ia64p | 302 | |
47 : | edgomez | 851 | |
48 : | ia64p | 302 | // This is a fast precise implementation of 8x8 Discrete Cosine Transform |
49 : | // published in Intel Application Note 922 from 1999 and optimized for IA-64. | ||
50 : | // | ||
51 : | // An unoptimized, straightforward version (commented out) can be found at the end of this file. | ||
52 : | |||
53 : | |||
// fdct_ia64: in-place forward DCT of the 8x8 block of 16-bit values addressed by in0 (r32).
//
// Register usage, as set up below:
//   r14          saved ar.pfs (written back just before br.ret)
//   r23-r31      constants; mux2 replicates each one into all four 16-bit lanes
//   loc0-loc15   addresses of the sixteen 8-byte words at in0+0 .. in0+120
//   loc16-loc23  data words loaded from byte offsets 0, 16, .., 112  ("1st half")
//   loc24-loc31  data words loaded from byte offsets 8, 24, .., 120  ("2nd half")
//   loc32-loc55  temporaries (t*, buf*, y* in the comments)
//
// Flow: load, scale by 8 (pshl2 3), column pass on both halves followed by a rounded
// shift right by 2, transpose, row pass on both halves, transpose, rounded shift right
// by 3, store back to the addresses the data was loaded from.
//
// Constants with bit 15 set (c2, g1-g4) are negative in the signed pmpyshr2, i.e. they act
// as (k - 1.0); that is why each such product is followed by a padd2 that adds the
// multiplicand back. c0 and c1 are positive and need no add-back.
//
// The ";;" stops delimit instruction groups. Independent operations of neighbouring stages
// are interleaved on purpose, so the statement order must not be changed casually.
// NOTE(review): .prologue is not followed by .save/.body directives - confirm whether
// unwind information matters for this leaf procedure.
54 : | .pred.safe_across_calls p1-p5,p16-p63 | ||
55 : | .text | ||
56 : | .align 16 | ||
57 : | .global fdct_ia64# | ||
58 : | .proc fdct_ia64# | ||
59 : | fdct_ia64: | ||
60 : | .prologue | ||
61 : | alloc r14 = ar.pfs, 1, 56, 0, 0 | ||
62 : | // Save constants (16-bit fixed point; broadcast to all lanes by the mux2 ops further down) | ||
63 : | mov r31 = 0x32ec // c0 = tan(1pi/16) | ||
64 : | mov r30 = 0x6a0a // c1 = tan(2pi/16) | ||
65 : | mov r29 = 0xab0e // c2 = tan(3pi/16) | ||
66 : | mov r28 = 0xb505 // g4 = cos(4pi/16) | ||
67 : | mov r27 = 0xd4db // g3 = cos(3pi/16) | ||
68 : | mov r26 = 0xec83 // g2 = cos(2pi/16) | ||
69 : | mov r25 = 0xfb15 // g1 = cos(1pi/16) | ||
70 : | mov r24 = 0x0002 // rounding bias for the shift right by 2 after the column pass | ||
71 : | mov r23 = 0x0004 // rounding bias for the final shift right by 3 | ||
72 : | |||
73 : | // Load Matrix into registers: first compute the 16 word addresses, then load | ||
74 : | |||
75 : | add loc0 = r0, r32 | ||
76 : | add loc2 = 16, r32 | ||
77 : | add loc4 = 32, r32 | ||
78 : | add loc6 = 48, r32 | ||
79 : | add loc8 = 64, r32 | ||
80 : | add loc10 = 80, r32 | ||
81 : | add loc12 = 96, r32 | ||
82 : | add loc14 = 112, r32 | ||
83 : | add loc1 = 8, r32 | ||
84 : | add loc3 = 24, r32 | ||
85 : | add loc5 = 40, r32 | ||
86 : | add loc7 = 56, r32 | ||
87 : | add loc9 = 72, r32 | ||
88 : | add loc11 = 88, r32 | ||
89 : | add loc13 = 104, r32 | ||
90 : | add loc15 = 120, r32 | ||
91 : | ;; | ||
// The mux2 ops interleaved with the loads replicate the low 16 bits of each constant
// register into all four lanes.
92 : | ld8 loc16 = [loc0] | ||
93 : | ld8 loc17 = [loc2] | ||
94 : | ld8 loc18 = [loc4] | ||
95 : | ld8 loc19 = [loc6] | ||
96 : | ld8 loc20 = [loc8] | ||
97 : | ld8 loc21 = [loc10] | ||
98 : | ld8 loc22 = [loc12] | ||
99 : | ld8 loc23 = [loc14] | ||
100 : | ld8 loc24 = [loc1] | ||
101 : | ld8 loc25 = [loc3] | ||
102 : | ld8 loc26 = [loc5] | ||
103 : | ld8 loc27 = [loc7] | ||
104 : | mux2 r26 = r26, 0x00 | ||
105 : | ld8 loc28 = [loc9] | ||
106 : | mux2 r31 = r31, 0x00 | ||
107 : | mux2 r25 = r25, 0x00 | ||
108 : | ld8 loc29 = [loc11] | ||
109 : | mux2 r30 = r30, 0x00 | ||
110 : | mux2 r29 = r29, 0x00 | ||
111 : | ld8 loc30 = [loc13] | ||
112 : | mux2 r28 = r28, 0x00 | ||
113 : | mux2 r27 = r27, 0x00 | ||
114 : | ld8 loc31 = [loc15] | ||
115 : | mux2 r24 = r24, 0x00 | ||
116 : | mux2 r23 = r23, 0x00 | ||
117 : | ;; | ||
// Scale every element by 8. The shifts for loc24-loc31 (2nd half) are interleaved with
// the first column pass below.
118 : | pshl2 loc16 = loc16, 3 | ||
119 : | pshl2 loc17 = loc17, 3 | ||
120 : | pshl2 loc18 = loc18, 3 | ||
121 : | pshl2 loc19 = loc19, 3 | ||
122 : | pshl2 loc20 = loc20, 3 | ||
123 : | pshl2 loc21 = loc21, 3 | ||
124 : | pshl2 loc22 = loc22, 3 | ||
125 : | pshl2 loc23 = loc23, 3 | ||
126 : | ;; | ||
127 : | pshl2 loc24 = loc24, 3 | ||
128 : | |||
129 : | // ******************* | ||
130 : | // column-DCT 1st half | ||
131 : | // ******************* | ||
132 : | |||
133 : | psub2 loc37 = loc17, loc22 // t5 = x1 - x6 | ||
134 : | pshl2 loc25 = loc25, 3 | ||
135 : | pshl2 loc26 = loc26, 3 | ||
136 : | psub2 loc38 = loc18, loc21 // t6 = x2 - x5 | ||
137 : | pshl2 loc27 = loc27, 3 | ||
138 : | pshl2 loc28 = loc28, 3 | ||
139 : | ;; | ||
140 : | padd2 loc32 = loc16, loc23 // t0 = x0 + x7 | ||
141 : | pshl2 loc29 = loc29, 3 | ||
142 : | pshl2 loc30 = loc30, 3 | ||
143 : | padd2 loc33 = loc17, loc22 // t1 = x1 + x6 | ||
144 : | padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 | ||
145 : | psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 | ||
146 : | ;; | ||
147 : | padd2 loc34 = loc18, loc21 // t2 = x2 + x5 | ||
148 : | pshl2 loc31 = loc31, 3 | ||
149 : | padd2 loc35 = loc19, loc20 // t3 = x3 + x4 | ||
150 : | psub2 loc36 = loc16, loc23 // t4 = x0 - x7 | ||
151 : | pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 | ||
152 : | pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 | ||
153 : | ;; | ||
154 : | psub2 loc39 = loc19, loc20 // t7 = x3 - x4 | ||
155 : | padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 | ||
156 : | padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 | ||
157 : | |||
158 : | padd2 loc16 = loc32, loc35 // x0 = t0 + t3 | ||
159 : | padd2 loc17 = loc33, loc34 // x1 = t1 + t2 | ||
160 : | psub2 loc18 = loc32, loc35 // x2 = t0 - t3 | ||
161 : | ;; | ||
162 : | psub2 loc19 = loc33, loc34 // x3 = t1 - t2 | ||
163 : | padd2 loc20 = loc36, loc37 // x4 = t4 + t5 | ||
164 : | padd2 loc21 = loc38, loc39 // x5 = t6 + t7 | ||
165 : | psub2 loc22 = loc36, loc37 // x6 = t4 - t5 | ||
166 : | psub2 loc23 = loc38, loc39 // x7 = t6 - t7 | ||
167 : | ;; | ||
168 : | pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 | ||
169 : | padd2 loc32 = loc16, loc17 // t0 = x0 + x1 | ||
170 : | pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 | ||
171 : | pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 | ||
172 : | psub2 loc33 = loc16, loc17 // t1 = x0 - x1 | ||
173 : | pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 | ||
174 : | pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 | ||
175 : | ;; | ||
176 : | padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 | ||
177 : | pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 | ||
178 : | padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 | ||
179 : | padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) | ||
180 : | psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 | ||
181 : | psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 | ||
182 : | ;; | ||
183 : | padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) | ||
184 : | padd2 loc38 = loc22, loc47 // t6 = x6 + (x7 * c2) | ||
185 : | pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 | ||
186 : | pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 | ||
187 : | psub2 loc39 = loc46, loc23 // t7 = (c2 * x6) - x7 | ||
188 : | ;; | ||
189 : | padd2 loc48 = loc16, loc32 // y0 = x0 + t0 | ||
190 : | pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 | ||
191 : | pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 | ||
192 : | padd2 loc52 = loc17, loc33 // y4 = x1 + t1 | ||
193 : | pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 | ||
194 : | pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 | ||
195 : | ;; | ||
196 : | padd2 loc50 = loc18, loc34 // y2 = x2 + t2 | ||
197 : | pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 | ||
198 : | pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 | ||
199 : | padd2 loc55 = loc21, loc37 // y7 = x5 + t5 | ||
200 : | padd2 loc49 = loc20, loc36 // y1 = x4 + t4 | ||
201 : | padd2 loc54 = loc19, loc35 // y6 = x3 + t3 | ||
202 : | ;; | ||
203 : | padd2 loc51 = loc22, loc38 // y3 = x6 + t6 | ||
204 : | padd2 loc53 = loc23, loc39 // y5 = x7 + t7 | ||
205 : | |||
206 : | // divide by 4 with rounding: add the bias in r24 (2), then shift right by 2 | ||
207 : | |||
208 : | padd2 loc48 = loc48, r24 | ||
209 : | padd2 loc49 = loc49, r24 | ||
210 : | padd2 loc50 = loc50, r24 | ||
211 : | padd2 loc52 = loc52, r24 | ||
212 : | ;; | ||
213 : | padd2 loc51 = loc51, r24 | ||
214 : | pshr2 loc48 = loc48, 2 | ||
215 : | padd2 loc53 = loc53, r24 | ||
216 : | pshr2 loc49 = loc49, 2 | ||
217 : | padd2 loc54 = loc54, r24 | ||
218 : | pshr2 loc50 = loc50, 2 | ||
219 : | padd2 loc55 = loc55, r24 | ||
220 : | pshr2 loc52 = loc52, 2 | ||
221 : | ;; | ||
222 : | pshr2 loc51 = loc51, 2 | ||
223 : | pshr2 loc53 = loc53, 2 | ||
224 : | pshr2 loc54 = loc54, 2 | ||
225 : | pshr2 loc55 = loc55, 2 | ||
226 : | |||
227 : | |||
228 : | // ******************* | ||
229 : | // column-DCT 2nd half | ||
230 : | // ******************* | ||
231 : | |||
232 : | psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2 | ||
233 : | psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2 | ||
234 : | padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2 | ||
235 : | padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2 | ||
236 : | ;; | ||
237 : | padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2 | ||
238 : | psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 | ||
239 : | padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 | ||
240 : | padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2 | ||
241 : | ;; | ||
242 : | psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2 | ||
243 : | pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 | ||
244 : | pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 | ||
245 : | ;; | ||
246 : | psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2 | ||
247 : | padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 | ||
248 : | padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 | ||
249 : | |||
250 : | padd2 loc16 = loc32, loc35 // x0 = t0 + t3 | ||
251 : | padd2 loc17 = loc33, loc34 // x1 = t1 + t2 | ||
252 : | psub2 loc18 = loc32, loc35 // x2 = t0 - t3 | ||
253 : | ;; | ||
254 : | psub2 loc19 = loc33, loc34 // x3 = t1 - t2 | ||
255 : | padd2 loc20 = loc36, loc37 // x4 = t4 + t5 | ||
256 : | padd2 loc21 = loc38, loc39 // x5 = t6 + t7 | ||
257 : | psub2 loc22 = loc36, loc37 // x6 = t4 - t5 | ||
258 : | psub2 loc23 = loc38, loc39 // x7 = t6 - t7 | ||
259 : | ;; | ||
260 : | pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 | ||
261 : | padd2 loc32 = loc16, loc17 // t0 = x0 + x1 | ||
262 : | pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 | ||
263 : | pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 | ||
264 : | psub2 loc33 = loc16, loc17 // t1 = x0 - x1 | ||
265 : | pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 | ||
266 : | pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 | ||
267 : | pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 | ||
268 : | ;; | ||
269 : | padd2 loc34 = loc18, loc43 // t2 = x2 + buf3 | ||
270 : | padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 | ||
271 : | padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 | ||
272 : | psub2 loc35 = loc42, loc19 // t3 = buf2 - x3 | ||
273 : | padd2 loc36 = loc20, loc45 // t4 = x4 + buf5 | ||
274 : | pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 | ||
275 : | ;; | ||
276 : | psub2 loc37 = loc44, loc21 // t5 = buf4 - x5 | ||
277 : | padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 | ||
278 : | psub2 loc39 = loc46, loc23 // t7 = buf6 - x7 | ||
279 : | pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 | ||
280 : | ;; | ||
281 : | padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0 | ||
282 : | pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 | ||
283 : | pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 | ||
284 : | padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1 | ||
285 : | pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 | ||
286 : | pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 | ||
287 : | ;; | ||
288 : | padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2 | ||
289 : | pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 | ||
290 : | pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 | ||
291 : | padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5 | ||
292 : | padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4 | ||
293 : | padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3 | ||
294 : | ;; | ||
295 : | padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6 | ||
296 : | |||
297 : | // ******************* | ||
298 : | // transpose matrix (interleaved with the tail of the 2nd-half column pass) | ||
299 : | // ******************* | ||
300 : | |||
301 : | mix2.r loc32 = loc48, loc49 // tmp0 = mixr y0, y1 | ||
302 : | mix2.l loc33 = loc48, loc49 // tmp1 = mixl y0, y1 | ||
303 : | padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7 | ||
304 : | mix2.r loc34 = loc50, loc51 // tmp2 = mixr y2, y3 | ||
305 : | mix2.l loc35 = loc50, loc51 // tmp3 = mixl y2, y3 | ||
306 : | ;; | ||
307 : | |||
308 : | // divide the 2nd-half results by 4 with rounding (bias r24, then shift right by 2) | ||
309 : | |||
310 : | padd2 loc40 = loc40, r24 | ||
311 : | padd2 loc41 = loc41, r24 | ||
312 : | mix4.r loc16 = loc32, loc34 // x0 = mixr tmp0, tmp2 | ||
313 : | padd2 loc42 = loc42, r24 | ||
314 : | padd2 loc43 = loc43, r24 | ||
315 : | mix4.r loc17 = loc33, loc35 // x1 = mixr tmp1, tmp3 | ||
316 : | padd2 loc44 = loc44, r24 | ||
317 : | padd2 loc45 = loc45, r24 | ||
318 : | mix4.l loc18 = loc32, loc34 // x2 = mixl tmp0, tmp2 | ||
319 : | padd2 loc46 = loc46, r24 | ||
320 : | padd2 loc47 = loc47, r24 | ||
321 : | mix4.l loc19 = loc33, loc35 // x3 = mixl tmp1, tmp3 | ||
322 : | ;; | ||
323 : | pshr2 loc40 = loc40, 2 | ||
324 : | pshr2 loc41 = loc41, 2 | ||
325 : | pshr2 loc42 = loc42, 2 | ||
326 : | pshr2 loc43 = loc43, 2 | ||
327 : | mix2.r loc32 = loc52, loc53 // tmp0 = mixr y4, y5 | ||
328 : | mix2.l loc33 = loc52, loc53 // tmp1 = mixl y4, y5 | ||
329 : | mix2.r loc34 = loc54, loc55 // tmp2 = mixr y6, y7 | ||
330 : | mix2.l loc35 = loc54, loc55 // tmp3 = mixl y6, y7 | ||
331 : | ;; | ||
332 : | pshr2 loc44 = loc44, 2 | ||
333 : | pshr2 loc45 = loc45, 2 | ||
334 : | pshr2 loc46 = loc46, 2 | ||
335 : | pshr2 loc47 = loc47, 2 | ||
336 : | mix4.r loc24 = loc32, loc34 // x0.2 = mixr tmp0, tmp2 | ||
337 : | mix4.r loc25 = loc33, loc35 // x1.2 = mixr tmp1, tmp3 | ||
338 : | mix4.l loc26 = loc32, loc34 // x2.2 = mixl tmp0, tmp2 | ||
339 : | mix4.l loc27 = loc33, loc35 // x3.2 = mixl tmp1, tmp3 | ||
340 : | ;; | ||
341 : | mix2.r loc32 = loc40, loc41 // tmp0 = mixr y0.2, y1.2 | ||
342 : | mix2.l loc33 = loc40, loc41 // tmp1 = mixl y0.2, y1.2 | ||
343 : | mix2.r loc34 = loc42, loc43 // tmp2 = mixr y2.2, y3.2 | ||
344 : | mix2.l loc35 = loc42, loc43 // tmp3 = mixl y2.2, y3.2 | ||
345 : | ;; | ||
346 : | mix4.r loc20 = loc32, loc34 // x4 = mixr tmp0, tmp2 | ||
347 : | mix4.r loc21 = loc33, loc35 // x5 = mixr tmp1, tmp3 | ||
348 : | mix4.l loc22 = loc32, loc34 // x6 = mixl tmp0, tmp2 | ||
349 : | mix4.l loc23 = loc33, loc35 // x7 = mixl tmp1, tmp3 | ||
350 : | ;; | ||
351 : | mix2.r loc32 = loc44, loc45 // tmp0 = mixr y4.2, y5.2 | ||
352 : | mix2.l loc33 = loc44, loc45 // tmp1 = mixl y4.2, y5.2 | ||
353 : | mix2.r loc34 = loc46, loc47 // tmp2 = mixr y6.2, y7.2 | ||
354 : | mix2.l loc35 = loc46, loc47 // tmp3 = mixl y6.2, y7.2 | ||
355 : | ;; | ||
356 : | mix4.r loc28 = loc32, loc34 // x4.2 = mixr tmp0, tmp2 | ||
357 : | mix4.r loc29 = loc33, loc35 // x5.2 = mixr tmp1, tmp3 | ||
358 : | mix4.l loc30 = loc32, loc34 // x6.2 = mixl tmp0, tmp2 | ||
359 : | mix4.l loc31 = loc33, loc35 // x7.2 = mixl tmp1, tmp3 | ||
360 : | |||
361 : | // ******************* | ||
362 : | // row-DCT 1st half | ||
363 : | // ******************* | ||
364 : | |||
365 : | psub2 loc37 = loc17, loc22 // t5 = x1 - x6 | ||
366 : | psub2 loc38 = loc18, loc21 // t6 = x2 - x5 | ||
367 : | ;; | ||
368 : | padd2 loc32 = loc16, loc23 // t0 = x0 + x7 | ||
369 : | padd2 loc33 = loc17, loc22 // t1 = x1 + x6 | ||
370 : | padd2 loc34 = loc18, loc21 // t2 = x2 + x5 | ||
371 : | psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 | ||
372 : | padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 | ||
373 : | padd2 loc35 = loc19, loc20 // t3 = x3 + x4 | ||
374 : | ;; | ||
375 : | psub2 loc36 = loc16, loc23 // t4 = x0 - x7 | ||
376 : | pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 | ||
377 : | pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 | ||
378 : | ;; | ||
379 : | psub2 loc39 = loc19, loc20 // t7 = x3 - x4 | ||
380 : | padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 | ||
381 : | padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 | ||
382 : | |||
383 : | padd2 loc16 = loc32, loc35 // x0 = t0 + t3 | ||
384 : | padd2 loc17 = loc33, loc34 // x1 = t1 + t2 | ||
385 : | psub2 loc18 = loc32, loc35 // x2 = t0 - t3 | ||
386 : | ;; | ||
387 : | psub2 loc19 = loc33, loc34 // x3 = t1 - t2 | ||
388 : | padd2 loc20 = loc36, loc37 // x4 = t4 + t5 | ||
389 : | padd2 loc21 = loc38, loc39 // x5 = t6 + t7 | ||
390 : | psub2 loc22 = loc36, loc37 // x6 = t4 - t5 | ||
391 : | psub2 loc23 = loc38, loc39 // x7 = t6 - t7 | ||
392 : | ;; | ||
393 : | pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 | ||
394 : | padd2 loc32 = loc16, loc17 // t0 = x0 + x1 | ||
395 : | pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 | ||
396 : | pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 | ||
397 : | psub2 loc33 = loc16, loc17 // t1 = x0 - x1 | ||
398 : | pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 | ||
399 : | pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 | ||
400 : | pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 | ||
401 : | ;; | ||
402 : | padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 | ||
403 : | padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 | ||
404 : | padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) | ||
405 : | ;; | ||
406 : | psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 | ||
407 : | padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) | ||
408 : | psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 | ||
409 : | padd2 loc38 = loc22, loc47 // t6 = x6 + (x7 * c2) | ||
410 : | psub2 loc39 = loc46, loc23 // t7 = (c2 * x6) - x7 | ||
411 : | pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 | ||
412 : | pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 | ||
413 : | ;; | ||
414 : | padd2 loc48 = loc16, loc32 // y0 = x0 + t0 | ||
415 : | pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 | ||
416 : | pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 | ||
417 : | padd2 loc52 = loc17, loc33 // y4 = x1 + t1 | ||
418 : | pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 | ||
419 : | pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 | ||
420 : | ;; | ||
421 : | padd2 loc50 = loc18, loc34 // y2 = x2 + t2 | ||
422 : | pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 | ||
423 : | pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 | ||
424 : | padd2 loc55 = loc21, loc37 // y7 = x5 + t5 | ||
425 : | padd2 loc49 = loc20, loc36 // y1 = x4 + t4 | ||
426 : | padd2 loc54 = loc19, loc35 // y6 = x3 + t3 | ||
427 : | ;; | ||
428 : | padd2 loc51 = loc22, loc38 // y3 = x6 + t6 | ||
429 : | padd2 loc53 = loc23, loc39 // y5 = x7 + t7 | ||
430 : | |||
431 : | // ******************* | ||
432 : | // row-DCT 2nd half | ||
433 : | // ******************* | ||
434 : | |||
435 : | psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2 | ||
436 : | psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2 | ||
437 : | padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2 | ||
438 : | padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2 | ||
439 : | ;; | ||
440 : | padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2 | ||
441 : | psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 | ||
442 : | padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 | ||
443 : | padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2 | ||
444 : | ;; | ||
445 : | psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2 | ||
446 : | pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 | ||
447 : | pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 | ||
448 : | ;; | ||
449 : | psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2 | ||
450 : | padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 | ||
451 : | padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 | ||
452 : | |||
453 : | padd2 loc16 = loc32, loc35 // x0 = t0 + t3 | ||
454 : | padd2 loc17 = loc33, loc34 // x1 = t1 + t2 | ||
455 : | psub2 loc18 = loc32, loc35 // x2 = t0 - t3 | ||
456 : | ;; | ||
457 : | psub2 loc19 = loc33, loc34 // x3 = t1 - t2 | ||
458 : | padd2 loc20 = loc36, loc37 // x4 = t4 + t5 | ||
459 : | padd2 loc21 = loc38, loc39 // x5 = t6 + t7 | ||
460 : | psub2 loc22 = loc36, loc37 // x6 = t4 - t5 | ||
461 : | psub2 loc23 = loc38, loc39 // x7 = t6 - t7 | ||
462 : | ;; | ||
463 : | pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 | ||
464 : | padd2 loc32 = loc16, loc17 // t0 = x0 + x1 | ||
465 : | pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 | ||
466 : | pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 | ||
467 : | psub2 loc33 = loc16, loc17 // t1 = x0 - x1 | ||
468 : | pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 | ||
469 : | pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 | ||
470 : | pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 | ||
471 : | ;; | ||
472 : | padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 | ||
473 : | padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 | ||
474 : | padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) | ||
475 : | ;; | ||
476 : | psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 | ||
477 : | padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) | ||
478 : | psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 | ||
479 : | padd2 loc38 = loc22, loc47 // t6 = x6 + (x7 * c2) | ||
480 : | psub2 loc39 = loc46, loc23 // t7 = (c2 * x6) - x7 | ||
481 : | pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 | ||
482 : | pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 | ||
483 : | ;; | ||
484 : | padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0 | ||
485 : | pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 | ||
486 : | pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 | ||
487 : | padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1 | ||
488 : | pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 | ||
489 : | pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 | ||
490 : | ;; | ||
491 : | padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2 | ||
492 : | pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 | ||
493 : | pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 | ||
494 : | padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3 | ||
495 : | nop.i 0x0 | ||
496 : | nop.i 0x0 | ||
497 : | ;; | ||
498 : | |||
499 : | // ******************* | ||
500 : | // Transpose matrix | ||
501 : | // ******************* | ||
// Compared with the first transpose, the source operands are swapped and the .l/.r forms
// are exchanged. The remaining row-pass sums and the "+ r23" rounding bias for the final
// descale are interleaved with the mix operations.
502 : | padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4 | ||
503 : | mix2.l loc32 = loc49, loc48 // tmp0 = mixl y1, y0 | ||
504 : | mix2.r loc33 = loc49, loc48 // tmp1 = mixr y1, y0 | ||
505 : | padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5 | ||
506 : | mix2.l loc34 = loc51, loc50 // tmp2 = mixl y3, y2 | ||
507 : | mix2.r loc35 = loc51, loc50 // tmp3 = mixr y3, y2 | ||
508 : | ;; | ||
509 : | padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6 | ||
510 : | mix4.l loc16 = loc34, loc32 // x0 = mixl tmp2, tmp0 | ||
511 : | mix4.l loc17 = loc35, loc33 // x1 = mixl tmp3, tmp1 | ||
512 : | padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7 | ||
513 : | mix4.r loc18 = loc34, loc32 // x2 = mixr tmp2, tmp0 | ||
514 : | mix4.r loc19 = loc35, loc33 // x3 = mixr tmp3, tmp1 | ||
515 : | ;; | ||
516 : | padd2 loc16 = loc16, r23 | ||
517 : | mix2.l loc32 = loc41, loc40 // tmp0 = mixl y1.2, y0.2 | ||
518 : | mix2.r loc33 = loc41, loc40 // tmp1 = mixr y1.2, y0.2 | ||
519 : | padd2 loc17 = loc17, r23 | ||
520 : | mix2.l loc34 = loc43, loc42 // tmp2 = mixl y3.2, y2.2 | ||
521 : | mix2.r loc35 = loc43, loc42 // tmp3 = mixr y3.2, y2.2 | ||
522 : | ;; | ||
523 : | padd2 loc18 = loc18, r23 | ||
524 : | mix4.l loc20 = loc34, loc32 // x4 = mixl tmp2, tmp0 | ||
525 : | mix4.l loc21 = loc35, loc33 // x5 = mixl tmp3, tmp1 | ||
526 : | padd2 loc19 = loc19, r23 | ||
527 : | mix4.r loc22 = loc34, loc32 // x6 = mixr tmp2, tmp0 | ||
528 : | mix4.r loc23 = loc35, loc33 // x7 = mixr tmp3, tmp1 | ||
529 : | ;; | ||
530 : | padd2 loc20 = loc20, r23 | ||
531 : | mix2.l loc32 = loc53, loc52 // tmp0 = mixl y5, y4 | ||
532 : | mix2.r loc33 = loc53, loc52 // tmp1 = mixr y5, y4 | ||
533 : | padd2 loc21 = loc21, r23 | ||
534 : | mix2.l loc34 = loc55, loc54 // tmp2 = mixl y7, y6 | ||
535 : | mix2.r loc35 = loc55, loc54 // tmp3 = mixr y7, y6 | ||
536 : | ;; | ||
537 : | padd2 loc22 = loc22, r23 | ||
538 : | mix4.l loc24 = loc34, loc32 // x0.2 = mixl tmp2, tmp0 | ||
539 : | mix4.l loc25 = loc35, loc33 // x1.2 = mixl tmp3, tmp1 | ||
540 : | padd2 loc23 = loc23, r23 | ||
541 : | mix4.r loc26 = loc34, loc32 // x2.2 = mixr tmp2, tmp0 | ||
542 : | mix4.r loc27 = loc35, loc33 // x3.2 = mixr tmp3, tmp1 | ||
543 : | ;; | ||
544 : | padd2 loc24 = loc24, r23 | ||
545 : | mix2.l loc32 = loc45, loc44 // tmp0 = mixl y5.2, y4.2 | ||
546 : | mix2.r loc33 = loc45, loc44 // tmp1 = mixr y5.2, y4.2 | ||
547 : | padd2 loc25 = loc25, r23 | ||
548 : | mix2.l loc34 = loc47, loc46 // tmp2 = mixl y7.2, y6.2 | ||
549 : | mix2.r loc35 = loc47, loc46 // tmp3 = mixr y7.2, y6.2 | ||
550 : | ;; | ||
551 : | padd2 loc26 = loc26, r23 | ||
552 : | mix4.l loc28 = loc34, loc32 // x4.2 = mixl tmp2, tmp0 | ||
553 : | mix4.l loc29 = loc35, loc33 // x5.2 = mixl tmp3, tmp1 | ||
554 : | padd2 loc27 = loc27, r23 | ||
555 : | mix4.r loc30 = loc34, loc32 // x6.2 = mixr tmp2, tmp0 | ||
556 : | mix4.r loc31 = loc35, loc33 // x7.2 = mixr tmp3, tmp1 | ||
557 : | ;; | ||
558 : | // ******************* | ||
559 : | // Descale: finish adding the rounding bias r23 (4), then shift right by 3 | ||
560 : | // ******************* | ||
561 : | padd2 loc28 = loc28, r23 | ||
562 : | pshr2 loc16 = loc16, 3 | ||
563 : | pshr2 loc17 = loc17, 3 | ||
564 : | padd2 loc29 = loc29, r23 | ||
565 : | pshr2 loc18 = loc18, 3 | ||
566 : | pshr2 loc19 = loc19, 3 | ||
567 : | padd2 loc30 = loc30, r23 | ||
568 : | pshr2 loc20 = loc20, 3 | ||
569 : | pshr2 loc21 = loc21, 3 | ||
570 : | padd2 loc31 = loc31, r23 | ||
571 : | pshr2 loc22 = loc22, 3 | ||
572 : | pshr2 loc23 = loc23, 3 | ||
573 : | ;; | ||
574 : | pshr2 loc24 = loc24, 3 | ||
575 : | pshr2 loc25 = loc25, 3 | ||
576 : | pshr2 loc26 = loc26, 3 | ||
577 : | pshr2 loc27 = loc27, 3 | ||
578 : | pshr2 loc28 = loc28, 3 | ||
579 : | pshr2 loc29 = loc29, 3 | ||
580 : | pshr2 loc30 = loc30, 3 | ||
581 : | pshr2 loc31 = loc31, 3 | ||
582 : | ;; | ||
583 : | // ******************* | ||
584 : | // Store matrix (same 16 addresses the data was loaded from) | ||
585 : | // ******************* | ||
586 : | st8 [loc0] = loc16 | ||
587 : | st8 [loc1] = loc24 | ||
588 : | st8 [loc2] = loc17 | ||
589 : | st8 [loc3] = loc25 | ||
590 : | st8 [loc4] = loc18 | ||
591 : | st8 [loc5] = loc26 | ||
592 : | st8 [loc6] = loc19 | ||
593 : | st8 [loc7] = loc27 | ||
594 : | st8 [loc8] = loc20 | ||
595 : | st8 [loc9] = loc28 | ||
596 : | st8 [loc10] = loc21 | ||
597 : | st8 [loc11] = loc29 | ||
598 : | st8 [loc12] = loc22 | ||
599 : | st8 [loc13] = loc30 | ||
600 : | st8 [loc14] = loc23 | ||
601 : | st8 [loc15] = loc31 | ||
602 : | |||
// Restore the frame state saved by alloc and return to the caller.
603 : | mov ar.pfs = r14 | ||
604 : | br.ret.sptk.many b0 | ||
605 : | .endp fdct_ia64# | ||
// NOTE(review): declares a common symbol "fdct" (size 8, alignment 8); nothing in the
// procedure above references it - confirm whether it is still needed.
606 : | .common fdct#,8,8 | ||
607 : | |||
608 : | |||
609 : | |||
610 : | |||
611 : | |||
612 : | |||
613 : | |||
614 : | |||
615 : | //*********************************************** | ||
616 : | //* Here is a version of the DCT implementation * | ||
617 : | //* unoptimized in terms of command ordering. * | ||
618 : | //* This version is about 30% slower but * | ||
619 : | //* easier to understand. * | ||
620 : | //*********************************************** | ||
621 : | // | ||
622 : | // .pred.safe_across_calls p1-p5,p16-p63 | ||
623 : | //.text | ||
624 : | // .align 16 | ||
625 : | // .global fdct_ia64# | ||
626 : | // .proc fdct_ia64# | ||
627 : | //fdct_ia64: | ||
628 : | // .prologue | ||
629 : | // alloc r14 = ar.pfs, 1, 56, 0, 0 | ||
630 : | // | ||
631 : | // // ******************* | ||
632 : | // // Save constants | ||
633 : | // // ******************* | ||
634 : | // mov r31 = 0x32ec // c0 = tan(1pi/16) | ||
635 : | // mov r30 = 0x6a0a // c1 = tan(2pi/16) | ||
636 : | // mov r29 = 0xab0e // c2 = tan(3pi/16) | ||
637 : | // mov r28 = 0xb505 // g4 = cos(4pi/16) | ||
638 : | // mov r27 = 0xd4db // g3 = cos(3pi/16) | ||
639 : | // mov r26 = 0xec83 // g2 = cos(2pi/16) | ||
640 : | // mov r25 = 0xfb15 // g1 = cos(1pi/16) | ||
641 : | // mov r24 = 0x0002 // correction bit for descaling | ||
642 : | // mov r23 = 0x0004 // correction bit for descaling | ||
643 : | // | ||
644 : | // // ************************** | ||
645 : | // // Load Matrix into registers | ||
646 : | // // ************************** | ||
647 : | // | ||
648 : | // add loc0 = r0, r32 | ||
649 : | // ;; | ||
650 : | // mux2 r31 = r31, 0x00 | ||
651 : | // mux2 r30 = r30, 0x00 | ||
652 : | // mux2 r29 = r29, 0x00 | ||
653 : | // mux2 r28 = r28, 0x00 | ||
654 : | // mux2 r27 = r27, 0x00 | ||
655 : | // mux2 r26 = r26, 0x00 | ||
656 : | // mux2 r25 = r25, 0x00 | ||
657 : | // mux2 r24 = r24, 0x00 | ||
658 : | // mux2 r23 = r23, 0x00 | ||
659 : | // ld8 loc16 = [loc0] | ||
660 : | // add loc2 = 16, r32 | ||
661 : | // add loc4 = 32, r32 | ||
662 : | // add loc6 = 48, r32 | ||
663 : | // add loc8 = 64, r32 | ||
664 : | // add loc10 = 80, r32 | ||
665 : | // ;; | ||
666 : | // ld8 loc17 = [loc2] | ||
667 : | // ld8 loc18 = [loc4] | ||
668 : | // add loc12 = 96, r32 | ||
669 : | // ld8 loc19 = [loc6] | ||
670 : | // ld8 loc20 = [loc8] | ||
671 : | // add loc14 = 112, r32 | ||
672 : | // ;; | ||
673 : | // ld8 loc21 = [loc10] | ||
674 : | // ld8 loc22 = [loc12] | ||
675 : | // add loc1 = 8, r32 | ||
676 : | // ld8 loc23 = [loc14] | ||
677 : | // add loc3 = 24, r32 | ||
678 : | // add loc5 = 40, r32 | ||
679 : | // ;; | ||
680 : | // ld8 loc24 = [loc1] | ||
681 : | // ld8 loc25 = [loc3] | ||
682 : | // add loc7 = 56, r32 | ||
683 : | // ld8 loc26 = [loc5] | ||
684 : | // add loc9 = 72, r32 | ||
685 : | // add loc11 = 88, r32 | ||
686 : | // ;; | ||
687 : | // ld8 loc27 = [loc7] | ||
688 : | // ld8 loc28 = [loc9] | ||
689 : | // add loc13 = 104, r32 | ||
690 : | // ld8 loc29 = [loc11] | ||
691 : | // add loc15 = 120, r32 | ||
692 : | // ;; | ||
693 : | // ld8 loc30 = [loc13] | ||
694 : | // ld8 loc31 = [loc15] | ||
695 : | // ;; | ||
696 : | // // ****** | ||
697 : | // // Scale | ||
698 : | // // ****** | ||
699 : | // pshl2 loc16 = loc16, 3 | ||
700 : | // pshl2 loc17 = loc17, 3 | ||
701 : | // pshl2 loc18 = loc18, 3 | ||
702 : | // pshl2 loc19 = loc19, 3 | ||
703 : | // pshl2 loc20 = loc20, 3 | ||
704 : | // pshl2 loc21 = loc21, 3 | ||
705 : | // pshl2 loc22 = loc22, 3 | ||
706 : | // pshl2 loc23 = loc23, 3 | ||
707 : | // pshl2 loc24 = loc24, 3 | ||
708 : | // pshl2 loc25 = loc25, 3 | ||
709 : | // pshl2 loc26 = loc26, 3 | ||
710 : | // pshl2 loc27 = loc27, 3 | ||
711 : | // pshl2 loc28 = loc28, 3 | ||
712 : | // pshl2 loc29 = loc29, 3 | ||
713 : | // pshl2 loc30 = loc30, 3 | ||
714 : | // pshl2 loc31 = loc31, 3 | ||
715 : | // ;; | ||
716 : | // | ||
717 : | // // ******************* | ||
718 : | // // column-DCT 1st half | ||
719 : | // // ******************* | ||
720 : | // | ||
721 : | // padd2 loc32 = loc16, loc23 // t0 = x0 + x7 | ||
722 : | // padd2 loc33 = loc17, loc22 // t1 = x1 + x6 | ||
723 : | // padd2 loc34 = loc18, loc21 // t2 = x2 + x5 | ||
724 : | // padd2 loc35 = loc19, loc20 // t3 = x3 + x4 | ||
725 : | // psub2 loc36 = loc16, loc23 // t4 = x0 - x7 | ||
726 : | // psub2 loc37 = loc17, loc22 // t5 = x1 - x6 | ||
727 : | // psub2 loc38 = loc18, loc21 // t6 = x2 - x5 | ||
728 : | // psub2 loc39 = loc19, loc20 // t7 = x3 - x4 | ||
729 : | // ;; | ||
730 : | // padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 | ||
731 : | // psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 | ||
732 : | // ;; | ||
733 : | // pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 | ||
734 : | // pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 | ||
735 : | // ;; | ||
736 : | // padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 | ||
737 : | // padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 | ||
738 : | // ;; | ||
739 : | // padd2 loc16 = loc32, loc35 // x0 = t0 + t3 | ||
740 : | // padd2 loc17 = loc33, loc34 // x1 = t1 + t2 | ||
741 : | // psub2 loc18 = loc32, loc35 // x2 = t0 - t3 | ||
742 : | // psub2 loc19 = loc33, loc34 // x3 = t1 - t2 | ||
743 : | // padd2 loc20 = loc36, loc37 // x4 = t4 + t5 | ||
744 : | // padd2 loc21 = loc38, loc39 // x5 = t6 + t7 | ||
745 : | // psub2 loc22 = loc36, loc37 // x6 = t4 - t5 | ||
746 : | // psub2 loc23 = loc38, loc39 // x7 = t6 - t7 | ||
747 : | // ;; | ||
748 : | // | ||
749 : | // padd2 loc32 = loc16, loc17 // t0 = x0 + x1 | ||
750 : | // psub2 loc33 = loc16, loc17 // t1 = x0 - x1 | ||
751 : | // pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 | ||
752 : | // pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 | ||
753 : | // pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 | ||
754 : | // pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 | ||
755 : | // pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 | ||
756 : | // pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 | ||
757 : | // ;; | ||
758 : | // padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 | ||
759 : | // padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 | ||
760 : | // ;; | ||
761 : | // padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) | ||
762 : | // psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 | ||
763 : | // padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) | ||
764 : | // psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 | ||
765 : | // padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 (x7 * c2 + x7) | ||
766 : | // psub2 loc39 = loc46, loc23 // t7 = buf6 (x6 * c2 + x6) - x7 | ||
767 : | // ;; | ||
768 : | // pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 | ||
769 : | // pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 | ||
770 : | // pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 | ||
771 : | // pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 | ||
772 : | // pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 | ||
773 : | // pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 | ||
774 : | // pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 | ||
775 : | // pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 | ||
776 : | // ;; | ||
777 : | // padd2 loc48 = loc16, loc32 // y0 = x0 + t0 | ||
778 : | // padd2 loc49 = loc20, loc36 // y1 = x4 + t4 | ||
779 : | // padd2 loc50 = loc18, loc34 // y2 = x2 + t2 | ||
780 : | // padd2 loc51 = loc22, loc38 // y3 = x6 + t6 | ||
781 : | // padd2 loc52 = loc17, loc33 // y4 = x1 + t1 | ||
782 : | // padd2 loc53 = loc23, loc39 // y5 = x7 + t7 | ||
783 : | // padd2 loc54 = loc19, loc35 // y6 = x3 + t3 | ||
784 : | // padd2 loc55 = loc21, loc37 // y7 = x5 + t5 | ||
785 : | // ;; | ||
786 : | // | ||
787 : | // // ******************* | ||
788 : | // // column-DCT 2nd half | ||
789 : | // // ******************* | ||
790 : | // | ||
791 : | // padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2 | ||
792 : | // padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2 | ||
793 : | // padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2 | ||
794 : | // padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2 | ||
795 : | // psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2 | ||
796 : | // psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2 | ||
797 : | // psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2 | ||
798 : | // psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2 | ||
799 : | // ;; | ||
800 : | // padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 | ||
801 : | // psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 | ||
802 : | // ;; | ||
803 : | // pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 | ||
804 : | // pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 | ||
805 : | // ;; | ||
806 : | // padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 | ||
807 : | // padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 | ||
808 : | // ;; | ||
809 : | // padd2 loc16 = loc32, loc35 // x0 = t0 + t3 | ||
810 : | // padd2 loc17 = loc33, loc34 // x1 = t1 + t2 | ||
811 : | // psub2 loc18 = loc32, loc35 // x2 = t0 - t3 | ||
812 : | // psub2 loc19 = loc33, loc34 // x3 = t1 - t2 | ||
813 : | // padd2 loc20 = loc36, loc37 // x4 = t4 + t5 | ||
814 : | // padd2 loc21 = loc38, loc39 // x5 = t6 + t7 | ||
815 : | // psub2 loc22 = loc36, loc37 // x6 = t4 - t5 | ||
816 : | // psub2 loc23 = loc38, loc39 // x7 = t6 - t7 | ||
817 : | // ;; | ||
818 : | // padd2 loc32 = loc16, loc17 // t0 = x0 + x1 | ||
819 : | // psub2 loc33 = loc16, loc17 // t1 = x0 - x1 | ||
820 : | // pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 | ||
821 : | // pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 | ||
822 : | // pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 | ||
823 : | // pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 | ||
824 : | // pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 | ||
825 : | // pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 | ||
826 : | // ;; | ||
827 : | // padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 | ||
828 : | // padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 | ||
829 : | // ;; | ||
830 : | // padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) | ||
831 : | // psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 | ||
832 : | // padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) | ||
833 : | // psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 | ||
834 : | // padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 (x7 * c2 + x7) | ||
835 : | // psub2 loc39 = loc46, loc23 // t7 = buf6 (x6 * c2 + x6) - x7 | ||
836 : | // ;; | ||
837 : | // pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 | ||
838 : | // pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 | ||
839 : | // pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 | ||
840 : | // pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 | ||
841 : | // pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 | ||
842 : | // pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 | ||
843 : | // pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 | ||
844 : | // pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 | ||
845 : | // ;; | ||
846 : | // padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0 | ||
847 : | // padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4 | ||
848 : | // padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2 | ||
849 : | // padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6 | ||
850 : | // padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1 | ||
851 : | // padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7 | ||
852 : | // padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3 | ||
853 : | // padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5 | ||
854 : | // ;; | ||
855 : | // padd2 loc40 = loc40, r24 // add r24 to correct rounding | ||
856 : | // padd2 loc41 = loc41, r24 | ||
857 : | // padd2 loc42 = loc42, r24 | ||
858 : | // padd2 loc43 = loc43, r24 | ||
859 : | // padd2 loc44 = loc44, r24 | ||
860 : | // padd2 loc45 = loc45, r24 | ||
861 : | // padd2 loc46 = loc46, r24 | ||
862 : | // padd2 loc47 = loc47, r24 | ||
863 : | // padd2 loc48 = loc48, r24 | ||
864 : | // padd2 loc49 = loc49, r24 | ||
865 : | // padd2 loc50 = loc50, r24 | ||
866 : | // padd2 loc51 = loc51, r24 | ||
867 : | // padd2 loc52 = loc52, r24 | ||
868 : | // padd2 loc53 = loc53, r24 | ||
869 : | // padd2 loc54 = loc54, r24 | ||
870 : | // padd2 loc55 = loc55, r24 | ||
871 : | // ;; | ||
872 : | // pshr2 loc40 = loc40, 2 // Divide all matrix elements by 4 | ||
873 : | // pshr2 loc41 = loc41, 2 | ||
874 : | // pshr2 loc42 = loc42, 2 | ||
875 : | // pshr2 loc43 = loc43, 2 | ||
876 : | // pshr2 loc44 = loc44, 2 | ||
877 : | // pshr2 loc45 = loc45, 2 | ||
878 : | // pshr2 loc46 = loc46, 2 | ||
879 : | // pshr2 loc47 = loc47, 2 | ||
880 : | // pshr2 loc48 = loc48, 2 | ||
881 : | // pshr2 loc49 = loc49, 2 | ||
882 : | // pshr2 loc50 = loc50, 2 | ||
883 : | // pshr2 loc51 = loc51, 2 | ||
884 : | // pshr2 loc52 = loc52, 2 | ||
885 : | // pshr2 loc53 = loc53, 2 | ||
886 : | // pshr2 loc54 = loc54, 2 | ||
887 : | // pshr2 loc55 = loc55, 2 | ||
888 : | // ;; | ||
889 : | // | ||
890 : | // // ***************** | ||
891 : | // // Transpose matrix | ||
892 : | // // ***************** | ||
893 : | // | ||
894 : | // mix2.r loc32 = loc48, loc49 // tmp0 = mixr y0, y1 | ||
895 : | // mix2.l loc33 = loc48, loc49 // tmp1 = mixl y0, y1 | ||
896 : | // mix2.r loc34 = loc50, loc51 // tmp2 = mixr y2, y3 | ||
897 : | // mix2.l loc35 = loc50, loc51 // tmp3 = mixl y2, y3 | ||
898 : | // ;; | ||
899 : | // mix4.r loc16 = loc32, loc34 // x0 = mixr tmp0, tmp2 | ||
900 : | // mix4.r loc17 = loc33, loc35 // x1 = mixr tmp1, tmp3 | ||
901 : | // mix4.l loc18 = loc32, loc34 // x2 = mixl tmp0, tmp2 | ||
902 : | // mix4.l loc19 = loc33, loc35 // x3 = mixl tmp1, tmp3 | ||
903 : | // ;; | ||
904 : | // mix2.r loc32 = loc40, loc41 // tmp0 = mixr y0.2, y1.2 | ||
905 : | // mix2.l loc33 = loc40, loc41 // tmp1 = mixl y0.2, y1.2 | ||
906 : | // mix2.r loc34 = loc42, loc43 // tmp2 = mixr y2.2, y3.2 | ||
907 : | // mix2.l loc35 = loc42, loc43 // tmp3 = mixl y2.2, y3.2 | ||
908 : | // ;; | ||
909 : | // mix4.r loc20 = loc32, loc34 // x4 = mixr tmp0, tmp2 | ||
910 : | // mix4.r loc21 = loc33, loc35 // x5 = mixr tmp1, tmp3 | ||
911 : | // mix4.l loc22 = loc32, loc34 // x6 = mixl tmp0, tmp2 | ||
912 : | // mix4.l loc23 = loc33, loc35 // x7 = mixl tmp1, tmp3 | ||
913 : | // ;; | ||
914 : | // mix2.r loc32 = loc52, loc53 // tmp0 = mixr y4, y5 | ||
915 : | // mix2.l loc33 = loc52, loc53 // tmp1 = mixl y4, y5 | ||
916 : | // mix2.r loc34 = loc54, loc55 // tmp2 = mixr y6, y7 | ||
917 : | // mix2.l loc35 = loc54, loc55 // tmp3 = mixl y6, y7 | ||
918 : | // ;; | ||
919 : | // mix4.r loc24 = loc32, loc34 // x0.2 = mixr tmp0, tmp2 | ||
920 : | // mix4.r loc25 = loc33, loc35 // x1.2 = mixr tmp1, tmp3 | ||
921 : | // mix4.l loc26 = loc32, loc34 // x2.2 = mixl tmp0, tmp2 | ||
922 : | // mix4.l loc27 = loc33, loc35 // x3.2 = mixl tmp1, tmp3 | ||
923 : | // ;; | ||
924 : | // mix2.r loc32 = loc44, loc45 // tmp0 = mixr y4.2, y5.2 | ||
925 : | // mix2.l loc33 = loc44, loc45 // tmp1 = mixl y4.2, y5.2 | ||
926 : | // mix2.r loc34 = loc46, loc47 // tmp2 = mixr y6.2, y7.2 | ||
927 : | // mix2.l loc35 = loc46, loc47 // tmp3 = mixl y6.2, y7.2 | ||
928 : | // ;; | ||
929 : | // mix4.r loc28 = loc32, loc34 // x4.2 = mixr tmp0, tmp2 | ||
930 : | // mix4.r loc29 = loc33, loc35 // x5.2 = mixr tmp1, tmp3 | ||
931 : | // mix4.l loc30 = loc32, loc34 // x6.2 = mixl tmp0, tmp2 | ||
932 : | // mix4.l loc31 = loc33, loc35 // x7.2 = mixl tmp1, tmp3 | ||
933 : | // ;; | ||
934 : | // | ||
935 : | // // ******************* | ||
936 : | // // row-DCT 1st half | ||
937 : | // // ******************* | ||
938 : | // | ||
939 : | // padd2 loc32 = loc16, loc23 // t0 = x0 + x7 | ||
940 : | // padd2 loc33 = loc17, loc22 // t1 = x1 + x6 | ||
941 : | // padd2 loc34 = loc18, loc21 // t2 = x2 + x5 | ||
942 : | // padd2 loc35 = loc19, loc20 // t3 = x3 + x4 | ||
943 : | // psub2 loc36 = loc16, loc23 // t4 = x0 - x7 | ||
944 : | // psub2 loc37 = loc17, loc22 // t5 = x1 - x6 | ||
945 : | // psub2 loc38 = loc18, loc21 // t6 = x2 - x5 | ||
946 : | // psub2 loc39 = loc19, loc20 // t7 = x3 - x4 | ||
947 : | // ;; | ||
948 : | // padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 | ||
949 : | // psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 | ||
950 : | // ;; | ||
951 : | // pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 | ||
952 : | // pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 | ||
953 : | // ;; | ||
954 : | // padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 | ||
955 : | // padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 | ||
956 : | // ;; | ||
957 : | // padd2 loc16 = loc32, loc35 // x0 = t0 + t3 | ||
958 : | // padd2 loc17 = loc33, loc34 // x1 = t1 + t2 | ||
959 : | // psub2 loc18 = loc32, loc35 // x2 = t0 - t3 | ||
960 : | // psub2 loc19 = loc33, loc34 // x3 = t1 - t2 | ||
961 : | // padd2 loc20 = loc36, loc37 // x4 = t4 + t5 | ||
962 : | // padd2 loc21 = loc38, loc39 // x5 = t6 + t7 | ||
963 : | // psub2 loc22 = loc36, loc37 // x6 = t4 - t5 | ||
964 : | // psub2 loc23 = loc38, loc39 // x7 = t6 - t7 | ||
965 : | // ;; | ||
966 : | // padd2 loc32 = loc16, loc17 // t0 = x0 + x1 | ||
967 : | // psub2 loc33 = loc16, loc17 // t1 = x0 - x1 | ||
968 : | // pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 | ||
969 : | // pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 | ||
970 : | // pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 | ||
971 : | // pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 | ||
972 : | // pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 | ||
973 : | // pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 | ||
974 : | // ;; | ||
975 : | // padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 | ||
976 : | // padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 | ||
977 : | // ;; | ||
978 : | // padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) | ||
979 : | // psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 | ||
980 : | // padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) | ||
981 : | // psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 | ||
982 : | // padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 (x7 * c2 + x7) | ||
983 : | // psub2 loc39 = loc46, loc23 // t7 = buf6 (x6 * c2 + x6) - x7 | ||
984 : | // ;; | ||
985 : | // pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 | ||
986 : | // pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 | ||
987 : | // pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 | ||
988 : | // pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 | ||
989 : | // pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 | ||
990 : | // pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 | ||
991 : | // pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 | ||
992 : | // pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 | ||
993 : | // ;; | ||
994 : | // padd2 loc48 = loc16, loc32 // y0 = x0 + t0 | ||
995 : | // padd2 loc49 = loc20, loc36 // y1 = x4 + t4 | ||
996 : | // padd2 loc50 = loc18, loc34 // y2 = x2 + t2 | ||
997 : | // padd2 loc51 = loc22, loc38 // y3 = x6 + t6 | ||
998 : | // padd2 loc52 = loc17, loc33 // y4 = x1 + t1 | ||
999 : | // padd2 loc53 = loc23, loc39 // y5 = x7 + t7 | ||
1000 : | // padd2 loc54 = loc19, loc35 // y6 = x3 + t3 | ||
1001 : | // padd2 loc55 = loc21, loc37 // y7 = x5 + t5 | ||
1002 : | // ;; | ||
1003 : | // | ||
1004 : | // // ******************* | ||
1005 : | // // row-DCT 2nd half | ||
1006 : | // // ******************* | ||
1007 : | // | ||
1008 : | // padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2 | ||
1009 : | // padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2 | ||
1010 : | // padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2 | ||
1011 : | // padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2 | ||
1012 : | // psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2 | ||
1013 : | // psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2 | ||
1014 : | // psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2 | ||
1015 : | // psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2 | ||
1016 : | // ;; | ||
1017 : | // padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 | ||
1018 : | // psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 | ||
1019 : | // ;; | ||
1020 : | // pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 | ||
1021 : | // pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 | ||
1022 : | // ;; | ||
1023 : | // padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 | ||
1024 : | // padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 | ||
1025 : | // ;; | ||
1026 : | // padd2 loc16 = loc32, loc35 // x0 = t0 + t3 | ||
1027 : | // padd2 loc17 = loc33, loc34 // x1 = t1 + t2 | ||
1028 : | // psub2 loc18 = loc32, loc35 // x2 = t0 - t3 | ||
1029 : | // psub2 loc19 = loc33, loc34 // x3 = t1 - t2 | ||
1030 : | // padd2 loc20 = loc36, loc37 // x4 = t4 + t5 | ||
1031 : | // padd2 loc21 = loc38, loc39 // x5 = t6 + t7 | ||
1032 : | // psub2 loc22 = loc36, loc37 // x6 = t4 - t5 | ||
1033 : | // psub2 loc23 = loc38, loc39 // x7 = t6 - t7 | ||
1034 : | // ;; | ||
1035 : | // padd2 loc32 = loc16, loc17 // t0 = x0 + x1 | ||
1036 : | // psub2 loc33 = loc16, loc17 // t1 = x0 - x1 | ||
1037 : | // pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 | ||
1038 : | // pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 | ||
1039 : | // pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 | ||
1040 : | // pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 | ||
1041 : | // pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 | ||
1042 : | // pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 | ||
1043 : | // ;; | ||
1044 : | // padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 | ||
1045 : | // padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 | ||
1046 : | // ;; | ||
1047 : | // padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) | ||
1048 : | // psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 | ||
1049 : | // padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) | ||
1050 : | // psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 | ||
1051 : | // padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 (x7 * c2 + x7) | ||
1052 : | // psub2 loc39 = loc46, loc23 // t7 = buf6 (x6 * c2 + x6) - x7 | ||
1053 : | // ;; | ||
1054 : | // pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 | ||
1055 : | // pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 | ||
1056 : | // pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 | ||
1057 : | // pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 | ||
1058 : | // pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 | ||
1059 : | // pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 | ||
1060 : | // pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 | ||
1061 : | // pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 | ||
1062 : | // ;; | ||
1063 : | // padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0 | ||
1064 : | // padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4 | ||
1065 : | // padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2 | ||
1066 : | // padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6 | ||
1067 : | // padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1 | ||
1068 : | // padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7 | ||
1069 : | // padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3 | ||
1070 : | // padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5 | ||
1071 : | // ;; | ||
1072 : | // // ******************* | ||
1073 : | // // Transpose matrix | ||
1074 : | // // ******************* | ||
1075 : | // | ||
1076 : | // mix2.l loc32 = loc49, loc48 // tmp0 = mixr y1, y0 | ||
1077 : | // mix2.r loc33 = loc49, loc48 // tmp1 = mixl y1, y0 | ||
1078 : | // mix2.l loc34 = loc51, loc50 // tmp2 = mixr y3, y2 | ||
1079 : | // mix2.r loc35 = loc51, loc50 // tmp3 = mixl y3, y2 | ||
1080 : | // ;; | ||
1081 : | // mix4.l loc16 = loc34, loc32 // x0 = mixr tmp2, tmp0 | ||
1082 : | // mix4.l loc17 = loc35, loc33 // x1 = mixr tmp3, tmp1 | ||
1083 : | // mix4.r loc18 = loc34, loc32 // x2 = mixl tmp2, tmp0 | ||
1084 : | // mix4.r loc19 = loc35, loc33 // x3 = mixl tmp3, tmp1 | ||
1085 : | // ;; | ||
1086 : | // mix2.l loc32 = loc41, loc40 // tmp0 = mixr y0.2, y1.2 | ||
1087 : | // mix2.r loc33 = loc41, loc40 // tmp1 = mixl y0.2, y1.2 | ||
1088 : | // mix2.l loc34 = loc43, loc42 // tmp2 = mixr y2.2, y3.2 | ||
1089 : | // mix2.r loc35 = loc43, loc42 // tmp3 = mixl y2.2, y3.2 | ||
1090 : | // ;; | ||
1091 : | // mix4.l loc20 = loc34, loc32 // x4 = mixr tmp2, tmp0 | ||
1092 : | // mix4.l loc21 = loc35, loc33 // x5 = mixr tmp3, tmp1 | ||
1093 : | // mix4.r loc22 = loc34, loc32 // x6 = mixl tmp2, tmp0 | ||
1094 : | // mix4.r loc23 = loc35, loc33 // x7 = mixl tmp3, tmp1 | ||
1095 : | // ;; | ||
1096 : | // mix2.l loc32 = loc53, loc52 // tmp0 = mixr y5, y4 | ||
1097 : | // mix2.r loc33 = loc53, loc52 // tmp1 = mixl y5, y4 | ||
1098 : | // mix2.l loc34 = loc55, loc54 // tmp2 = mixr y7, y6 | ||
1099 : | // mix2.r loc35 = loc55, loc54 // tmp3 = mixl y7, y6 | ||
1100 : | // ;; | ||
1101 : | // mix4.l loc24 = loc34, loc32 // x0.2 = mixr tmp2, tmp0 | ||
1102 : | // mix4.l loc25 = loc35, loc33 // x1.2 = mixr tmp3, tmp1 | ||
1103 : | // mix4.r loc26 = loc34, loc32 // x2.2 = mixl tmp2, tmp0 | ||
1104 : | // mix4.r loc27 = loc35, loc33 // x3.2 = mixl tmp3, tmp1 | ||
1105 : | // ;; | ||
1106 : | // mix2.l loc32 = loc45, loc44 // tmp0 = mixl y5.2, y4.2 | ||
1107 : | // mix2.r loc33 = loc45, loc44 // tmp1 = mixr y5.2, y4.2 | ||
1108 : | // mix2.l loc34 = loc47, loc46 // tmp2 = mixl y7.2, y6.2 | ||
1109 : | // mix2.r loc35 = loc47, loc46 // tmp3 = mixr y7.2, y6.2 | ||
1110 : | // ;; | ||
1111 : | // mix4.l loc28 = loc34, loc32 // x4.2 = mixr tmp2, tmp0 | ||
1112 : | // mix4.l loc29 = loc35, loc33 // x5.2 = mixr tmp3, tmp1 | ||
1113 : | // mix4.r loc30 = loc34, loc32 // x6.2 = mixl tmp2, tmp0 | ||
1114 : | // mix4.r loc31 = loc35, loc33 // x7.2 = mixl tmp3, tmp1 | ||
1115 : | // ;; | ||
1116 : | // | ||
1117 : | // // ******** | ||
1118 : | // // descale | ||
1119 : | // // ******** | ||
1120 : | // | ||
1121 : | // padd2 loc16 = loc16, r23 | ||
1122 : | // padd2 loc17 = loc17, r23 | ||
1123 : | // padd2 loc18 = loc18, r23 | ||
1124 : | // padd2 loc19 = loc19, r23 | ||
1125 : | // padd2 loc20 = loc20, r23 | ||
1126 : | // padd2 loc21 = loc21, r23 | ||
1127 : | // padd2 loc22 = loc22, r23 | ||
1128 : | // padd2 loc23 = loc23, r23 | ||
1129 : | // padd2 loc24 = loc24, r23 | ||
1130 : | // padd2 loc25 = loc25, r23 | ||
1131 : | // padd2 loc26 = loc26, r23 | ||
1132 : | // padd2 loc27 = loc27, r23 | ||
1133 : | // padd2 loc28 = loc28, r23 | ||
1134 : | // padd2 loc29 = loc29, r23 | ||
1135 : | // padd2 loc30 = loc30, r23 | ||
1136 : | // padd2 loc31 = loc31, r23 | ||
1137 : | // ;; | ||
1138 : | // pshr2 loc16 = loc16, 3 | ||
1139 : | // pshr2 loc17 = loc17, 3 | ||
1140 : | // pshr2 loc18 = loc18, 3 | ||
1141 : | // pshr2 loc19 = loc19, 3 | ||
1142 : | // pshr2 loc20 = loc20, 3 | ||
1143 : | // pshr2 loc21 = loc21, 3 | ||
1144 : | // pshr2 loc22 = loc22, 3 | ||
1145 : | // pshr2 loc23 = loc23, 3 | ||
1146 : | // pshr2 loc24 = loc24, 3 | ||
1147 : | // pshr2 loc25 = loc25, 3 | ||
1148 : | // pshr2 loc26 = loc26, 3 | ||
1149 : | // pshr2 loc27 = loc27, 3 | ||
1150 : | // pshr2 loc28 = loc28, 3 | ||
1151 : | // pshr2 loc29 = loc29, 3 | ||
1152 : | // pshr2 loc30 = loc30, 3 | ||
1153 : | // pshr2 loc31 = loc31, 3 | ||
1154 : | // ;; | ||
1155 : | // // ************ | ||
1156 : | // // Store Matrix | ||
1157 : | // // ************ | ||
1158 : | // st8 [loc0] = loc16 | ||
1159 : | // st8 [loc1] = loc24 | ||
1160 : | // st8 [loc2] = loc17 | ||
1161 : | // st8 [loc3] = loc25 | ||
1162 : | // st8 [loc4] = loc18 | ||
1163 : | // st8 [loc5] = loc26 | ||
1164 : | // st8 [loc6] = loc19 | ||
1165 : | // st8 [loc7] = loc27 | ||
1166 : | // st8 [loc8] = loc20 | ||
1167 : | // st8 [loc9] = loc28 | ||
1168 : | // st8 [loc10] = loc21 | ||
1169 : | // st8 [loc11] = loc29 | ||
1170 : | // st8 [loc12] = loc22 | ||
1171 : | // st8 [loc13] = loc30 | ||
1172 : | // st8 [loc14] = loc23 | ||
1173 : | // st8 [loc15] = loc31 | ||
1174 : | // | ||
1175 : | // mov ar.pfs = r14 | ||
1176 : | // br.ret.sptk.many b0 | ||
1177 : | // .endp fdct_ia64# | ||
1178 : | // .common fdct#,8,8 | ||
1179 : | // |
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |