1 |
// **************************************************************************** |
2 |
// * |
3 |
// * XVID MPEG-4 VIDEO CODEC |
4 |
// * - IA64 forward discrete cosine transform - |
5 |
// * |
6 |
// * Copyright(C) 2002 Stephan Krause, Ingo-Marc Weber, Daniel Kallfass |
7 |
// * |
8 |
// * This program is free software; you can redistribute it and/or modify it |
9 |
// * under the terms of the GNU General Public License as published by |
10 |
// * the Free Software Foundation; either version 2 of the License, or |
11 |
// * (at your option) any later version. |
12 |
// * |
13 |
// * This program is distributed in the hope that it will be useful, |
14 |
// * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
// * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 |
// * GNU General Public License for more details. |
17 |
// * |
18 |
// * You should have received a copy of the GNU General Public License |
19 |
// * along with this program; if not, write to the Free Software |
20 |
// * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 |
// * |
22 |
// * $Id: fdct_ia64.s,v 1.6 2009-02-19 17:07:29 Isibaar Exp $ |
23 |
// * |
24 |
// ***************************************************************************/ |
25 |
// |
26 |
// **************************************************************************** |
27 |
// * |
28 |
// * fdct_ia64.s, IA-64 optimized forward DCT |
29 |
// * |
30 |
// * Completed version provided by Intel at AppNote AP-922 |
31 |
// * http://developer.intel.com/software/products/college/ia32/strmsimd/ |
32 |
// * Copyright (C) 1999 Intel Corporation, |
33 |
// * |
34 |
// * This version was implemented during an IA-64 practical training at |
35 |
// * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/) |
36 |
// * |
37 |
// ***************************************************************************** |
38 |
// |
39 |
// ***************************************************************************** |
40 |
// * |
41 |
// * Revision history: |
42 |
// * |
43 |
// * 24.07.2002 Initial Version |
44 |
// * |
45 |
// ***************************************************************************** |
46 |
|
47 |
|
48 |
// This is a fast precise implementation of 8x8 Discrete Cosine Transform |
49 |
// published in Intel Application Note 922 from 1999 and optimized for IA-64. |
50 |
// |
51 |
// An unoptimized, straightforward version can be found at the end of this file. |
52 |
|
53 |
|
54 |
// ---------------------------------------------------------------------------
// fdct_ia64: forward DCT computed in place on the 128 bytes addressed by r32
// (sixteen 8-byte words; every packed *2 instruction below treats a word as
// four 16-bit fields).
//
// Steps, in the order they appear below:
//   1. build the constants and copy each into all four 16-bit fields
//   2. load the sixteen words and shift every field left by 3
//   3. first 1-D pass over loc16-loc23, then over loc24-loc31
//   4. add 2 / shift right by 2 on every field, then transpose
//   5. second 1-D pass over both halves, then transpose again
//   6. add 4 / shift right by 3 on every field, store over the input
//
// Register roles:
//   r14          copy of ar.pfs made by alloc, restored before returning
//   r23, r24     rounding terms 4 and 2
//   r25-r28      g1, g2, g3, g4
//   r29-r31      c2, c1, c0
//   loc0-loc15   addresses of the sixteen words
//   loc16-loc31  data words (x0..x7 in loc16-23, x0.2..x7.2 in loc24-31)
//   loc32-loc39  t0..t7, reused as tmp0..tmp3 while transposing
//   loc40-loc47  buf0..buf7, reused for the second-half results y0.2..y7.2
//   loc48-loc55  first-half results y0..y7
//
// The instruction order and the ';;' stops are hand scheduled: independent
// work from neighbouring steps is interleaved on purpose.
// NOTE(review): the gutter numbers ("NN |") and trailing " |" on the lines
// below look like extraction residue rather than assembler input -- confirm
// against the upstream file before assembling.
// ---------------------------------------------------------------------------
.pred.safe_across_calls p1-p5,p16-p63 |
55 |
.text |
56 |
.align 16 |
57 |
.global fdct_ia64# |
58 |
.proc fdct_ia64# |
59 |
fdct_ia64: |
60 |
// NOTE(review): .prologue is opened, but no .save or .body directive follows
// anywhere in this procedure -- confirm the unwind annotations are intended.
.prologue |
61 |
// 1 input (r32), 56 locals (loc0-loc55), no outputs, no rotating registers.
alloc r14 = ar.pfs, 1, 56, 0, 0 |
62 |
// Save constants: each is the named value scaled by 2^16, matching the
// shift count of 16 used by every pmpyshr2 below.
// g1-g4 and c2 have bit 15 set, so the signed pmpyshr2 sees (value - 2^16);
// that is why each product with them is followed by adding the multiplied
// operand back. c0 and c1 are below 0x8000 and need no such correction.
// Save constants |
63 |
mov r31 = 0x32ec // c0 = tan(1pi/16) |
64 |
mov r30 = 0x6a0a // c1 = tan(2pi/16) |
65 |
mov r29 = 0xab0e // c2 = tan(3pi/16) |
66 |
mov r28 = 0xb505 // g4 = cos(4pi/16) |
67 |
mov r27 = 0xd4db // g3 = cos(3pi/16) |
68 |
mov r26 = 0xec83 // g2 = cos(2pi/16) |
69 |
mov r25 = 0xfb15 // g1 = cos(1pi/16) |
70 |
mov r24 = 0x0002 // correction bit for descaling |
71 |
mov r23 = 0x0004 // correction bit for descaling |
72 |

73 |
// Load Matrix into registers |
74 |

75 |
// Addresses of the sixteen words: loc0,2,..,14 = r32 + 0,16,..,112 and
// loc1,3,..,15 = r32 + 8,24,..,120.
// (presumably the left and right four values of each row of a row-major
// 8x8 block of 16-bit values -- confirm against the caller)
add loc0 = r0, r32 |
76 |
add loc2 = 16, r32 |
77 |
add loc4 = 32, r32 |
78 |
add loc6 = 48, r32 |
79 |
add loc8 = 64, r32 |
80 |
add loc10 = 80, r32 |
81 |
add loc12 = 96, r32 |
82 |
add loc14 = 112, r32 |
83 |
add loc1 = 8, r32 |
84 |
add loc3 = 24, r32 |
85 |
add loc5 = 40, r32 |
86 |
add loc7 = 56, r32 |
87 |
add loc9 = 72, r32 |
88 |
add loc11 = 88, r32 |
89 |
add loc13 = 104, r32 |
90 |
add loc15 = 120, r32 |
91 |
;; |
92 |
// Loads: loc16-loc23 take the words at offsets 0,16,..,112 and loc24-loc31
// the words at offsets 8,24,..,120. The interleaved "mux2 rN = rN, 0x00"
// lines copy the low 16 bits of each constant into all four 16-bit fields.
ld8 loc16 = [loc0] |
93 |
ld8 loc17 = [loc2] |
94 |
ld8 loc18 = [loc4] |
95 |
ld8 loc19 = [loc6] |
96 |
ld8 loc20 = [loc8] |
97 |
ld8 loc21 = [loc10] |
98 |
ld8 loc22 = [loc12] |
99 |
ld8 loc23 = [loc14] |
100 |
ld8 loc24 = [loc1] |
101 |
ld8 loc25 = [loc3] |
102 |
ld8 loc26 = [loc5] |
103 |
ld8 loc27 = [loc7] |
104 |
mux2 r26 = r26, 0x00 |
105 |
ld8 loc28 = [loc9] |
106 |
mux2 r31 = r31, 0x00 |
107 |
mux2 r25 = r25, 0x00 |
108 |
ld8 loc29 = [loc11] |
109 |
mux2 r30 = r30, 0x00 |
110 |
mux2 r29 = r29, 0x00 |
111 |
ld8 loc30 = [loc13] |
112 |
mux2 r28 = r28, 0x00 |
113 |
mux2 r27 = r27, 0x00 |
114 |
ld8 loc31 = [loc15] |
115 |
mux2 r24 = r24, 0x00 |
116 |
mux2 r23 = r23, 0x00 |
117 |
;; |
118 |
// Pre-scale: every 16-bit field is shifted left by 3. The shifts for
// loc24-loc31 are spread through the start of the first pass below.
pshl2 loc16 = loc16, 3 |
119 |
pshl2 loc17 = loc17, 3 |
120 |
pshl2 loc18 = loc18, 3 |
121 |
pshl2 loc19 = loc19, 3 |
122 |
pshl2 loc20 = loc20, 3 |
123 |
pshl2 loc21 = loc21, 3 |
124 |
pshl2 loc22 = loc22, 3 |
125 |
pshl2 loc23 = loc23, 3 |
126 |
;; |
127 |
pshl2 loc24 = loc24, 3 |
128 |

129 |
// ******************* |
130 |
// column-DCT 1st half (inputs loc16-loc23, results y0..y7 in loc48-loc55) |
131 |
// ******************* |
132 |

133 |
psub2 loc37 = loc17, loc22 // t5 = x1 - x6 |
134 |
pshl2 loc25 = loc25, 3 |
135 |
pshl2 loc26 = loc26, 3 |
136 |
psub2 loc38 = loc18, loc21 // t6 = x2 - x5 |
137 |
pshl2 loc27 = loc27, 3 |
138 |
pshl2 loc28 = loc28, 3 |
139 |
;; |
140 |
padd2 loc32 = loc16, loc23 // t0 = x0 + x7 |
141 |
pshl2 loc29 = loc29, 3 |
142 |
pshl2 loc30 = loc30, 3 |
143 |
padd2 loc33 = loc17, loc22 // t1 = x1 + x6 |
144 |
padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 |
145 |
psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 |
146 |
;; |
147 |
padd2 loc34 = loc18, loc21 // t2 = x2 + x5 |
148 |
pshl2 loc31 = loc31, 3 |
149 |
padd2 loc35 = loc19, loc20 // t3 = x3 + x4 |
150 |
psub2 loc36 = loc16, loc23 // t4 = x0 - x7 |
151 |
pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 |
152 |
pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 |
153 |
;; |
154 |
psub2 loc39 = loc19, loc20 // t7 = x3 - x4 |
155 |
padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 (add-back for g4) |
156 |
padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 (add-back for g4) |
157 |

158 |
padd2 loc16 = loc32, loc35 // x0 = t0 + t3 |
159 |
padd2 loc17 = loc33, loc34 // x1 = t1 + t2 |
160 |
psub2 loc18 = loc32, loc35 // x2 = t0 - t3 |
161 |
;; |
162 |
psub2 loc19 = loc33, loc34 // x3 = t1 - t2 |
163 |
padd2 loc20 = loc36, loc37 // x4 = t4 + t5 |
164 |
padd2 loc21 = loc38, loc39 // x5 = t6 + t7 |
165 |
psub2 loc22 = loc36, loc37 // x6 = t4 - t5 |
166 |
psub2 loc23 = loc38, loc39 // x7 = t6 - t7 |
167 |
;; |
168 |
pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 |
169 |
padd2 loc32 = loc16, loc17 // t0 = x0 + x1 |
170 |
pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 |
171 |
pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 |
172 |
psub2 loc33 = loc16, loc17 // t1 = x0 - x1 |
173 |
pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 |
174 |
pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 |
175 |
;; |
176 |
padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 (add-back for c2) |
177 |
pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 |
178 |
padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 (add-back for c2) |
179 |
padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) |
180 |
psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 |
181 |
psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 |
182 |
;; |
183 |
padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) |
184 |
padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 |
185 |
pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 |
186 |
pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 |
187 |
psub2 loc39 = loc46, loc23 // t7 = buf6 - x7 |
188 |
;; |
189 |
padd2 loc48 = loc16, loc32 // y0 = x0 + t0 |
190 |
pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 |
191 |
pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 |
192 |
padd2 loc52 = loc17, loc33 // y4 = x1 + t1 |
193 |
pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 |
194 |
pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 |
195 |
;; |
196 |
padd2 loc50 = loc18, loc34 // y2 = x2 + t2 |
197 |
pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 |
198 |
pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 |
199 |
padd2 loc55 = loc21, loc37 // y7 = x5 + t5 |
200 |
padd2 loc49 = loc20, loc36 // y1 = x4 + t4 |
201 |
padd2 loc54 = loc19, loc35 // y6 = x3 + t3 |
202 |
;; |
203 |
padd2 loc51 = loc22, loc38 // y3 = x6 + t6 |
204 |
padd2 loc53 = loc23, loc39 // y5 = x7 + t7 |
205 |

206 |
//divide by 4: add r24 (2 in every field), then shift each field right by 2 |
207 |

208 |
padd2 loc48 = loc48, r24 |
209 |
padd2 loc49 = loc49, r24 |
210 |
padd2 loc50 = loc50, r24 |
211 |
padd2 loc52 = loc52, r24 |
212 |
;; |
213 |
padd2 loc51 = loc51, r24 |
214 |
pshr2 loc48 = loc48, 2 |
215 |
padd2 loc53 = loc53, r24 |
216 |
pshr2 loc49 = loc49, 2 |
217 |
padd2 loc54 = loc54, r24 |
218 |
pshr2 loc50 = loc50, 2 |
219 |
padd2 loc55 = loc55, r24 |
220 |
pshr2 loc52 = loc52, 2 |
221 |
;; |
222 |
pshr2 loc51 = loc51, 2 |
223 |
pshr2 loc53 = loc53, 2 |
224 |
pshr2 loc54 = loc54, 2 |
225 |
pshr2 loc55 = loc55, 2 |
226 |

227 |

228 |
// ******************* |
229 |
// column-DCT 2nd half (inputs loc24-loc31, results y0.2..y7.2 in loc40-loc47) |
230 |
// ******************* |
231 |

232 |
psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2 |
233 |
psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2 |
234 |
padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2 |
235 |
padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2 |
236 |
;; |
237 |
padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2 |
238 |
psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 |
239 |
padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 |
240 |
padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2 |
241 |
;; |
242 |
psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2 |
243 |
pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 |
244 |
pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 |
245 |
;; |
246 |
psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2 |
247 |
padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 (add-back for g4) |
248 |
padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 (add-back for g4) |
249 |

250 |
padd2 loc16 = loc32, loc35 // x0 = t0 + t3 |
251 |
padd2 loc17 = loc33, loc34 // x1 = t1 + t2 |
252 |
psub2 loc18 = loc32, loc35 // x2 = t0 - t3 |
253 |
;; |
254 |
psub2 loc19 = loc33, loc34 // x3 = t1 - t2 |
255 |
padd2 loc20 = loc36, loc37 // x4 = t4 + t5 |
256 |
padd2 loc21 = loc38, loc39 // x5 = t6 + t7 |
257 |
psub2 loc22 = loc36, loc37 // x6 = t4 - t5 |
258 |
psub2 loc23 = loc38, loc39 // x7 = t6 - t7 |
259 |
;; |
260 |
pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 |
261 |
padd2 loc32 = loc16, loc17 // t0 = x0 + x1 |
262 |
pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 |
263 |
pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 |
264 |
psub2 loc33 = loc16, loc17 // t1 = x0 - x1 |
265 |
pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 |
266 |
pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 |
267 |
pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 |
268 |
;; |
269 |
padd2 loc34 = loc18, loc43 // t2 = x2 + buf3 |
270 |
padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 (add-back for c2) |
271 |
padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 (add-back for c2) |
272 |
psub2 loc35 = loc42, loc19 // t3 = buf2 - x3 |
273 |
padd2 loc36 = loc20, loc45 // t4 = x4 + buf5 |
274 |
pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 |
275 |
;; |
276 |
psub2 loc37 = loc44, loc21 // t5 = buf4 - x5 |
277 |
padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 |
278 |
psub2 loc39 = loc46, loc23 // t7 = buf6 - x7 |
279 |
pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 |
280 |
;; |
281 |
padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0 |
282 |
pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 |
283 |
pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 |
284 |
padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1 |
285 |
pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 |
286 |
pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 |
287 |
;; |
288 |
padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2 |
289 |
pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 |
290 |
pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 |
291 |
padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5 |
292 |
padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4 |
293 |
padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3 |
294 |
;; |
295 |
padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6 |
296 |

297 |
// ******************* |
298 |
// transpose matrix (mix2 then mix4; the last second-half add, y5.2, and the |
299 |
// ******************* |
300 |
// add-2 / shift-by-2 rounding of loc40-loc47 are interleaved with it)

301 |
mix2.r loc32 = loc48, loc49 // tmp0 = mixr y0, y1 |
302 |
mix2.l loc33 = loc48, loc49 // tmp1 = mixl y0, y1 |
303 |
padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7 |
304 |
mix2.r loc34 = loc50, loc51 // tmp2 = mixr y2, y3 |
305 |
mix2.l loc35 = loc50, loc51 // tmp3 = mixl y2, y3 |
306 |
;; |
307 |

308 |
//divide by 4: add r24 (2 in every field), then shift each field right by 2 |
309 |

310 |
padd2 loc40 = loc40, r24 |
311 |
padd2 loc41 = loc41, r24 |
312 |
mix4.r loc16 = loc32, loc34 // x0 = mixr tmp0, tmp2 |
313 |
padd2 loc42 = loc42, r24 |
314 |
padd2 loc43 = loc43, r24 |
315 |
mix4.r loc17 = loc33, loc35 // x1 = mixr tmp1, tmp3 |
316 |
padd2 loc44 = loc44, r24 |
317 |
padd2 loc45 = loc45, r24 |
318 |
mix4.l loc18 = loc32, loc34 // x2 = mixl tmp0, tmp2 |
319 |
padd2 loc46 = loc46, r24 |
320 |
padd2 loc47 = loc47, r24 |
321 |
mix4.l loc19 = loc33, loc35 // x3 = mixl tmp1, tmp3 |
322 |
;; |
323 |
pshr2 loc40 = loc40, 2 |
324 |
pshr2 loc41 = loc41, 2 |
325 |
pshr2 loc42 = loc42, 2 |
326 |
pshr2 loc43 = loc43, 2 |
327 |
mix2.r loc32 = loc52, loc53 // tmp0 = mixr y4, y5 |
328 |
mix2.l loc33 = loc52, loc53 // tmp1 = mixl y4, y5 |
329 |
mix2.r loc34 = loc54, loc55 // tmp2 = mixr y6, y7 |
330 |
mix2.l loc35 = loc54, loc55 // tmp3 = mixl y6, y7 |
331 |
;; |
332 |
pshr2 loc44 = loc44, 2 |
333 |
pshr2 loc45 = loc45, 2 |
334 |
pshr2 loc46 = loc46, 2 |
335 |
pshr2 loc47 = loc47, 2 |
336 |
mix4.r loc24 = loc32, loc34 // x0.2 = mixr tmp0, tmp2 |
337 |
mix4.r loc25 = loc33, loc35 // x1.2 = mixr tmp1, tmp3 |
338 |
mix4.l loc26 = loc32, loc34 // x2.2 = mixl tmp0, tmp2 |
339 |
mix4.l loc27 = loc33, loc35 // x3.2 = mixl tmp1, tmp3 |
340 |
;; |
341 |
mix2.r loc32 = loc40, loc41 // tmp0 = mixr y0.2, y1.2 |
342 |
mix2.l loc33 = loc40, loc41 // tmp1 = mixl y0.2, y1.2 |
343 |
mix2.r loc34 = loc42, loc43 // tmp2 = mixr y2.2, y3.2 |
344 |
mix2.l loc35 = loc42, loc43 // tmp3 = mixl y2.2, y3.2 |
345 |
;; |
346 |
mix4.r loc20 = loc32, loc34 // x4 = mixr tmp0, tmp2 |
347 |
mix4.r loc21 = loc33, loc35 // x5 = mixr tmp1, tmp3 |
348 |
mix4.l loc22 = loc32, loc34 // x6 = mixl tmp0, tmp2 |
349 |
mix4.l loc23 = loc33, loc35 // x7 = mixl tmp1, tmp3 |
350 |
;; |
351 |
mix2.r loc32 = loc44, loc45 // tmp0 = mixr y4.2, y5.2 |
352 |
mix2.l loc33 = loc44, loc45 // tmp1 = mixl y4.2, y5.2 |
353 |
mix2.r loc34 = loc46, loc47 // tmp2 = mixr y6.2, y7.2 |
354 |
mix2.l loc35 = loc46, loc47 // tmp3 = mixl y6.2, y7.2 |
355 |
;; |
356 |
mix4.r loc28 = loc32, loc34 // x4.2 = mixr tmp0, tmp2 |
357 |
mix4.r loc29 = loc33, loc35 // x5.2 = mixr tmp1, tmp3 |
358 |
mix4.l loc30 = loc32, loc34 // x6.2 = mixl tmp0, tmp2 |
359 |
mix4.l loc31 = loc33, loc35 // x7.2 = mixl tmp1, tmp3 |
360 |

361 |
// ******************* |
362 |
// row-DCT 1st half (inputs loc16-loc23, results y0..y7 in loc48-loc55) |
363 |
// ******************* |
364 |

365 |
psub2 loc37 = loc17, loc22 // t5 = x1 - x6 |
366 |
psub2 loc38 = loc18, loc21 // t6 = x2 - x5 |
367 |
;; |
368 |
padd2 loc32 = loc16, loc23 // t0 = x0 + x7 |
369 |
padd2 loc33 = loc17, loc22 // t1 = x1 + x6 |
370 |
padd2 loc34 = loc18, loc21 // t2 = x2 + x5 |
371 |
psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 |
372 |
padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 |
373 |
padd2 loc35 = loc19, loc20 // t3 = x3 + x4 |
374 |
;; |
375 |
psub2 loc36 = loc16, loc23 // t4 = x0 - x7 |
376 |
pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 |
377 |
pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 |
378 |
;; |
379 |
psub2 loc39 = loc19, loc20 // t7 = x3 - x4 |
380 |
padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 (add-back for g4) |
381 |
padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 (add-back for g4) |
382 |

383 |
padd2 loc16 = loc32, loc35 // x0 = t0 + t3 |
384 |
padd2 loc17 = loc33, loc34 // x1 = t1 + t2 |
385 |
psub2 loc18 = loc32, loc35 // x2 = t0 - t3 |
386 |
;; |
387 |
psub2 loc19 = loc33, loc34 // x3 = t1 - t2 |
388 |
padd2 loc20 = loc36, loc37 // x4 = t4 + t5 |
389 |
padd2 loc21 = loc38, loc39 // x5 = t6 + t7 |
390 |
psub2 loc22 = loc36, loc37 // x6 = t4 - t5 |
391 |
psub2 loc23 = loc38, loc39 // x7 = t6 - t7 |
392 |
;; |
393 |
pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 |
394 |
padd2 loc32 = loc16, loc17 // t0 = x0 + x1 |
395 |
pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 |
396 |
pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 |
397 |
psub2 loc33 = loc16, loc17 // t1 = x0 - x1 |
398 |
pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 |
399 |
pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 |
400 |
pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 |
401 |
;; |
402 |
padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 (add-back for c2) |
403 |
padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 (add-back for c2) |
404 |
padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) |
405 |
;; |
406 |
psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 |
407 |
padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) |
408 |
psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 |
409 |
padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 |
410 |
psub2 loc39 = loc46, loc23 // t7 = buf6 - x7 |
411 |
pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 |
412 |
pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 |
413 |
;; |
414 |
padd2 loc48 = loc16, loc32 // y0 = x0 + t0 |
415 |
pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 |
416 |
pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 |
417 |
padd2 loc52 = loc17, loc33 // y4 = x1 + t1 |
418 |
pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 |
419 |
pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 |
420 |
;; |
421 |
padd2 loc50 = loc18, loc34 // y2 = x2 + t2 |
422 |
pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 |
423 |
pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 |
424 |
padd2 loc55 = loc21, loc37 // y7 = x5 + t5 |
425 |
padd2 loc49 = loc20, loc36 // y1 = x4 + t4 |
426 |
padd2 loc54 = loc19, loc35 // y6 = x3 + t3 |
427 |
;; |
428 |
padd2 loc51 = loc22, loc38 // y3 = x6 + t6 |
429 |
padd2 loc53 = loc23, loc39 // y5 = x7 + t7 |
430 |

431 |
// ******************* |
432 |
// row-DCT 2nd half (inputs loc24-loc31, results y0.2..y7.2 in loc40-loc47) |
433 |
// ******************* |
434 |

435 |
psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2 |
436 |
psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2 |
437 |
padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2 |
438 |
padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2 |
439 |
;; |
440 |
padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2 |
441 |
psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 |
442 |
padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 |
443 |
padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2 |
444 |
;; |
445 |
psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2 |
446 |
pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 |
447 |
pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 |
448 |
;; |
449 |
psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2 |
450 |
padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 (add-back for g4) |
451 |
padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 (add-back for g4) |
452 |

453 |
padd2 loc16 = loc32, loc35 // x0 = t0 + t3 |
454 |
padd2 loc17 = loc33, loc34 // x1 = t1 + t2 |
455 |
psub2 loc18 = loc32, loc35 // x2 = t0 - t3 |
456 |
;; |
457 |
psub2 loc19 = loc33, loc34 // x3 = t1 - t2 |
458 |
padd2 loc20 = loc36, loc37 // x4 = t4 + t5 |
459 |
padd2 loc21 = loc38, loc39 // x5 = t6 + t7 |
460 |
psub2 loc22 = loc36, loc37 // x6 = t4 - t5 |
461 |
psub2 loc23 = loc38, loc39 // x7 = t6 - t7 |
462 |
;; |
463 |
pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 |
464 |
padd2 loc32 = loc16, loc17 // t0 = x0 + x1 |
465 |
pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 |
466 |
pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 |
467 |
psub2 loc33 = loc16, loc17 // t1 = x0 - x1 |
468 |
pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 |
469 |
pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 |
470 |
pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 |
471 |
;; |
472 |
padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 (add-back for c2) |
473 |
padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 (add-back for c2) |
474 |
padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) |
475 |
;; |
476 |
psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 |
477 |
padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) |
478 |
psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 |
479 |
padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 |
480 |
psub2 loc39 = loc46, loc23 // t7 = buf6 - x7 |
481 |
pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 |
482 |
pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 |
483 |
;; |
484 |
padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0 |
485 |
pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 |
486 |
pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 |
487 |
padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1 |
488 |
pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 |
489 |
pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 |
490 |
;; |
491 |
padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2 |
492 |
pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 |
493 |
pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 |
494 |
padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3 |
495 |
nop.i 0x0 |
496 |
nop.i 0x0 |
497 |
;; |
498 |

499 |
// ******************* |
500 |
// Transpose matrix (the remaining second-half adds y1.2, y7.2, y3.2, y5.2 |
501 |
// ******************* |
// and the "+ r23" rounding adds of the descale step are interleaved with it)
502 |
padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4 |
503 |
mix2.l loc32 = loc49, loc48 // tmp0 = mixl y1, y0 |
504 |
mix2.r loc33 = loc49, loc48 // tmp1 = mixr y1, y0 |
505 |
padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5 |
506 |
mix2.l loc34 = loc51, loc50 // tmp2 = mixl y3, y2 |
507 |
mix2.r loc35 = loc51, loc50 // tmp3 = mixr y3, y2 |
508 |
;; |
509 |
padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6 |
510 |
mix4.l loc16 = loc34, loc32 // x0 = mixl tmp2, tmp0 |
511 |
mix4.l loc17 = loc35, loc33 // x1 = mixl tmp3, tmp1 |
512 |
padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7 |
513 |
mix4.r loc18 = loc34, loc32 // x2 = mixr tmp2, tmp0 |
514 |
mix4.r loc19 = loc35, loc33 // x3 = mixr tmp3, tmp1 |
515 |
;; |
516 |
padd2 loc16 = loc16, r23 |
517 |
mix2.l loc32 = loc41, loc40 // tmp0 = mixl y1.2, y0.2 |
518 |
mix2.r loc33 = loc41, loc40 // tmp1 = mixr y1.2, y0.2 |
519 |
padd2 loc17 = loc17, r23 |
520 |
mix2.l loc34 = loc43, loc42 // tmp2 = mixl y3.2, y2.2 |
521 |
mix2.r loc35 = loc43, loc42 // tmp3 = mixr y3.2, y2.2 |
522 |
;; |
523 |
padd2 loc18 = loc18, r23 |
524 |
mix4.l loc20 = loc34, loc32 // x4 = mixl tmp2, tmp0 |
525 |
mix4.l loc21 = loc35, loc33 // x5 = mixl tmp3, tmp1 |
526 |
padd2 loc19 = loc19, r23 |
527 |
mix4.r loc22 = loc34, loc32 // x6 = mixr tmp2, tmp0 |
528 |
mix4.r loc23 = loc35, loc33 // x7 = mixr tmp3, tmp1 |
529 |
;; |
530 |
padd2 loc20 = loc20, r23 |
531 |
mix2.l loc32 = loc53, loc52 // tmp0 = mixl y5, y4 |
532 |
mix2.r loc33 = loc53, loc52 // tmp1 = mixr y5, y4 |
533 |
padd2 loc21 = loc21, r23 |
534 |
mix2.l loc34 = loc55, loc54 // tmp2 = mixl y7, y6 |
535 |
mix2.r loc35 = loc55, loc54 // tmp3 = mixr y7, y6 |
536 |
;; |
537 |
padd2 loc22 = loc22, r23 |
538 |
mix4.l loc24 = loc34, loc32 // x0.2 = mixl tmp2, tmp0 |
539 |
mix4.l loc25 = loc35, loc33 // x1.2 = mixl tmp3, tmp1 |
540 |
padd2 loc23 = loc23, r23 |
541 |
mix4.r loc26 = loc34, loc32 // x2.2 = mixr tmp2, tmp0 |
542 |
mix4.r loc27 = loc35, loc33 // x3.2 = mixr tmp3, tmp1 |
543 |
;; |
544 |
padd2 loc24 = loc24, r23 |
545 |
mix2.l loc32 = loc45, loc44 // tmp0 = mixl y5.2, y4.2 |
546 |
mix2.r loc33 = loc45, loc44 // tmp1 = mixr y5.2, y4.2 |
547 |
padd2 loc25 = loc25, r23 |
548 |
mix2.l loc34 = loc47, loc46 // tmp2 = mixl y7.2, y6.2 |
549 |
mix2.r loc35 = loc47, loc46 // tmp3 = mixr y7.2, y6.2 |
550 |
;; |
551 |
padd2 loc26 = loc26, r23 |
552 |
mix4.l loc28 = loc34, loc32 // x4.2 = mixl tmp2, tmp0 |
553 |
mix4.l loc29 = loc35, loc33 // x5.2 = mixl tmp3, tmp1 |
554 |
padd2 loc27 = loc27, r23 |
555 |
mix4.r loc30 = loc34, loc32 // x6.2 = mixr tmp2, tmp0 |
556 |
mix4.r loc31 = loc35, loc33 // x7.2 = mixr tmp3, tmp1 |
557 |
;; |
558 |
// ******************* |
559 |
// Descale: finish adding r23 (4 in every field), then shift each field right by 3 |
560 |
// ******************* |
561 |
padd2 loc28 = loc28, r23 |
562 |
pshr2 loc16 = loc16, 3 |
563 |
pshr2 loc17 = loc17, 3 |
564 |
padd2 loc29 = loc29, r23 |
565 |
pshr2 loc18 = loc18, 3 |
566 |
pshr2 loc19 = loc19, 3 |
567 |
padd2 loc30 = loc30, r23 |
568 |
pshr2 loc20 = loc20, 3 |
569 |
pshr2 loc21 = loc21, 3 |
570 |
padd2 loc31 = loc31, r23 |
571 |
pshr2 loc22 = loc22, 3 |
572 |
pshr2 loc23 = loc23, 3 |
573 |
;; |
574 |
pshr2 loc24 = loc24, 3 |
575 |
pshr2 loc25 = loc25, 3 |
576 |
pshr2 loc26 = loc26, 3 |
577 |
pshr2 loc27 = loc27, 3 |
578 |
pshr2 loc28 = loc28, 3 |
579 |
pshr2 loc29 = loc29, 3 |
580 |
pshr2 loc30 = loc30, 3 |
581 |
pshr2 loc31 = loc31, 3 |
582 |
;; |
583 |
// ******************* |
584 |
// Store matrix: results go back to the addresses computed at entry (in place) |
585 |
// ******************* |
586 |
st8 [loc0] = loc16 |
587 |
st8 [loc1] = loc24 |
588 |
st8 [loc2] = loc17 |
589 |
st8 [loc3] = loc25 |
590 |
st8 [loc4] = loc18 |
591 |
st8 [loc5] = loc26 |
592 |
st8 [loc6] = loc19 |
593 |
st8 [loc7] = loc27 |
594 |
st8 [loc8] = loc20 |
595 |
st8 [loc9] = loc28 |
596 |
st8 [loc10] = loc21 |
597 |
st8 [loc11] = loc29 |
598 |
st8 [loc12] = loc22 |
599 |
st8 [loc13] = loc30 |
600 |
st8 [loc14] = loc23 |
601 |
st8 [loc15] = loc31 |
602 |

603 |
// Restore ar.pfs from the copy alloc placed in r14, then return through b0.
mov ar.pfs = r14 |
604 |
br.ret.sptk.many b0 |
605 |
.endp fdct_ia64# |
606 |
// Declares a common symbol "fdct" (arguments 8,8). Nothing in the procedure
// above refers to it. NOTE(review): purpose not visible here -- confirm it is
// still needed.
.common fdct#,8,8 |
607 |
|
608 |
|
609 |
|
610 |
|
611 |
|
612 |
|
613 |
|
614 |
|
615 |
//*********************************************** |
616 |
//* Here is a version of the DCT implementation * |
617 |
//* unoptimized in terms of command ordering. * |
618 |
//* This version is about 30% slower but * |
619 |
//* easier to understand. * |
620 |
//*********************************************** |
621 |
// |
622 |
// .pred.safe_across_calls p1-p5,p16-p63 |
623 |
//.text |
624 |
// .align 16 |
625 |
// .global fdct_ia64# |
626 |
// .proc fdct_ia64# |
627 |
//fdct_ia64: |
628 |
// .prologue |
629 |
// alloc r14 = ar.pfs, 1, 56, 0, 0 |
630 |
// |
631 |
// // ******************* |
632 |
// // Save constants |
633 |
// // ******************* |
634 |
// mov r31 = 0x32ec // c0 = tan(1pi/16) |
635 |
// mov r30 = 0x6a0a // c1 = tan(2pi/16) |
636 |
// mov r29 = 0xab0e // c2 = tan(3pi/16) |
637 |
// mov r28 = 0xb505 // g4 = cos(4pi/16) |
638 |
// mov r27 = 0xd4db // g3 = cos(3pi/16) |
639 |
// mov r26 = 0xec83 // g2 = cos(2pi/16) |
640 |
// mov r25 = 0xfb15 // g1 = cos(1pi/16) |
641 |
// mov r24 = 0x0002 // correction bit for descaling |
642 |
// mov r23 = 0x0004 // correction bit for descaling |
643 |
// |
644 |
// // ************************** |
645 |
// // Load Matrix into registers |
646 |
// // ************************** |
647 |
// |
648 |
// add loc0 = r0, r32 |
649 |
// ;; |
650 |
// mux2 r31 = r31, 0x00 |
651 |
// mux2 r30 = r30, 0x00 |
652 |
// mux2 r29 = r29, 0x00 |
653 |
// mux2 r28 = r28, 0x00 |
654 |
// mux2 r27 = r27, 0x00 |
655 |
// mux2 r26 = r26, 0x00 |
656 |
// mux2 r25 = r25, 0x00 |
657 |
// mux2 r24 = r24, 0x00 |
658 |
// mux2 r23 = r23, 0x00 |
659 |
// ld8 loc16 = [loc0] |
660 |
// add loc2 = 16, r32 |
661 |
// add loc4 = 32, r32 |
662 |
// add loc6 = 48, r32 |
663 |
// add loc8 = 64, r32 |
664 |
// add loc10 = 80, r32 |
665 |
// ;; |
666 |
// ld8 loc17 = [loc2] |
667 |
// ld8 loc18 = [loc4] |
668 |
// add loc12 = 96, r32 |
669 |
// ld8 loc19 = [loc6] |
670 |
// ld8 loc20 = [loc8] |
671 |
// add loc14 = 112, r32 |
672 |
// ;; |
673 |
// ld8 loc21 = [loc10] |
674 |
// ld8 loc22 = [loc12] |
675 |
// add loc1 = 8, r32 |
676 |
// ld8 loc23 = [loc14] |
677 |
// add loc3 = 24, r32 |
678 |
// add loc5 = 40, r32 |
679 |
// ;; |
680 |
// ld8 loc24 = [loc1] |
681 |
// ld8 loc25 = [loc3] |
682 |
// add loc7 = 56, r32 |
683 |
// ld8 loc26 = [loc5] |
684 |
// add loc9 = 72, r32 |
685 |
// add loc11 = 88, r32 |
686 |
// ;; |
687 |
// ld8 loc27 = [loc7] |
688 |
// ld8 loc28 = [loc9] |
689 |
// add loc13 = 104, r32 |
690 |
// ld8 loc29 = [loc11] |
691 |
// add loc15 = 120, r32 |
692 |
// ;; |
693 |
// ld8 loc30 = [loc13] |
694 |
// ld8 loc31 = [loc15] |
695 |
// ;; |
696 |
// // ****** |
697 |
// // Scale |
698 |
// // ****** |
699 |
// pshl2 loc16 = loc16, 3 |
700 |
// pshl2 loc17 = loc17, 3 |
701 |
// pshl2 loc18 = loc18, 3 |
702 |
// pshl2 loc19 = loc19, 3 |
703 |
// pshl2 loc20 = loc20, 3 |
704 |
// pshl2 loc21 = loc21, 3 |
705 |
// pshl2 loc22 = loc22, 3 |
706 |
// pshl2 loc23 = loc23, 3 |
707 |
// pshl2 loc24 = loc24, 3 |
708 |
// pshl2 loc25 = loc25, 3 |
709 |
// pshl2 loc26 = loc26, 3 |
710 |
// pshl2 loc27 = loc27, 3 |
711 |
// pshl2 loc28 = loc28, 3 |
712 |
// pshl2 loc29 = loc29, 3 |
713 |
// pshl2 loc30 = loc30, 3 |
714 |
// pshl2 loc31 = loc31, 3 |
715 |
// ;; |
716 |
// |
717 |
// // ******************* |
718 |
// // column-DCT 1st half |
719 |
// // ******************* |
720 |
// |
721 |
// padd2 loc32 = loc16, loc23 // t0 = x0 + x7 |
722 |
// padd2 loc33 = loc17, loc22 // t1 = x1 + x6 |
723 |
// padd2 loc34 = loc18, loc21 // t2 = x2 + x5 |
724 |
// padd2 loc35 = loc19, loc20 // t3 = x3 + x4 |
725 |
// psub2 loc36 = loc16, loc23 // t4 = x0 - x7 |
726 |
// psub2 loc37 = loc17, loc22 // t5 = x1 - x6 |
727 |
// psub2 loc38 = loc18, loc21 // t6 = x2 - x5 |
728 |
// psub2 loc39 = loc19, loc20 // t7 = x3 - x4 |
729 |
// ;; |
730 |
// padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 |
731 |
// psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 |
732 |
// ;; |
733 |
// pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 |
734 |
// pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 |
735 |
// ;; |
736 |
// padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 |
737 |
// padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 |
738 |
// ;; |
739 |
// padd2 loc16 = loc32, loc35 // x0 = t0 + t3 |
740 |
// padd2 loc17 = loc33, loc34 // x1 = t1 + t2 |
741 |
// psub2 loc18 = loc32, loc35 // x2 = t0 - t3 |
742 |
// psub2 loc19 = loc33, loc34 // x3 = t1 - t2 |
743 |
// padd2 loc20 = loc36, loc37 // x4 = t4 + t5 |
744 |
// padd2 loc21 = loc38, loc39 // x5 = t6 + t7 |
745 |
// psub2 loc22 = loc36, loc37 // x6 = t4 - t5 |
746 |
// psub2 loc23 = loc38, loc39 // x7 = t6 - t7 |
747 |
// ;; |
748 |
// |
749 |
// padd2 loc32 = loc16, loc17 // t0 = x0 + x1 |
750 |
// psub2 loc33 = loc16, loc17 // t1 = x0 - x1 |
751 |
// pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 |
752 |
// pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 |
753 |
// pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 |
754 |
// pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 |
755 |
// pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 |
756 |
// pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 |
757 |
// ;; |
758 |
// padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 |
759 |
// padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 |
760 |
// ;; |
761 |
// padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) |
762 |
// psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 |
763 |
// padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) |
764 |
// psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 |
765 |
// padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 |
766 |
// psub2 loc39 = loc46, loc23 // t7 = buf6 - x7 |
767 |
// ;; |
768 |
// pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 |
769 |
// pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 |
770 |
// pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 |
771 |
// pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 |
772 |
// pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 |
773 |
// pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 |
774 |
// pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 |
775 |
// pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 |
776 |
// ;; |
777 |
// padd2 loc48 = loc16, loc32 // y0 = x0 + t0 |
778 |
// padd2 loc49 = loc20, loc36 // y1 = x4 + t4 |
779 |
// padd2 loc50 = loc18, loc34 // y2 = x2 + t2 |
780 |
// padd2 loc51 = loc22, loc38 // y3 = x6 + t6 |
781 |
// padd2 loc52 = loc17, loc33 // y4 = x1 + t1 |
782 |
// padd2 loc53 = loc23, loc39 // y5 = x7 + t7 |
783 |
// padd2 loc54 = loc19, loc35 // y6 = x3 + t3 |
784 |
// padd2 loc55 = loc21, loc37 // y7 = x5 + t5 |
785 |
// ;; |
786 |
// |
787 |
// // ******************* |
788 |
// // column-DCT 2nd half |
789 |
// // ******************* |
790 |
// |
791 |
// padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2 |
792 |
// padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2 |
793 |
// padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2 |
794 |
// padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2 |
795 |
// psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2 |
796 |
// psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2 |
797 |
// psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2 |
798 |
// psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2 |
799 |
// ;; |
800 |
// padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 |
801 |
// psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 |
802 |
// ;; |
803 |
// pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 |
804 |
// pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 |
805 |
// ;; |
806 |
// padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 |
806 |
// padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 |
808 |
// ;; |
809 |
// padd2 loc16 = loc32, loc35 // x0 = t0 + t3 |
810 |
// padd2 loc17 = loc33, loc34 // x1 = t1 + t2 |
811 |
// psub2 loc18 = loc32, loc35 // x2 = t0 - t3 |
812 |
// psub2 loc19 = loc33, loc34 // x3 = t1 - t2 |
813 |
// padd2 loc20 = loc36, loc37 // x4 = t4 + t5 |
814 |
// padd2 loc21 = loc38, loc39 // x5 = t6 + t7 |
815 |
// psub2 loc22 = loc36, loc37 // x6 = t4 - t5 |
816 |
// psub2 loc23 = loc38, loc39 // x7 = t6 - t7 |
817 |
// ;; |
818 |
// padd2 loc32 = loc16, loc17 // t0 = x0 + x1 |
819 |
// psub2 loc33 = loc16, loc17 // t1 = x0 - x1 |
820 |
// pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 |
821 |
// pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 |
822 |
// pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 |
823 |
// pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 |
824 |
// pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 |
825 |
// pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 |
826 |
// ;; |
827 |
// padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 |
828 |
// padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 |
829 |
// ;; |
830 |
// padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) |
831 |
// psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 |
832 |
// padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) |
832 |
// psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 |
833 |
// padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 |
834 |
// psub2 loc39 = loc46, loc23 // t7 = buf6 - x7 |
836 |
// ;; |
837 |
// pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 |
838 |
// pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 |
839 |
// pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 |
840 |
// pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 |
841 |
// pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 |
842 |
// pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 |
843 |
// pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 |
844 |
// pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 |
845 |
// ;; |
846 |
// padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0 |
847 |
// padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4 |
848 |
// padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2 |
849 |
// padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6 |
850 |
// padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1 |
851 |
// padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7 |
852 |
// padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3 |
853 |
// padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5 |
854 |
// ;; |
855 |
// padd2 loc40 = loc40, r24 // add r24 to correct rounding |
856 |
// padd2 loc41 = loc41, r24 |
857 |
// padd2 loc42 = loc42, r24 |
858 |
// padd2 loc43 = loc43, r24 |
859 |
// padd2 loc44 = loc44, r24 |
860 |
// padd2 loc45 = loc45, r24 |
861 |
// padd2 loc46 = loc46, r24 |
862 |
// padd2 loc47 = loc47, r24 |
863 |
// padd2 loc48 = loc48, r24 |
864 |
// padd2 loc49 = loc49, r24 |
865 |
// padd2 loc50 = loc50, r24 |
866 |
// padd2 loc51 = loc51, r24 |
867 |
// padd2 loc52 = loc52, r24 |
868 |
// padd2 loc53 = loc53, r24 |
869 |
// padd2 loc54 = loc54, r24 |
870 |
// padd2 loc55 = loc55, r24 |
871 |
// ;; |
872 |
// pshr2 loc40 = loc40, 2 // Divide all matrix elements by 4 |
873 |
// pshr2 loc41 = loc41, 2 |
874 |
// pshr2 loc42 = loc42, 2 |
875 |
// pshr2 loc43 = loc43, 2 |
876 |
// pshr2 loc44 = loc44, 2 |
877 |
// pshr2 loc45 = loc45, 2 |
878 |
// pshr2 loc46 = loc46, 2 |
879 |
// pshr2 loc47 = loc47, 2 |
880 |
// pshr2 loc48 = loc48, 2 |
881 |
// pshr2 loc49 = loc49, 2 |
882 |
// pshr2 loc50 = loc50, 2 |
883 |
// pshr2 loc51 = loc51, 2 |
884 |
// pshr2 loc52 = loc52, 2 |
885 |
// pshr2 loc53 = loc53, 2 |
886 |
// pshr2 loc54 = loc54, 2 |
887 |
// pshr2 loc55 = loc55, 2 |
888 |
// ;; |
889 |
// |
890 |
// // ***************** |
891 |
// // Transpose matrix |
892 |
// // ***************** |
893 |
// |
894 |
// mix2.r loc32 = loc48, loc49 // tmp0 = mixr y0, y1 |
895 |
// mix2.l loc33 = loc48, loc49 // tmp1 = mixl y0, y1 |
896 |
// mix2.r loc34 = loc50, loc51 // tmp2 = mixr y2, y3 |
897 |
// mix2.l loc35 = loc50, loc51 // tmp3 = mixl y2, y3 |
898 |
// ;; |
899 |
// mix4.r loc16 = loc32, loc34 // x0 = mixr tmp0, tmp2 |
900 |
// mix4.r loc17 = loc33, loc35 // x1 = mixr tmp1, tmp3 |
901 |
// mix4.l loc18 = loc32, loc34 // x2 = mixl tmp0, tmp2 |
902 |
// mix4.l loc19 = loc33, loc35 // x3 = mixl tmp1, tmp3 |
903 |
// ;; |
904 |
// mix2.r loc32 = loc40, loc41 // tmp0 = mixr y0.2, y1.2 |
905 |
// mix2.l loc33 = loc40, loc41 // tmp1 = mixl y0.2, y1.2 |
906 |
// mix2.r loc34 = loc42, loc43 // tmp2 = mixr y2.2, y3.2 |
907 |
// mix2.l loc35 = loc42, loc43 // tmp3 = mixl y2.2, y3.2 |
908 |
// ;; |
909 |
// mix4.r loc20 = loc32, loc34 // x4 = mixr tmp0, tmp2 |
910 |
// mix4.r loc21 = loc33, loc35 // x5 = mixr tmp1, tmp3 |
911 |
// mix4.l loc22 = loc32, loc34 // x6 = mixl tmp0, tmp2 |
912 |
// mix4.l loc23 = loc33, loc35 // x7 = mixl tmp1, tmp3 |
913 |
// ;; |
914 |
// mix2.r loc32 = loc52, loc53 // tmp0 = mixr y4, y5 |
915 |
// mix2.l loc33 = loc52, loc53 // tmp1 = mixl y4, y5 |
916 |
// mix2.r loc34 = loc54, loc55 // tmp2 = mixr y6, y7 |
917 |
// mix2.l loc35 = loc54, loc55 // tmp3 = mixl y6, y7 |
918 |
// ;; |
919 |
// mix4.r loc24 = loc32, loc34 // x0.2 = mixr tmp0, tmp2 |
920 |
// mix4.r loc25 = loc33, loc35 // x1.2 = mixr tmp1, tmp3 |
921 |
// mix4.l loc26 = loc32, loc34 // x2.2 = mixl tmp0, tmp2 |
922 |
// mix4.l loc27 = loc33, loc35 // x3.2 = mixl tmp1, tmp3 |
923 |
// ;; |
924 |
// mix2.r loc32 = loc44, loc45 // tmp0 = mixr y4.2, y5.2 |
925 |
// mix2.l loc33 = loc44, loc45 // tmp1 = mixl y4.2, y5.2 |
926 |
// mix2.r loc34 = loc46, loc47 // tmp2 = mixr y6.2, y7.2 |
926 |
// mix2.l loc35 = loc46, loc47 // tmp3 = mixl y6.2, y7.2 |
928 |
// ;; |
929 |
// mix4.r loc28 = loc32, loc34 // x4.2 = mixr tmp0, tmp2 |
930 |
// mix4.r loc29 = loc33, loc35 // x5.2 = mixr tmp1, tmp3 |
931 |
// mix4.l loc30 = loc32, loc34 // x6.2 = mixl tmp0, tmp2 |
932 |
// mix4.l loc31 = loc33, loc35 // x7.2 = mixl tmp1, tmp3 |
933 |
// ;; |
934 |
// |
935 |
// // ******************* |
936 |
// // row-DCT 1st half |
937 |
// // ******************* |
938 |
// |
939 |
// padd2 loc32 = loc16, loc23 // t0 = x0 + x7 |
940 |
// padd2 loc33 = loc17, loc22 // t1 = x1 + x6 |
941 |
// padd2 loc34 = loc18, loc21 // t2 = x2 + x5 |
942 |
// padd2 loc35 = loc19, loc20 // t3 = x3 + x4 |
943 |
// psub2 loc36 = loc16, loc23 // t4 = x0 - x7 |
944 |
// psub2 loc37 = loc17, loc22 // t5 = x1 - x6 |
945 |
// psub2 loc38 = loc18, loc21 // t6 = x2 - x5 |
946 |
// psub2 loc39 = loc19, loc20 // t7 = x3 - x4 |
947 |
// ;; |
948 |
// padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 |
949 |
// psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 |
950 |
// ;; |
951 |
// pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 |
952 |
// pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 |
953 |
// ;; |
954 |
// padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 |
954 |
// padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 |
956 |
// ;; |
957 |
// padd2 loc16 = loc32, loc35 // x0 = t0 + t3 |
958 |
// padd2 loc17 = loc33, loc34 // x1 = t1 + t2 |
959 |
// psub2 loc18 = loc32, loc35 // x2 = t0 - t3 |
960 |
// psub2 loc19 = loc33, loc34 // x3 = t1 - t2 |
961 |
// padd2 loc20 = loc36, loc37 // x4 = t4 + t5 |
962 |
// padd2 loc21 = loc38, loc39 // x5 = t6 + t7 |
963 |
// psub2 loc22 = loc36, loc37 // x6 = t4 - t5 |
964 |
// psub2 loc23 = loc38, loc39 // x7 = t6 - t7 |
965 |
// ;; |
966 |
// padd2 loc32 = loc16, loc17 // t0 = x0 + x1 |
967 |
// psub2 loc33 = loc16, loc17 // t1 = x0 - x1 |
968 |
// pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 |
969 |
// pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 |
970 |
// pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 |
971 |
// pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 |
972 |
// pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 |
973 |
// pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 |
974 |
// ;; |
975 |
// padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 |
976 |
// padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 |
977 |
// ;; |
978 |
// padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) |
979 |
// psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 |
980 |
// padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) |
980 |
// psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 |
981 |
// padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 |
982 |
// psub2 loc39 = loc46, loc23 // t7 = buf6 - x7 |
984 |
// ;; |
985 |
// pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 |
986 |
// pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 |
987 |
// pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 |
988 |
// pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 |
989 |
// pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 |
990 |
// pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 |
991 |
// pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 |
992 |
// pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 |
993 |
// ;; |
994 |
// padd2 loc48 = loc16, loc32 // y0 = x0 + t0 |
995 |
// padd2 loc49 = loc20, loc36 // y1 = x4 + t4 |
996 |
// padd2 loc50 = loc18, loc34 // y2 = x2 + t2 |
997 |
// padd2 loc51 = loc22, loc38 // y3 = x6 + t6 |
998 |
// padd2 loc52 = loc17, loc33 // y4 = x1 + t1 |
999 |
// padd2 loc53 = loc23, loc39 // y5 = x7 + t7 |
1000 |
// padd2 loc54 = loc19, loc35 // y6 = x3 + t3 |
1001 |
// padd2 loc55 = loc21, loc37 // y7 = x5 + t5 |
1002 |
// ;; |
1003 |
// |
1004 |
// // ******************* |
1005 |
// // row-DCT 2nd half |
1006 |
// // ******************* |
1007 |
// |
1008 |
// padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2 |
1009 |
// padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2 |
1010 |
// padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2 |
1011 |
// padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2 |
1012 |
// psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2 |
1013 |
// psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2 |
1014 |
// psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2 |
1015 |
// psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2 |
1016 |
// ;; |
1017 |
// padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 |
1018 |
// psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 |
1019 |
// ;; |
1020 |
// pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 |
1021 |
// pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 |
1022 |
// ;; |
1023 |
// padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 |
1023 |
// padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 |
1025 |
// ;; |
1026 |
// padd2 loc16 = loc32, loc35 // x0 = t0 + t3 |
1027 |
// padd2 loc17 = loc33, loc34 // x1 = t1 + t2 |
1028 |
// psub2 loc18 = loc32, loc35 // x2 = t0 - t3 |
1029 |
// psub2 loc19 = loc33, loc34 // x3 = t1 - t2 |
1030 |
// padd2 loc20 = loc36, loc37 // x4 = t4 + t5 |
1031 |
// padd2 loc21 = loc38, loc39 // x5 = t6 + t7 |
1032 |
// psub2 loc22 = loc36, loc37 // x6 = t4 - t5 |
1033 |
// psub2 loc23 = loc38, loc39 // x7 = t6 - t7 |
1034 |
// ;; |
1035 |
// padd2 loc32 = loc16, loc17 // t0 = x0 + x1 |
1036 |
// psub2 loc33 = loc16, loc17 // t1 = x0 - x1 |
1037 |
// pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 |
1038 |
// pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 |
1039 |
// pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 |
1040 |
// pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 |
1041 |
// pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 |
1042 |
// pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 |
1043 |
// ;; |
1044 |
// padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 |
1045 |
// padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 |
1046 |
// ;; |
1047 |
// padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) |
1048 |
// psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 |
1049 |
// padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) |
1049 |
// psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 |
1050 |
// padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 |
1051 |
// psub2 loc39 = loc46, loc23 // t7 = buf6 - x7 |
1053 |
// ;; |
1054 |
// pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 |
1055 |
// pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 |
1056 |
// pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 |
1057 |
// pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 |
1058 |
// pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 |
1059 |
// pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 |
1060 |
// pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 |
1061 |
// pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 |
1062 |
// ;; |
1063 |
// padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0 |
1064 |
// padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4 |
1065 |
// padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2 |
1066 |
// padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6 |
1067 |
// padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1 |
1068 |
// padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7 |
1069 |
// padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3 |
1070 |
// padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5 |
1071 |
// ;; |
1072 |
// // ******************* |
1073 |
// // Transpose matrix |
1074 |
// // ******************* |
1075 |
// |
1076 |
// mix2.l loc32 = loc49, loc48 // tmp0 = mixr y1, y0 |
1077 |
// mix2.r loc33 = loc49, loc48 // tmp1 = mixl y1, y0 |
1078 |
// mix2.l loc34 = loc51, loc50 // tmp2 = mixr y3, y2 |
1079 |
// mix2.r loc35 = loc51, loc50 // tmp3 = mixl y3, y2 |
1080 |
// ;; |
1081 |
// mix4.l loc16 = loc34, loc32 // x0 = mixr tmp2, tmp0 |
1082 |
// mix4.l loc17 = loc35, loc33 // x1 = mixr tmp3, tmp1 |
1083 |
// mix4.r loc18 = loc34, loc32 // x2 = mixl tmp2, tmp0 |
1084 |
// mix4.r loc19 = loc35, loc33 // x3 = mixl tmp3, tmp1 |
1085 |
// ;; |
1086 |
// mix2.l loc32 = loc41, loc40 // tmp0 = mixr y0.2, y1.2 |
1087 |
// mix2.r loc33 = loc41, loc40 // tmp1 = mixl y0.2, y1.2 |
1088 |
// mix2.l loc34 = loc43, loc42 // tmp2 = mixr y2.2, y3.2 |
1089 |
// mix2.r loc35 = loc43, loc42 // tmp3 = mixl y2.2, y3.2 |
1090 |
// ;; |
1091 |
// mix4.l loc20 = loc34, loc32 // x4 = mixr tmp2, tmp0 |
1092 |
// mix4.l loc21 = loc35, loc33 // x5 = mixr tmp3, tmp1 |
1093 |
// mix4.r loc22 = loc34, loc32 // x6 = mixl tmp2, tmp0 |
1094 |
// mix4.r loc23 = loc35, loc33 // x7 = mixl tmp3, tmp1 |
1095 |
// ;; |
1096 |
// mix2.l loc32 = loc53, loc52 // tmp0 = mixr y5, y4 |
1097 |
// mix2.r loc33 = loc53, loc52 // tmp1 = mixl y5, y4 |
1098 |
// mix2.l loc34 = loc55, loc54 // tmp2 = mixr y7, y6 |
1099 |
// mix2.r loc35 = loc55, loc54 // tmp3 = mixl y7, y6 |
1100 |
// ;; |
1101 |
// mix4.l loc24 = loc34, loc32 // x0.2 = mixr tmp2, tmp0 |
1102 |
// mix4.l loc25 = loc35, loc33 // x1.2 = mixr tmp3, tmp1 |
1103 |
// mix4.r loc26 = loc34, loc32 // x2.2 = mixl tmp2, tmp0 |
1104 |
// mix4.r loc27 = loc35, loc33 // x3.2 = mixl tmp3, tmp1 |
1105 |
// ;; |
1106 |
// mix2.l loc32 = loc45, loc44 // tmp0 = mixr y4.2, y5.2 |
1107 |
// mix2.r loc33 = loc45, loc44 // tmp1 = mixl y4.2, y5.2 |
1108 |
// mix2.l loc34 = loc47, loc46 // tmp2 = mixr y6.2, y7.2 |
1108 |
// mix2.r loc35 = loc47, loc46 // tmp3 = mixl y6.2, y7.2 |
1110 |
// ;; |
1111 |
// mix4.l loc28 = loc34, loc32 // x4.2 = mixr tmp2, tmp0 |
1112 |
// mix4.l loc29 = loc35, loc33 // x5.2 = mixr tmp3, tmp1 |
1113 |
// mix4.r loc30 = loc34, loc32 // x6.2 = mixl tmp2, tmp0 |
1114 |
// mix4.r loc31 = loc35, loc33 // x7.2 = mixl tmp3, tmp1 |
1115 |
// ;; |
1116 |
// |
1117 |
// // ******** |
1118 |
// // descale |
1119 |
// // ******** |
1120 |
// |
1121 |
// padd2 loc16 = loc16, r23 |
1122 |
// padd2 loc17 = loc17, r23 |
1123 |
// padd2 loc18 = loc18, r23 |
1124 |
// padd2 loc19 = loc19, r23 |
1125 |
// padd2 loc20 = loc20, r23 |
1126 |
// padd2 loc21 = loc21, r23 |
1127 |
// padd2 loc22 = loc22, r23 |
1128 |
// padd2 loc23 = loc23, r23 |
1129 |
// padd2 loc24 = loc24, r23 |
1130 |
// padd2 loc25 = loc25, r23 |
1131 |
// padd2 loc26 = loc26, r23 |
1132 |
// padd2 loc27 = loc27, r23 |
1133 |
// padd2 loc28 = loc28, r23 |
1134 |
// padd2 loc29 = loc29, r23 |
1135 |
// padd2 loc30 = loc30, r23 |
1136 |
// padd2 loc31 = loc31, r23 |
1137 |
// ;; |
1138 |
// pshr2 loc16 = loc16, 3 |
1139 |
// pshr2 loc17 = loc17, 3 |
1140 |
// pshr2 loc18 = loc18, 3 |
1141 |
// pshr2 loc19 = loc19, 3 |
1142 |
// pshr2 loc20 = loc20, 3 |
1143 |
// pshr2 loc21 = loc21, 3 |
1144 |
// pshr2 loc22 = loc22, 3 |
1145 |
// pshr2 loc23 = loc23, 3 |
1146 |
// pshr2 loc24 = loc24, 3 |
1147 |
// pshr2 loc25 = loc25, 3 |
1148 |
// pshr2 loc26 = loc26, 3 |
1149 |
// pshr2 loc27 = loc27, 3 |
1150 |
// pshr2 loc28 = loc28, 3 |
1151 |
// pshr2 loc29 = loc29, 3 |
1152 |
// pshr2 loc30 = loc30, 3 |
1153 |
// pshr2 loc31 = loc31, 3 |
1154 |
// ;; |
1155 |
// // ************ |
1156 |
// // Store Matrix |
1157 |
// // ************ |
1158 |
// st8 [loc0] = loc16 |
1159 |
// st8 [loc1] = loc24 |
1160 |
// st8 [loc2] = loc17 |
1161 |
// st8 [loc3] = loc25 |
1162 |
// st8 [loc4] = loc18 |
1163 |
// st8 [loc5] = loc26 |
1164 |
// st8 [loc6] = loc19 |
1165 |
// st8 [loc7] = loc27 |
1166 |
// st8 [loc8] = loc20 |
1167 |
// st8 [loc9] = loc28 |
1168 |
// st8 [loc10] = loc21 |
1169 |
// st8 [loc11] = loc29 |
1170 |
// st8 [loc12] = loc22 |
1171 |
// st8 [loc13] = loc30 |
1172 |
// st8 [loc14] = loc23 |
1173 |
// st8 [loc15] = loc31 |
1174 |
// |
1175 |
// mov ar.pfs = r14 |
1176 |
// br.ret.sptk.many b0 |
1177 |
// .endp fdct_ia64# |
1178 |
// .common fdct#,8,8 |
1179 |
// |