Annotation of /trunk/xvidcore/src/utils/ia64_asm/mem_transfer_ia64.s

Revision 305 - (view) (download)

1 :	ia64p	305	///////////////////////////////////////////////////////////////////////////////
2 :			//
3 :			// mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel,
4 :			// University of Karlsruhe, Germany, 03.06.2002, during the laboratory
5 :			// "IA-64 Video Codec Assembler Praktikum" at IPD Goos.
6 :			//
7 :			//
8 :			///// legal header taken from original C-file ///////////////////////////////////////
9 :			//
10 :			// XVID MPEG-4 VIDEO CODEC
11 :			// - 8bit<->16bit transfer -
12 :			//
13 :			// This program is an implementation of a part of one or more MPEG-4
14 :			// Video tools as specified in ISO/IEC 14496-2 standard. Those intending
15 :			// to use this software module in hardware or software products are
16 :			// advised that its use may infringe existing patents or copyrights, and
17 :			// any such use would be at such party's own risk. The original
18 :			// developer of this software module and his/her company, and subsequent
19 :			// editors and their companies, will have no liability for use of this
20 :			// software or modifications or derivatives thereof.
21 :			//
22 :			// This program is free software ; you can redistribute it and/or modify
23 :			// it under the terms of the GNU General Public License as published by
24 :			// the Free Software Foundation ; either version 2 of the License, or
25 :			// (at your option) any later version.
26 :			//
27 :			// This program is distributed in the hope that it will be useful,
28 :			// but WITHOUT ANY WARRANTY ; without even the implied warranty of
29 :			// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 :			// GNU General Public License for more details.
31 :			//
32 :			// You should have received a copy of the GNU General Public License
33 :			// along with this program ; if not, write to the Free Software
34 :			// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
35 :			//
36 :			///// History /////////////////////////////////////////////////////////////////
37 :			//
38 :			// - 16.07.2002: several minor changes for ecc-conformity
39 :			// - 03.06.2002: initial version
40 :			//
41 :			///////////////////////////////////////////////////////////////////////////////
42 :			//
43 :			// Annotations:
44 :			// ===========
45 :			//
46 :			// - All functions work on 8x8-matrices. While the C-code-functions treat each
47 :			// element separately, the functions in this assembler-code treat a whole line
48 :			// simultaneously. So one loop is saved.
49 :			// The remaining loop is realized by using software pipelining with rotating
50 :			// registers.
51 :			// - Register renaming is used for better readability
52 :			// - To load 8 bytes of misaligned data, two 8-byte-blocks are loaded, both
53 :			// parts are shifted and joined together with an "OR"-Instruction.
54 :			// - First parameter is stored in GR 32, next in GR 33, and so on. They must be
55 :			// saved, as these GRs are used for register-rotation.
56 :			// - Some of the original, German comments used during development are left
57 :			// in the code. They shouldn't bother anyone.
58 :			//
59 :			// Anmerkungen:
60 :			// ============
61 :			//
62 :			// - Alle Funktionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code
63 :			// jedes Element einzeln bearbeiten, bearbeiten die Funktionen dieses Assembler-
64 :			// Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden.
65 :			// Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit
66 :			// rotierenden Registern realisiert.
67 :			// - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet.
68 :			// - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke
69 :			// geladen, beide Teile mit "shift"-Operationen zurechtgerückt und mit einem
70 :			// logischen Oder zusammenkopiert.
71 :			// - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge-
72 :			// sichert werden, da die Register für die Register-Rotation benötigt werden.
73 :			// - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase
74 :			// sind im Code verblieben. Sie sollten niemanden stören.
75 :			//
76 :			///////////////////////////////////////////////////////////////////////////////
77 :	ia64p	256
78 :
79 :			// * define latencies for the software pipelines *
80 :			// (the procedures below use these values to size their .rotr/.rotp arrays and to compute ar.ec)
81 :			LL = 3 // Load
82 :			SL = 3 // Store (NOTE(review): not referenced by any procedure visible in this file)
83 :			PL = 1 // Pack
84 :			SHL = 1 // Shift
85 :			OL = 1 // Or
86 :			UL = 1 // Unpack
87 :			PAL = 1 // Parallel Add
88 :			PSL = 1 // Parallel Subtract
89 :			PAVGL = 1 // Parallel Average
90 :
91 :			.text
92 :
93 :
94 :	ia64p	305	///////////////////////////////////////////////////////////////////////////////
95 :			//
96 :			// transfer8x8_copy_ia64
97 :			//
98 :			// SRC may be misaligned. To realign it, two aligned 8-byte words are loaded,
99 :			// shifted and joined, and the resulting 8 bytes are stored to the destination.
100 :			// Inputs as read below: r32 = dst, r33 = src, r34 = stride. ar.lc = 7, i.e. 8 rows.
101 :			///////////////////////////////////////////////////////////////////////////////
102 :	ia64p	256
103 :	ia64p	205	.align 16
104 :	ia64p	256	.global transfer8x8_copy_ia64#
105 :			.proc transfer8x8_copy_ia64#
106 :
107 :			transfer8x8_copy_ia64:
108 :			.prologue
109 :
110 :			// * register renaming *
111 :			zero = r0
112 :
113 :			oldLC = r2
114 :			oldPR = r3
115 :
116 :			src_1 = r14 // address of the aligned 8-byte word containing the start of src (src & -8)
117 :			src_2 = r15 // address of the following aligned 8-byte word (src_1 + 8)
118 :			dst = r16 // destination address
119 :			stride = r17
120 :
121 :			offset = r18 // shift-right count in bits: (src & 7) << 3
122 :			aoffset = r19 // shift-left count in bits: 64 - offset
123 :
124 :			// * Saving old Loop-Counter (LC) and Predicate Registers (PR) *
125 :			.save ar.lc, oldLC
126 :			mov oldLC = ar.lc
127 :			mov oldPR = pr
128 :	ia64p	305
129 :			.body
130 :
131 :	ia64p	256	// * Allocating new register frame: 3 inputs, 29 locals, 0 outputs, 32 rotating registers *
132 :			alloc r9 = ar.pfs, 3, 29, 0, 32
133 :
134 :			// * Saving parameters (r32+ are reused as rotating registers inside the loop) *
135 :			mov dst = r32
136 :			mov stride = r34
137 :
138 :			// * Misalignment treatment *
139 :			and src_1 = -8, r33 // Computing address of first aligned block containing src values
140 :			dep offset = r33, zero, 3, 3 // Low 3 bits of the src address deposited at bit 3: byte offset * 8, used as shr count
141 :			;;
142 :			sub aoffset = 64, offset // Computing counterpart of offset ("anti-offset"), used for shl
143 :			add src_2 = 8, src_1 // Computing address of second aligned block containing src values
144 :
145 :			// * init loop: set loop counter, epilog counter, predicates *
146 :			mov ar.lc = 7
147 :			mov ar.ec = LL + SHL + OL + 1
148 :			mov pr.rot = 1 << 16 // only bit 16 set in the rotating predicates
149 :			;;
150 :
151 :			// * define register arrays and predicate array for software pipeline *
152 :			// src_v1/src_v2 = loaded source words, shd_r = shifted right, shd_l = shifted left, value = joined result
153 :			.rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1]
154 :			.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1]
155 :	ia64p	305
156 :
157 :			// Software pipelined loop:
158 :			// Stage 1: Load two 8-byte words from SRC_1, SRC_2 into SRC_v1 and SRC_v2
159 :			// Stage 2: Shift both values of source to SHD_R and SHD_L
160 :			// Stage 3: Join both parts together with OR
161 :			// Stage 4: Store aligned data to destination and add stride to destination address
162 :
163 :
164 :	ia64p	256	.Loop_8x8copy:
165 :			{.mii
166 :			(ld_stage[0]) ld8 src_v1[0] = [src_1], stride // load lower aligned word, advance src_1 by stride
167 :			(sh_stage[0]) shr.u shd_r[0] = src_v1[LL], offset
168 :			}
169 :			{.mii
170 :			(ld_stage[0]) ld8 src_v2[0] = [src_2], stride // load upper aligned word, advance src_2 by stride
171 :			(sh_stage[0]) shl shd_l[0] = src_v2[LL], aoffset
172 :			(or_stage[0]) or value[0] = shd_l[SHL], shd_r[SHL]
173 :			}
174 :			{.mib
175 :			(st_stage[0]) st8 [dst] = value[OL]
176 :			(st_stage[0]) add dst = dst, stride // next destination row
177 :			br.ctop.sptk.few .Loop_8x8copy
178 :			;;
179 :			}
180 :
181 :			// * Restore old LC and PRs *
182 :			mov ar.lc = oldLC
183 :			mov pr = oldPR, -1
184 :
185 :			br.ret.sptk.many b0
186 :
187 :			.endp transfer8x8_copy_ia64#
188 :
189 :
190 :
191 :
192 :	ia64p	305	///////////////////////////////////////////////////////////////////////////////
193 :			//
194 :			// transfer_8to16copy_ia64
195 :			//
196 :			// SRC is assumed to be aligned. To convert 8 bit unsigned values to 16 bit values,
197 :			// UNPACK with zero is used. So 8 bytes are loaded from source, unpacked to two
198 :			// 4 x 16 bit values and stored to the destination. Destination is a continuous
199 :			// array of 64 x 16 bit signed data. To store the next line, only 16 must be
200 :			// added to the destination address. Inputs as read below: r32 = dst, r33 = src, r34 = stride.
201 :			///////////////////////////////////////////////////////////////////////////////
202 :	ia64p	256
203 :			.align 16
204 :	ia64p	205	.global transfer_8to16copy_ia64#
205 :			.proc transfer_8to16copy_ia64#
206 :	ia64p	256
207 :
208 :	ia64p	205	transfer_8to16copy_ia64:
209 :			.prologue
210 :	ia64p	256
211 :			// * register renaming *
212 :			oldLC = r2
213 :			oldPR = r3
214 :
215 :			zero = r0 // "zero" stands for the constant 0
216 :
217 :			dst_1 = r14 // destination address for first 4 x 16 bit values
218 :			dst_2 = r15 // destination address for second 4 x 16 bit values
219 :			src = r16
220 :			stride = r17
221 :
222 :			// * Saving old Loop-Counter (LC) and Predicate Registers (PR) *
223 :			.save ar.lc, oldLC
224 :			mov oldLC = ar.lc
225 :			mov oldPR = pr
226 :
227 :	ia64p	305
228 :			.body
229 :
230 :	ia64p	256	// * Allocating new register frame (4 inputs, 92 locals, 0 outputs, 96 rotating registers) *
231 :			alloc r9 = ar.pfs, 4, 92, 0, 96
232 :
233 :			// * Saving parameters *
234 :			mov dst_1 = r32 // first 4 x 16 bit values
235 :			add dst_2 = 8, r32 // second 4 x 16 bit values
236 :			mov src = r33
237 :			mov stride = r34
238 :
239 :			// * init loop: set loop counter, epilog counter, predicates *
240 :			mov ar.lc = 7
241 :			mov ar.ec = LL + UL + 1
242 :			mov pr.rot = 1 << 16
243 :	ia64p	205	;;
244 :	ia64p	256
245 :			// * define register arrays and predicate array for software pipeline *
246 :			// src_v = source value, dst_v1 / dst_v2 = unpacked low / high halves
247 :			.rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1]
248 :			.rotp ld_stage[LL], upack_stage[UL], st_stage[1]
249 :	ia64p	305
250 :	ia64p	256
251 :	ia64p	305	// Software pipelined loop:
252 :			// Stage 1: Load value of SRC (src is post-incremented by stride)
253 :			// Stage 2: Unpack the SRC_V to two 4 x 16 bit values
254 :			// Stage 3: Store both 8-byte words of 16 bit data (each pointer advances by 16)
255 :
256 :
257 :	ia64p	256	.Loop_8to16copy:
258 :			{.mii
259 :			(ld_stage[0]) ld8 src_v[0] = [src], stride
260 :			(upack_stage[0]) unpack1.l dst_v1[0] = zero, src_v[LL]
261 :			(upack_stage[0]) unpack1.h dst_v2[0] = zero, src_v[LL]
262 :			}
263 :			{.mmb
264 :			(st_stage[0]) st8 [dst_1] = dst_v1[UL], 16
265 :			(st_stage[0]) st8 [dst_2] = dst_v2[UL], 16
266 :			br.ctop.sptk.few .Loop_8to16copy
267 :			;;
268 :			}
269 :
270 :			// * Restore old LC and PRs *
271 :			mov ar.lc = oldLC
272 :			mov pr = oldPR, -1
273 :
274 :	ia64p	205	br.ret.sptk.many b0
275 :			.endp transfer_8to16copy_ia64#
276 :	ia64p	256
277 :
278 :
279 :
280 :	ia64p	305	///////////////////////////////////////////////////////////////////////////////
281 :			//
282 :			// transfer_16to8copy_ia64
283 :			//
284 :			// src is a 64 x 16 bit signed continuous array. To convert the 16 bit
285 :			// values to 8 bit unsigned data, PACK is used. So two 8-byte-words of
286 :			// 4 x 16 bit signed data are loaded, packed together and stored as one 8-byte-word
287 :			// of 8 x 8 bit unsigned data to the destination. Inputs as read below: r32 = dst, r33 = src, r34 = stride.
288 :			///////////////////////////////////////////////////////////////////////////////
289 :	ia64p	256
290 :	ia64p	205	.align 16
291 :			.global transfer_16to8copy_ia64#
292 :			.proc transfer_16to8copy_ia64#
293 :			transfer_16to8copy_ia64:
294 :			.prologue
295 :	ia64p	256
296 :			// * register renaming *
297 :			dst = r14
298 :			src_1 = r15 // address of the first 4 x 16 bit values of a source row
299 :			src_2 = r17 // address of the second 4 x 16 bit values (src_1 + 8)
300 :			stride = r16
301 :			// NOTE: oldLC / oldPR are not re-assigned here; they keep the r2 / r3 aliases assigned by the procedures above
302 :			// * Saving old Loop-Counter (LC) and Predicate Registers (PR) *
303 :			.save ar.lc, oldLC
304 :			mov oldLC = ar.lc
305 :			mov oldPR = pr
306 :
307 :	ia64p	305
308 :			.body
309 :
310 :	ia64p	256	// * Allocating new register frame (4 inputs, 92 locals, 0 outputs, 96 rotating registers) *
311 :			alloc r9 = ar.pfs, 4, 92, 0, 96
312 :
313 :			// * Saving parameters *
314 :			mov dst = r32
315 :			mov src_1 = r33
316 :			add src_2 = 8, r33
317 :			mov stride = r34
318 :
319 :			// * init loop: set loop counter, epilog counter, predicates *
320 :			mov ar.lc = 7
321 :			mov ar.ec = LL + PL + 1
322 :			mov pr.rot = 1 << 16
323 :	ia64p	205	;;
324 :	ia64p	256
325 :			// * define register arrays and predicate array for software pipeline *
326 :			// src_v1 / src_v2 = source values, dst_v = destination value
327 :			.rotr src_v1[LL+1], src_v2[LL+1], dst_v[PL+1]
328 :			.rotp ld_stage[LL], pack_stage[PL], st_stage[1]
329 :
330 :
331 :	ia64p	305	// Software pipelined loop:
332 :			// Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data
333 :			// Stage 2: Pack them together to one 8-byte word of 8 x 8 bit unsigned data
334 :			// Stage 3: Store the 8 bytes to the destination address and add stride to
335 :			// destination address (to get the next 8 byte line of destination)
336 :
337 :
338 :	ia64p	256	.Loop_16to8copy:
339 :			{.mmi
340 :			(ld_stage[0]) ld8 src_v1[0] = [src_1], 16
341 :			(ld_stage[0]) ld8 src_v2[0] = [src_2], 16
342 :			(pack_stage[0]) pack2.uss dst_v[0] = src_v1[LL], src_v2[LL]
343 :			}
344 :			{.mib
345 :			(st_stage[0]) st8 [dst] = dst_v[PL]
346 :			(st_stage[0]) add dst = dst, stride
347 :			br.ctop.sptk.few .Loop_16to8copy
348 :			;;
349 :			}
350 :
351 :			// * Restore old LC and PRs *
352 :			mov ar.lc = oldLC
353 :			mov pr = oldPR, -1
354 :
355 :	ia64p	205	br.ret.sptk.many b0
356 :			.endp transfer_16to8copy_ia64#
357 :	ia64p	256
358 :
359 :
360 :	ia64p	305	///////////////////////////////////////////////////////////////////////////////
361 :			//
362 :			// transfer_16to8add_ia64
363 :			//
364 :			// The 8-bit values of dst are "unpacked" into two 8-byte blocks containing 16-
365 :			// bit values. These are "parallel-added" to the values of src. The result is
366 :			// converted into 8-bit values using "PACK" and stored at the address of dst.
367 :			// We assume that there is no misalignment.
368 :			// Inputs as read below: r32 = dst, r33 = src, r34 = stride.
369 :			///////////////////////////////////////////////////////////////////////////////
370 :	ia64p	256
371 :	ia64p	205	.align 16
372 :	ia64p	256	.global transfer_16to8add_ia64#
373 :			.proc transfer_16to8add_ia64#
374 :
375 :			transfer_16to8add_ia64:
376 :			.prologue
377 :
378 :			// * register renaming *
379 :			dst = r14
380 :			src = r15 // address of the first 4 x 16 bit values of a source row
381 :			stride = r16
382 :
383 :			_src = r17 // address of the second 4 x 16 bit values (src + 8)
384 :			// NOTE: oldLC / oldPR are not re-assigned here; they keep the r2 / r3 aliases assigned by the procedures above
385 :			// * Saving old Loop-Counter (LC) and Predicate Registers (PR) *
386 :			.save ar.lc, r2
387 :			mov oldLC = ar.lc
388 :			mov oldPR = pr
389 :
390 :	ia64p	305
391 :			.body
392 :
393 :	ia64p	256	// * Allocating new register frame (4 inputs, 92 locals, 0 outputs, 96 rotating registers) *
394 :			alloc r9 = ar.pfs, 4, 92, 0, 96
395 :
396 :			// * Saving parameters *
397 :			mov dst = r32
398 :			mov src = r33
399 :			mov stride = r34
400 :			add _src = 8, r33
401 :
402 :			// * init loop: set loop counter, epilog counter, predicates *
403 :			mov ar.lc = 7
404 :			mov ar.ec = LL + UL + PAL + PL + 1
405 :			mov pr.rot = 1 << 16
406 :			;;
407 :
408 :			// * define register arrays and predicate array for software pipeline *
409 :			.rotr _dst[LL+UL+PAL+PL+1], dst8[PL+1], pixel_1[PAL+1], pixel_2[PAL+1], w_dst16_1[UL+1], w_src_1[LL+UL+1], w_dst16_2[UL+1], w_src_2[LL+UL+1], w_dst8[LL+1]
410 :			.rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1]
411 :
412 :
413 :	ia64p	305	// Software pipelined loop:
414 :			// s1_p: The values of src and dst are loaded; the dst row address is kept in _dst
415 :			// s2_p: The dst-values are converted to 16-bit-values
416 :			// s3_p: The values of src and dst are added
417 :			// s4_p: The results are packed into 8-bit-values
418 :			// s5_p: The 8-bit-values are stored at the saved dst addresses
419 :	ia64p	256
420 :	ia64p	305
421 :	ia64p	256	.Loop_16to8add:
422 :			{.mii
423 :			(s1_p[0]) ld8 w_src_1[0] = [src], 16 // loads the 1st half of row j of src (i = 0..3)
424 :			(s1_p[0]) mov _dst[0] = dst // keeps the current dst row address for the store in stage s5_p
425 :			(s3_p[0]) padd2.sss pixel_1[0] = w_dst16_1[UL], w_src_1[LL+UL] // parallel addition of src and dst
426 :			}
427 :			{.mii
428 :			(s1_p[0]) ld8 w_dst8[0] = [dst], stride // loads row j of dst and advances dst by stride
429 :			(s2_p[0]) unpack1.l w_dst16_1[0] = r0, w_dst8[LL]; // dst is converted to 16 bit for i = 0..3
430 :			(s2_p[0]) unpack1.h w_dst16_2[0] = r0, w_dst8[LL]; // dst is converted to 16 bit for i = 4..7
431 :			}
432 :			{.mii
433 :			(s1_p[0]) ld8 w_src_2[0] = [_src], 16 // loads the 2nd half of row j of src (i = 4..7)
434 :			(s3_p[0]) padd2.sss pixel_2[0] = w_dst16_2[UL], w_src_2[LL+UL] // parallel addition of src and dst
435 :			(s4_p[0]) pack2.uss dst8[0] = pixel_1[PAL], pixel_2[PAL] // converts the sums (pixel) into 8-bit values; the range check is done by the instruction itself
436 :			}
437 :			{.mmb
438 :			(s5_p[0]) st8 [_dst[LL+UL+PAL+PL]] = dst8[PL] // stores the result row to dst
439 :			(s1_p[0]) nop.m 0
440 :			br.ctop.sptk.few .Loop_16to8add
441 :			;;
442 :			}
443 :
444 :			// * Restore old LC and PRs *
445 :			mov ar.lc = oldLC
446 :			mov pr = oldPR, -1
447 :
448 :			br.ret.sptk.many b0
449 :			.endp transfer_16to8add_ia64#
450 :
451 :
452 :
453 :	ia64p	305	///////////////////////////////////////////////////////////////////////////////
454 :			//
455 :			// transfer_8to16sub_ia64
456 :			//
457 :			// The 8-bit values of ref and cur are loaded and converted to 16 bit. The
458 :			// difference cur - ref is stored at the dct addresses and the (realigned)
459 :			// ref values are written back over the cur row.
460 :			// Inputs as read below: r32 = dct, r33 = cur, r34 = ref, r35 = stride.
461 :			// The data addressed by 'ref' may be misaligned in memory; it is realigned below.
462 :			// The other data are assumed to be aligned (at least I hope so).
463 :			//
464 :			///////////////////////////////////////////////////////////////////////////////
465 :	ia64p	256
466 :			.align 16
467 :	ia64p	205	.global transfer_8to16sub_ia64#
468 :			.proc transfer_8to16sub_ia64#
469 :	ia64p	256
470 :
471 :	ia64p	205	transfer_8to16sub_ia64:
472 :			.prologue
473 :	ia64p	256
474 :			// * register renaming *
475 :			oldLC = r2
476 :			oldPR = r3
477 :
478 :			zero = r0 // "zero" stands for the constant 0
479 :
480 :			// The following registers get the same names as the variables in the C original
481 :			dct = r14
482 :			cur = r15
483 :			ref = r34 // does not need to be saved separately, so the parameter register stays in this list
484 :			stride = r16
485 :
486 :			offset = r17 // bit offset of the misaligned data, used to shift it into place
487 :			aoffset = r18 // counterpart of offset (64 - offset)
488 :			ref_a1 = r19 // address of the first 64-bit block of ref
489 :			ref_a2 = r20 // address of the second 64-bit block of ref
490 :
491 :			_dct = r21 // register for the destination addresses of the 2nd dct block
492 :
493 :			// * Saving old Loop-Counter (LC) and Predicate Registers (PR) *
494 :			.save ar.lc, r2
495 :			mov oldLC = ar.lc
496 :			mov oldPR = pr
497 :
498 :	ia64p	305
499 :			.body
500 :
501 :	ia64p	256	// * Allocating new register frame (4 inputs, 92 locals, 0 outputs, 96 rotating registers) *
502 :			alloc r9 = ar.pfs, 4, 92, 0, 96
503 :
504 :			// * Saving parameters *
505 :			mov dct = r32
506 :			mov cur = r33
507 :			// mov ref = r34: ref is unaligned, get aligned ref below...
508 :			mov stride = r35
509 :
510 :			and ref_a1 = -8, ref // address of the first 64-bit block containing ref (ref rounded down to a multiple of 8)
511 :			dep offset = ref, zero, 3, 3 // low 3 bits of ref deposited at bit 3: byte offset * 8
512 :	ia64p	205	;;
513 :	ia64p	256	add ref_a2 = 8, ref_a1
514 :			sub aoffset = 64, offset // counterpart of offset is computed
515 :			add _dct = 8, dct // address of the 2nd dct block: 8 bytes (= 64 bits) above the 1st block
516 :
517 :			// * init loop: set loop counter, epilog counter, predicates *
518 :			mov ar.lc = 7
519 :			mov ar.ec = LL + SHL + OL + UL + PSL + 1
520 :			mov pr.rot = 1 << 16
521 :	ia64p	205	;;
522 :	ia64p	256
523 :			// * define register arrays and predicate array for software pipeline *
524 :			.rotr c[LL+1], ref_v1[LL+1], ref_v2[LL+1], c16_1[SHL+OL+UL+1], c16_2[SHL+OL+UL+1], ref_shdr[SHL+1], ref_shdl[SHL+1], r[OL+1], r16_1[UL+1], r16_2[UL+1], dct_1[PSL+1], dct_2[PSL+1], _cur[LL+SHL+OL+UL+1]
525 :			.rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1]
526 :
527 :
528 :	ia64p	305	// Software pipelined loop:
529 :			// s1_p: The values of ref and cur are loaded, a copy of the cur address is kept.
530 :			// s2_p: cur is converted to 16-bit and the misaligned values of ref are
531 :			// shifted...
532 :			// s3_p: ... and joined together.
533 :			// s4_p: This ref-value is converted to 16-bit. The ref values are stored
534 :			// at the saved cur address.
535 :			// s5_p: the ref- and cur-values are subtracted...
536 :			// s6_p: ...and the result is stored at the dct addresses.
537 :
538 :	ia64p	256
539 :			.Loop_8to16sub:
540 :			{.mii
541 :			(s1_p[0]) ld8 ref_v1[0] = [ref_a1], stride // loads the 1st 64-bit block containing part of the ref data
542 :			(s1_p[0]) mov _cur[0] = cur // the cur address is saved for later use
543 :			(s2_p[0]) shr.u ref_shdr[0] = ref_v1[LL], offset // the right part is shifted into place
544 :			}
545 :			{.mii
546 :			(s1_p[0]) ld8 ref_v2[0] = [ref_a2], stride // loads the 2nd 64-bit block
547 :			(s2_p[0]) shl ref_shdl[0] = ref_v2[LL], aoffset // the left part is shifted into place
548 :			(s3_p[0]) or r[0] = ref_shdr[SHL], ref_shdl[SHL] // the shifted parts are joined in r
549 :			}
550 :			{.mii
551 :			(s1_p[0]) ld8 c[0] = [cur], stride // loads the complete row j of cur
552 :			(s2_p[0]) unpack1.l c16_1[0] = zero, c[LL]; // c is converted to 16 bit for i = 0..3
553 :			(s2_p[0]) unpack1.h c16_2[0] = zero, c[LL]; // c is converted to 16 bit for i = 4..7
554 :			}
555 :			{.mii
556 :			(s4_p[0]) st8 [_cur[LL+SHL+OL]] = r[OL] // the cur row is overwritten with the value of r
557 :			// convert the 8-bit r values into 16-bit values
558 :			(s4_p[0]) unpack1.l r16_1[0] = zero, r[OL]; // r is converted to 16 bit for i = 0..3
559 :			(s4_p[0]) unpack1.h r16_2[0] = zero, r[OL]; // r is converted to 16 bit for i = 4..7
560 :			}
561 :			{.mii
562 :			(s5_p[0]) psub2.sss dct_1[0] = c16_1[SHL+OL+UL], r16_1[UL] // subtraction for the 1st half of row j
563 :			(s5_p[0]) psub2.sss dct_2[0] = c16_2[SHL+OL+UL], r16_2[UL] // subtraction for the 2nd half
564 :			}
565 :			{.mmb
566 :			(s6_p[0]) st8 [dct] = dct_1[PSL], 16 // stores the 1st 64-bit block and advances the address by 16 bytes for the next row
567 :			(s6_p[0]) st8 [_dct] = dct_2[PSL], 16 // stores the 2nd 64-bit block and advances the address by 16 bytes for the next row
568 :			br.ctop.sptk.few .Loop_8to16sub // loop back
569 :			;;
570 :			}
571 :
572 :			// * Restore old LC and PRs *
573 :			mov ar.lc = oldLC
574 :			mov pr = oldPR, -1
575 :
576 :	ia64p	205	br.ret.sptk.many b0
577 :			.endp transfer_8to16sub_ia64#
578 :	ia64p	256
579 :
580 :
581 :
582 :
583 :	ia64p	305	///////////////////////////////////////////////////////////////////////////////
584 :			//
585 :			// transfer_8to16sub2_ia64
586 :			//
587 :			// At the time this function was written, it was not yet in use.
588 :			// We assume that the values of ref1/2 are misaligned.
589 :			// Inputs as read below: r32 = dct, r33 = cur, r34 = ref1, r35 = ref2, r36 = stride.
590 :			// The values of ref1/2 and cur are loaded, the ref-values need misalignment-
591 :			// treatment. The average of ref1 and ref2 is computed with pavg on the 8-bit
592 :			// values; the average and cur are then converted to 16-bit using unpack and the
593 :			// average is subtracted from cur. The results are stored at the dct addresses.
594 :			// pavg1.raz is used to get the same results as the C-code-function.
595 :			//
596 :			///////////////////////////////////////////////////////////////////////////////
597 :	ia64p	256
598 :			.text
599 :	ia64p	205	.align 16
600 :			.global transfer_8to16sub2_ia64#
601 :			.proc transfer_8to16sub2_ia64#
602 :	ia64p	256
603 :	ia64p	205	transfer_8to16sub2_ia64:
604 :			.prologue
605 :	ia64p	256
606 :			// * register renaming *
607 :			// We've tried to keep the C-Code names as often as possible, at least as
608 :			// part of register-names
609 :			oldLC = r2
610 :			oldPR = r3
611 :
612 :			zero = r0
613 :
614 :			dct_al = r14 // dct + 8 (see parameter setup below): receives the unpack1.h halves of each row
615 :			dct_ar = r15 // dct (see parameter setup below): receives the unpack1.l halves of each row
616 :			cur = r16
617 :			ref1_al = r17 // ref1: aligned address of lower part
618 :			ref1_ah = r18 // ref1: aligned address of higher part
619 :			ref2_al = r19 // ref2: aligned address of lower part
620 :			ref2_ah = r20 // ref2: aligned address of higher part
621 :			stride = r21
622 :
623 :			offset_1 = r22 // shift-right bit count for ref1
624 :			offset_2 = r23 // shift-right bit count for ref2
625 :			aoffset_1 = r24 // 64 - offset_1, shift-left bit count for ref1
626 :			aoffset_2 = r25 // 64 - offset_2, shift-left bit count for ref2
627 :
628 :			// * Saving old Loop-Counter (LC) and Predicate Registers (PR) *
629 :	ia64p	205	.save ar.lc, r2
630 :	ia64p	256	mov oldLC = ar.lc
631 :			mov oldPR = pr
632 :
633 :	ia64p	305
634 :			.body
635 :
636 :	ia64p	256	// * Saving parameters *
637 :			// * (as input registers r32+ are needed for register-rotation) *
638 :			mov dct_ar = r32
639 :			add dct_al = 8, r32
640 :			mov cur = r33
641 :
642 :			and ref1_al = -8, r34 // ref1 aligned address of lower part
643 :			and ref2_al = -8, r35 // ref2 aligned address of lower part
644 :
645 :			mov stride = r36
646 :
647 :			// * Calculations for misalignment handling: low 3 address bits deposited at bit 3 (byte offset * 8) *
648 :			dep offset_1 = r34, zero, 3, 3
649 :			dep offset_2 = r35, zero, 3, 3
650 :	ia64p	205	;;
651 :	ia64p	256	add ref1_ah = 8, ref1_al
652 :			add ref2_ah = 8, ref2_al
653 :			sub aoffset_1 = 64, offset_1
654 :			sub aoffset_2 = 64, offset_2
655 :	ia64p	205	;;
656 :	ia64p	256
657 :			// * Allocating new register frame (5 inputs, 91 locals, 0 outputs, 96 rotating registers) *
658 :			alloc r9 = ar.pfs, 5, 91, 0, 96
659 :
660 :			// * init loop: set loop counter, epilog counter, predicates *
661 :			mov ar.lc = 7
662 :			mov ar.ec = LL + SHL + OL + PAVGL + UL +PSL + 1
663 :			mov pr.rot = 1 << 16
664 :	ia64p	205	;;
665 :	ia64p	256
666 :			// * define register arrays and predicate array for software pipeline *
667 :			.rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1]
668 :			.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1]
669 :	ia64p	305
670 :	ia64p	256
671 :	ia64p	305	// software pipelined loop:
672 :			// ld_stage: The values of ref1, ref2, cur are loaded
673 :			// sh_stage: The misaligned values of ref1/2 are shifted...
674 :			// or_stage: ...and joined together.
675 :			// pavg_stage: The average of ref1 and ref2 is computed.
676 :			// up_stage: The result and the cur-values are converted to 16-bit.
677 :			// psub_stage: Those values are subtracted...
678 :			// st_stage: ...and stored at the dct addresses.
679 :
680 :	ia64p	256
681 :			.Loop_8to16sub2:
682 :			{.mii
683 :			(ld_stage[0]) ld8 c[0] = [cur], stride
684 :			(sh_stage[0]) shr.u ref1_l[0] = ref1_vl[LL], offset_1
685 :			(sh_stage[0]) shl ref1_h[0] = ref1_vh[LL], aoffset_1
686 :			}
687 :			{.mii
688 :			(ld_stage[0]) ld8 ref1_vl[0] = [ref1_al], stride
689 :			(sh_stage[0]) shr.u ref2_l[0] = ref2_vl[LL], offset_2
690 :			(sh_stage[0]) shl ref2_h[0] = ref2_vh[LL], aoffset_2
691 :			}
692 :			{.mii
693 :			(ld_stage[0]) ld8 ref1_vh[0] = [ref1_ah], stride
694 :			(or_stage[0]) or ref1_aligned[0] = ref1_h[SHL], ref1_l[SHL]
695 :			(or_stage[0]) or ref2_aligned[0] = ref2_h[SHL], ref2_l[SHL]
696 :			}
697 :			{.mii
698 :			(ld_stage[0]) ld8 ref2_vl[0] = [ref2_al], stride
699 :			(pavg_stage[0]) pavg1.raz r[0] = ref1_aligned[OL], ref2_aligned[OL]
700 :			(up_stage[0]) unpack1.l r16_r[0] = zero, r[PAVGL]
701 :			}
702 :			{.mii
703 :			(ld_stage[0]) ld8 ref2_vh[0] = [ref2_ah], stride
704 :			(up_stage[0]) unpack1.h r16_l[0] = zero, r[PAVGL]
705 :			(up_stage[0]) unpack1.l c16_r[0] = zero, c[LL+SHL+OL+PAVGL]
706 :			}
707 :			{.mii
708 :			(st_stage[0]) st8 [dct_ar] = dct16_r[PSL], 16
709 :			(up_stage[0]) unpack1.h c16_l[0] = zero, c[LL+SHL+OL+PAVGL]
710 :			(psub_stage[0]) psub2.sss dct16_l[0] = c16_l[UL], r16_l[UL]
711 :			}
712 :			{.mib
713 :			(st_stage[0]) st8 [dct_al] = dct16_l[PSL], 16
714 :			(psub_stage[0]) psub2.sss dct16_r[0] = c16_r[UL], r16_r[UL]
715 :			br.ctop.sptk.few .Loop_8to16sub2 // loop back
716 :			;;
717 :			}
718 :
719 :			// * Restore old LC and PRs *
720 :			mov ar.lc = oldLC
721 :			mov pr = oldPR, -1
722 :
723 :	ia64p	205	br.ret.sptk.many b0
724 :			.endp transfer_8to16sub2_ia64#

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4