Annotation of /trunk/xvidcore/src/utils/ia64_asm/mem_transfer_ia64.s

Revision 1855 - (view) (download)

1 :	Isibaar	1855	// ****************************************************************************
2 :			// *
3 :			// * XVID MPEG-4 VIDEO CODEC
4 :			// * - IA64 8bit<->16bit transfer -
5 :			// *
6 :			// * Copyright(C) 2002 Sebastian Felis, Max Stengel
7 :			// *
8 :			// * This program is free software; you can redistribute it and/or modify it
9 :			// * under the terms of the GNU General Public License as published by
10 :			// * the Free Software Foundation; either version 2 of the License, or
11 :			// * (at your option) any later version.
12 :			// *
13 :			// * This program is distributed in the hope that it will be useful,
14 :			// * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 :			// * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 :			// * GNU General Public License for more details.
17 :			// *
18 :			// * You should have received a copy of the GNU General Public License
19 :			// * along with this program; if not, write to the Free Software
20 :			// * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 :			// *
22 :			// * $Id: mem_transfer_ia64.s,v 1.6 2009-02-19 17:07:29 Isibaar Exp $
23 :			// *
24 :			// ***************************************************************************/
25 :			//
26 :			// ****************************************************************************
27 :			// *
28 :			// * mem_transfer_ia64.s, IA-64 8bit<->16bit transfer
29 :			// *
30 :			// * This version was implemented during an IA-64 practical training at
31 :			// * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/)
32 :			// *
33 :			// ****************************************************************************
34 :
35 :	ia64p	305	///////////////////////////////////////////////////////////////////////////////
36 :			//
37 :			// mem_transfer.c optimized for ia-64 by Sebastian Felis and Max Stengel,
38 :			// University of Karlsruhe, Germany, 03.06.2002, during the laboratory
39 :			// "IA-64 Video Codec Assembler Praktikum" at IPD Goos.
40 :	Isibaar	1855
41 :	ia64p	305	///// History /////////////////////////////////////////////////////////////////
42 :			//
43 :			// - 16.07.2002: several minor changes for ecc-conformity
44 :			// - 03.06.2002: initial version
45 :			//
46 :	Isibaar	1855
47 :	ia64p	305	///////////////////////////////////////////////////////////////////////////////
48 :			//
49 :			// Annotations:
50 :			// ===========
51 :			//
52 :			// - All functions work on 8x8-matrices. While the C-code-functions treat each
53 :			// element separately, the functions in this assembler-code treat a whole line
54 :			// simultaneously. So one loop is saved.
55 :			// The remaining loop is realized by using software pipelining with rotating
56 :			// registers.
57 :			// - Register renaming is used for better readability
58 :			// - To load 8 bytes of misaligned data, two 8-byte-blocks are loaded, both
59 :			// parts are shifted and joined together with an "OR"-Instruction.
60 :			// - First parameter is stored in GR 32, next in GR 33, and so on. They must be
61 :			// saved, as these GRs are used for register-rotation.
62 :			// - Some of the original, German comments used during development are left
63 :			// in the code. They shouldn't bother anyone.
64 :			//
65 :			// Anmerkungen:
66 :			// ============
67 :			//
68 :			// - Alle Funktionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code
69 :			// jedes Element einzeln bearbeiten, bearbeiten die Funktionen dieses Assembler-
70 :			// Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden.
71 :			// Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit
72 :			// rotierenden Registern realisiert.
73 :			// - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet.
74 :			// - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke
75 :			// geladen, beide Teile mit "shift"-Operationen zurechtgerückt und mit einem
76 :			// logischen Oder zusammenkopiert.
77 :			// - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge-
78 :			// sichert werden, da die Register für die Register-Rotation benötigt werden.
79 :			// - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase
80 :			// sind im Code verblieben. Sie sollten niemanden stören.
81 :			//
82 :			///////////////////////////////////////////////////////////////////////////////
83 :	ia64p	256
84 :
85 :			// * Latencies (counted in pipeline stages) used to size the software pipelines *
86 :
87 :			LL = 3 // Load: a loaded value is read LL stages after its ld8 (e.g. src_v[LL])
88 :			SL = 3 // Store (not referenced by the procedures visible in this file)
89 :			PL = 1 // Pack
90 :			SHL = 1 // Shift
91 :			OL = 1 // Or
92 :			UL = 1 // Unpack
93 :			PAL = 1 // Parallel Add
94 :			PSL = 1 // Parallel Subtract
95 :			PAVGL = 1 // Parallel Average
96 :
97 :			.text
98 :
99 :
100 :	ia64p	305	///////////////////////////////////////////////////////////////////////////////
101 :			//
102 :			// transfer8x8_copy_ia64
103 :			//
104 :			// Copies 8 rows of 8 bytes. In: r32 = dst, r33 = src, r34 = stride (row step).
105 :			// src may be misaligned: per row two aligned 8-byte words are loaded, shifted
106 :			// and ORed together, and the realigned 8 bytes are stored to dst.
107 :			///////////////////////////////////////////////////////////////////////////////
108 :	ia64p	256
109 :	ia64p	205	.align 16
110 :	ia64p	256	.global transfer8x8_copy_ia64#
111 :			.proc transfer8x8_copy_ia64#
112 :
113 :			transfer8x8_copy_ia64:
114 :			.prologue
115 :
116 :			// * register renaming *
117 :			zero = r0 // used as the constant-zero operand of dep
118 :
119 :			oldLC = r2 // keeps the caller's ar.lc until the end of the procedure
120 :			oldPR = r3 // keeps the caller's predicate registers until the end of the procedure
121 :
122 :			src_1 = r14 // address of the aligned 8-byte word that contains the first src byte
123 :			src_2 = r15 // address of the following aligned 8-byte word (src_1 + 8)
124 :			dst = r16 // destination address
125 :			stride = r17 // byte step per row, applied to src_1, src_2 and dst
126 :
127 :			offset = r18 // shift right count in bits for the word loaded from src_1
128 :			aoffset = r19 // shift left count in bits (64 - offset) for the word loaded from src_2
129 :
130 :			// * Saving old Loop-Counter (LC) and Predicate Registers (PR) *
131 :			.save ar.lc, oldLC
132 :			mov oldLC = ar.lc
133 :			mov oldPR = pr
134 :	ia64p	305
135 :			.body
136 :
137 :	ia64p	256	// * Allocating new register frame: 3 inputs + 29 locals, all 32 registers rotating *
138 :			alloc r9 = ar.pfs, 3, 29, 0, 32 // r9 receives the old ar.pfs; it is not read again in this procedure
139 :
140 :			// * Saving Parameters (r32+ are reused as rotating registers inside the loop) *
141 :			mov dst = r32
142 :			mov stride = r34
143 :
144 :			// * Misalignment treatment *
145 :			and src_1 = -8, r33 // src rounded down to a multiple of 8: first aligned block containing src values
146 :			dep offset = r33, zero, 3, 3 // offset = (src & 7) << 3, i.e. the misalignment of src in bits
147 :			;;
148 :			sub aoffset = 64, offset // counterpart of offset ("anti-offset"), used for shl
149 :			add src_2 = 8, src_1 // address of the second aligned block containing src values
150 :			// NOTE(review): for an already aligned src, offset is 0 and aoffset is 64; this relies on
151 :			// * init loop: set loop counter, epilog counter, predicates *
152 :			mov ar.lc = 7 // 8 rows
153 :			mov ar.ec = LL + SHL + OL + 1 // one count per pipeline stage so the last rows drain through the pipeline
154 :			mov pr.rot = 1 << 16 // only bit 16 is set: the first stage predicate starts enabled, the others cleared
155 :			;;
156 :
157 :			// * define register arrays and predicate array for software pipeline *
158 :			// src_v1 = source value 1, shd_r = shifted right, shd_l = shifted left
159 :			.rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1]
160 :			.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1]
161 :	ia64p	305
162 :
163 :			// Software pipelined loop:
164 :			// Stage 1: Load two aligned 8-byte words from SRC_1, SRC_2 into SRC_v1 and SRC_v2
165 :			// Stage 2: Shift both values of source to SHD_R and SHD_L
166 :			// Stage 3: Join both parts together with OR
167 :			// Stage 4: Store aligned data to destination and add stride to destination address
168 :
169 :
170 :	ia64p	256	.Loop_8x8copy:
171 :			{.mii
172 :			(ld_stage[0]) ld8 src_v1[0] = [src_1], stride // load first word, then src_1 += stride
173 :			(sh_stage[0]) shr.u shd_r[0] = src_v1[LL], offset // word loaded LL stages earlier, shifted right by offset
174 :			}
175 :			{.mii
176 :			(ld_stage[0]) ld8 src_v2[0] = [src_2], stride // load second word, then src_2 += stride
177 :			(sh_stage[0]) shl shd_l[0] = src_v2[LL], aoffset // second word shifted left by 64 - offset
178 :			(or_stage[0]) or value[0] = shd_l[SHL], shd_r[SHL] // join both shifted parts into one row
179 :			}
180 :			{.mib
181 :			(st_stage[0]) st8 [dst] = value[OL] // store the realigned row
182 :			(st_stage[0]) add dst = dst, stride // advance dst to the next row
183 :			br.ctop.sptk.few .Loop_8x8copy
184 :			;;
185 :			}
186 :
187 :			// * Restore old LC and PRs *
188 :			mov ar.lc = oldLC
189 :			mov pr = oldPR, -1 // mask -1: restore all predicate registers from oldPR
190 :
191 :			br.ret.sptk.many b0
192 :
193 :			.endp transfer8x8_copy_ia64#
194 :
195 :
196 :
197 :
198 :	ia64p	305	///////////////////////////////////////////////////////////////////////////////
199 :			//
200 :			// transfer_8to16copy_ia64
201 :			//
202 :			// In: r32 = dst, r33 = src, r34 = stride (step between source rows).
203 :			// SRC is assumed to be aligned. Each of the 8 iterations loads 8 bytes from the
204 :			// source, widens them with UNPACK (zero as the second operand) into two words of
205 :			// 4 x 16 bit and stores both. Destination is a contiguous array of 64 x 16 bit
206 :			// data, so both destination pointers advance by 16 bytes per row.
207 :			///////////////////////////////////////////////////////////////////////////////
208 :	ia64p	256
209 :			.align 16
210 :	ia64p	205	.global transfer_8to16copy_ia64#
211 :			.proc transfer_8to16copy_ia64#
212 :	ia64p	256
213 :
214 :	ia64p	205	transfer_8to16copy_ia64:
215 :			.prologue
216 :	ia64p	256
217 :			// * register renaming *
218 :			oldLC = r2 // keeps the caller's ar.lc until the end of the procedure
219 :			oldPR = r3 // keeps the caller's predicate registers until the end of the procedure
220 :
221 :			zero = r0 // "zero" stands for the constant 0
222 :
223 :			dst_1 = r14 // destination address for first 4 x 16 bit values
224 :			dst_2 = r15 // destination address for second 4 x 16 bit values
225 :			src = r16 // source address, advanced by stride per row
226 :			stride = r17 // byte step between source rows
227 :
228 :			// * Saving old Loop-Counter (LC) and Predicate Registers (PR) *
229 :			.save ar.lc, oldLC
230 :			mov oldLC = ar.lc
231 :			mov oldPR = pr
232 :
233 :	ia64p	305
234 :			.body
235 :
236 :	ia64p	256	// * Allocating new register frame: 4 inputs + 92 locals, all 96 registers rotating *
237 :			alloc r9 = ar.pfs, 4, 92, 0, 96 // r9 receives the old ar.pfs; it is not read again in this procedure
238 :
239 :			// * Saving Parameters (r32+ are reused as rotating registers inside the loop) *
240 :			mov dst_1 = r32 // first 4 x 16 bit values
241 :			add dst_2 = 8, r32 // second 4 x 16 bit values
242 :			mov src = r33
243 :			mov stride = r34
244 :
245 :			// * init loop: set loop counter, epilog counter, predicates *
246 :			mov ar.lc = 7 // 8 rows
247 :			mov ar.ec = LL + UL + 1 // one count per pipeline stage so the last rows drain through the pipeline
248 :			mov pr.rot = 1 << 16 // only bit 16 is set: the first stage predicate starts enabled, the others cleared
249 :	ia64p	205	;;
250 :	ia64p	256
251 :			// * define register arrays and predicate array for software pipeline *
252 :			// src_v = source value, dst_v1 = destination value 1
253 :			.rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1]
254 :			.rotp ld_stage[LL], upack_stage[UL], st_stage[1]
255 :	ia64p	305
256 :	ia64p	256
257 :	ia64p	305	// Software pipelined loop:
258 :			// Stage 1: Load value of SRC
259 :			// Stage 2: Unpack the SRC_V to two words of 4 x 16 bit data
260 :			// Stage 3: Store both 8-byte words of 16 bit data
261 :
262 :
263 :	ia64p	256	.Loop_8to16copy:
264 :			{.mii
265 :			(ld_stage[0]) ld8 src_v[0] = [src], stride // load one row, then src += stride
266 :			(upack_stage[0]) unpack1.l dst_v1[0] = zero, src_v[LL] // widen one half of the row loaded LL stages earlier
267 :			(upack_stage[0]) unpack1.h dst_v2[0] = zero, src_v[LL] // widen the other half of the same row
268 :			}
269 :			{.mmb
270 :			(st_stage[0]) st8 [dst_1] = dst_v1[UL], 16 // store, then dst_1 += 16 (next destination row)
271 :			(st_stage[0]) st8 [dst_2] = dst_v2[UL], 16 // store, then dst_2 += 16 (next destination row)
272 :			br.ctop.sptk.few .Loop_8to16copy
273 :			;;
274 :			}
275 :
276 :			// * Restore old LC and PRs *
277 :			mov ar.lc = oldLC
278 :			mov pr = oldPR, -1 // mask -1: restore all predicate registers from oldPR
279 :
280 :	ia64p	205	br.ret.sptk.many b0
281 :			.endp transfer_8to16copy_ia64#
282 :	ia64p	256
283 :
284 :
285 :
286 :	ia64p	305	///////////////////////////////////////////////////////////////////////////////
287 :			//
288 :			// transfer_16to8copy_ia64
289 :			//
290 :			// In: r32 = dst, r33 = src, r34 = stride (step between destination rows).
291 :			// src is a 64 x 16 bit signed contiguous array. To convert the 16 bit values to
292 :			// 8 bit unsigned data, PACK is used: per row two 8-byte words of 4 x 16 bit are
293 :			// loaded, packed together and stored as one 8-byte word of 8 x 8 bit to dst.
294 :			///////////////////////////////////////////////////////////////////////////////
295 :	ia64p	256
296 :	ia64p	205	.align 16
297 :			.global transfer_16to8copy_ia64#
298 :			.proc transfer_16to8copy_ia64#
299 :			transfer_16to8copy_ia64:
300 :			.prologue
301 :	ia64p	256
302 :			// * register renaming *
303 :			dst = r14 // destination address, advanced by stride per row
304 :			src_1 = r15 // address of the first 4 x 16 bit values of a row
305 :			src_2 = r17 // address of the second 4 x 16 bit values of a row (src_1 + 8)
306 :			stride = r16 // byte step between destination rows
307 :
308 :			// * Saving old Loop-Counter (LC) and Predicate Registers (PR) *
309 :			.save ar.lc, oldLC // oldLC/oldPR are not renamed in this procedure; they resolve to the r2/r3 assignments made earlier in the file
310 :			mov oldLC = ar.lc
311 :			mov oldPR = pr
312 :
313 :	ia64p	305
314 :			.body
315 :
316 :	ia64p	256	// * Allocating new register frame: 4 inputs + 92 locals, all 96 registers rotating *
317 :			alloc r9 = ar.pfs, 4, 92, 0, 96 // r9 receives the old ar.pfs; it is not read again in this procedure
318 :
319 :			// * Saving Parameters (r32+ are reused as rotating registers inside the loop) *
320 :			mov dst = r32
321 :			mov src_1 = r33
322 :			add src_2 = 8, r33
323 :			mov stride = r34
324 :
325 :			// * init loop: set loop counter, epilog counter, predicates *
326 :			mov ar.lc = 7 // 8 rows
327 :			mov ar.ec = LL + PL + 1 // one count per pipeline stage so the last rows drain through the pipeline
328 :			mov pr.rot = 1 << 16 // only bit 16 is set: the first stage predicate starts enabled, the others cleared
329 :	ia64p	205	;;
330 :	ia64p	256
331 :			// * define register arrays and predicate array for software pipeline *
332 :			// src_v1 = source value 1, dst_v = destination value
333 :			.rotr src_v1[LL+1], src_v2[LL+1], dst_v[PL+1]
334 :			.rotp ld_stage[LL], pack_stage[PL], st_stage[1]
335 :
336 :
337 :	ia64p	305	// Software pipelined loop:
338 :			// Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data
339 :			// Stage 2: Pack them together to one 8 byte 8 x 8 bit unsigned data
340 :			// Stage 3: Store the 8 byte to the destination address and add stride to
341 :			// destination address (to get the next 8 byte line of destination)
342 :
343 :
344 :	ia64p	256	.Loop_16to8copy:
345 :			{.mmi
346 :			(ld_stage[0]) ld8 src_v1[0] = [src_1], 16 // load first half of the row, then src_1 += 16 (next source row)
347 :			(ld_stage[0]) ld8 src_v2[0] = [src_2], 16 // load second half of the row, then src_2 += 16
348 :			(pack_stage[0]) pack2.uss dst_v[0] = src_v1[LL], src_v2[LL] // pack the two halves loaded LL stages earlier into 8 bytes
349 :			}
350 :			{.mib
351 :			(st_stage[0]) st8 [dst] = dst_v[PL] // store the packed row
352 :			(st_stage[0]) add dst = dst, stride // advance dst to the next row
353 :			br.ctop.sptk.few .Loop_16to8copy
354 :			;;
355 :			}
356 :
357 :			// * Restore old LC and PRs *
358 :			mov ar.lc = oldLC
359 :			mov pr = oldPR, -1 // mask -1: restore all predicate registers from oldPR
360 :
361 :	ia64p	205	br.ret.sptk.many b0
362 :			.endp transfer_16to8copy_ia64#
363 :	ia64p	256
364 :
365 :
366 :	ia64p	305	///////////////////////////////////////////////////////////////////////////////
367 :			//
368 :			// transfer_16to8add_ia64
369 :			//
370 :			// In: r32 = dst, r33 = src, r34 = stride (step between dst rows).
371 :			// The 8-bit values of dst are "unpacked" into two 8-byte blocks of 16-bit values.
372 :			// These are "parallel-added" to the values of src. The result is converted into
373 :			// 8-bit values using "PACK" and stored back at the address of dst.
374 :			// We assume that there is no misalignment.
375 :			///////////////////////////////////////////////////////////////////////////////
376 :	ia64p	256
377 :	ia64p	205	.align 16
378 :	ia64p	256	.global transfer_16to8add_ia64#
379 :			.proc transfer_16to8add_ia64#
380 :
381 :			transfer_16to8add_ia64:
382 :			.prologue
383 :
384 :			// * register renaming *
385 :			dst = r14 // address of the current dst row, advanced by stride
386 :			src = r15 // address of the first 4 x 16 bit src values of a row
387 :			stride = r16 // byte step between dst rows
388 :
389 :			_src = r17 // address of the second 4 x 16 bit src values of a row (src + 8)
390 :
391 :			// * Saving old Loop-Counter (LC) and Predicate Registers (PR) *
392 :			.save ar.lc, r2 // oldLC/oldPR are not renamed in this procedure; they resolve to the r2/r3 assignments made earlier in the file
393 :			mov oldLC = ar.lc
394 :			mov oldPR = pr
395 :
396 :	ia64p	305
397 :			.body
398 :
399 :	ia64p	256	// * Allocating new register frame: 4 inputs + 92 locals, all 96 registers rotating *
400 :			alloc r9 = ar.pfs, 4, 92, 0, 96 // r9 receives the old ar.pfs; it is not read again in this procedure
401 :
402 :			// * Saving Parameters (r32+ are reused as rotating registers inside the loop) *
403 :			mov dst = r32
404 :			mov src = r33
405 :			mov stride = r34
406 :			add _src = 8, r33
407 :
408 :			// * init loop: set loop counter, epilog counter, predicates *
409 :			mov ar.lc = 7 // 8 rows
410 :			mov ar.ec = LL + UL + PAL + PL + 1 // one count per pipeline stage so the last rows drain through the pipeline
411 :			mov pr.rot = 1 << 16 // only bit 16 is set: the first stage predicate starts enabled, the others cleared
412 :			;;
413 :
414 :			// * define register arrays and predicate array for software pipeline *
415 :			.rotr _dst[LL+UL+PAL+PL+1], dst8[PL+1], pixel_1[PAL+1], pixel_2[PAL+1], w_dst16_1[UL+1], w_src_1[LL+UL+1], w_dst16_2[UL+1], w_src_2[LL+UL+1], w_dst8[LL+1]
416 :			.rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1]
417 :
418 :
419 :	ia64p	305	// Software pipelined loop:
420 :			// s1_p: The values of src and dst are loaded, the dst address is remembered
421 :			// s2_p: The dst-values are converted to 16-bit-values
422 :			// s3_p: The values of src and dst are added
423 :			// s4_p: The results are packed into 8-bit-values
424 :			// s5_p: The 8-bit-values are stored at the remembered dst-addresses
425 :	ia64p	256
426 :	ia64p	305
427 :	ia64p	256	.Loop_16to8add:
428 :			{.mii
429 :			(s1_p[0]) ld8 w_src_1[0] = [src], 16 // load the first half of row j of src (i = 0..3), then src += 16
430 :			(s1_p[0]) mov _dst[0] = dst // remember the address of this dst row for the store in stage s5_p
431 :			(s3_p[0]) padd2.sss pixel_1[0] = w_dst16_1[UL], w_src_1[LL+UL] // parallel add of src and dst (first half)
432 :			}
433 :			{.mii
434 :			(s1_p[0]) ld8 w_dst8[0] = [dst], stride // load row j of dst, then dst += stride
435 :			(s2_p[0]) unpack1.l w_dst16_1[0] = r0, w_dst8[LL]; // dst is widened to 16 bit for i = 0..3
436 :			(s2_p[0]) unpack1.h w_dst16_2[0] = r0, w_dst8[LL]; // dst is widened to 16 bit for i = 4..7
437 :			}
438 :			{.mii
439 :			(s1_p[0]) ld8 w_src_2[0] = [_src], 16 // load the second half of row j of src (i = 4..7), then _src += 16
440 :			(s3_p[0]) padd2.sss pixel_2[0] = w_dst16_2[UL], w_src_2[LL+UL] // parallel add of src and dst (second half)
441 :			(s4_p[0]) pack2.uss dst8[0] = pixel_1[PAL], pixel_2[PAL] // converts the sums (pixel) to 8-bit values; the range check is done by the pack instruction itself
442 :			}
443 :			{.mmb
444 :			(s5_p[0]) st8 [_dst[LL+UL+PAL+PL]] = dst8[PL] // store the result row at the dst address remembered in stage s1_p
445 :			(s1_p[0]) nop.m 0
446 :			br.ctop.sptk.few .Loop_16to8add
447 :			;;
448 :			}
449 :
450 :			// * Restore old LC and PRs *
451 :			mov ar.lc = oldLC
452 :			mov pr = oldPR, -1 // mask -1: restore all predicate registers from oldPR
453 :
454 :			br.ret.sptk.many b0
455 :			.endp transfer_16to8add_ia64#
456 :
457 :
458 :
459 :	ia64p	305	///////////////////////////////////////////////////////////////////////////////
460 :			//
461 :			// transfer_8to16sub_ia64
462 :			// In: r32 = dct, r33 = cur, r34 = ref, r35 = stride.
463 :			// For each of the 8 rows the 8-bit values of ref and cur are loaded and widened
464 :			// to 16 bit. The difference cur - ref is stored at the dct addresses and the
465 :			// (realigned) ref row is then written over the cur row.
466 :			//
467 :			// The data addressed by 'ref' may be misaligned in memory. cur and dct are
468 :			// assumed to be aligned (this is not checked here).
469 :			//
470 :			///////////////////////////////////////////////////////////////////////////////
471 :	ia64p	256
472 :			.align 16
473 :	ia64p	205	.global transfer_8to16sub_ia64#
474 :			.proc transfer_8to16sub_ia64#
475 :	ia64p	256
476 :
477 :	ia64p	205	transfer_8to16sub_ia64:
478 :			.prologue
479 :	ia64p	256
480 :			// * register renaming *
481 :			oldLC = r2 // keeps the caller's ar.lc until the end of the procedure
482 :			oldPR = r3 // keeps the caller's predicate registers until the end of the procedure
483 :
484 :			zero = r0 // "zero" stands for the constant 0
485 :
486 :			// The following registers are named after the variables of the C reference code
487 :			dct = r14
488 :			cur = r15
489 :			ref = r34 // not copied: the parameter register is only read before the loop starts
490 :			stride = r16
491 :
492 :			offset = r17 // bit offset of the misaligned ref data, used as shift right count
493 :			aoffset = r18 // counterpart of offset (64 - offset), used as shift left count
494 :			ref_a1 = r19 // address of the first aligned 64-bit block of ref
495 :			ref_a2 = r20 // address of the second aligned 64-bit block of ref
496 :
497 :			_dct = r21 // destination address of the second dct block of a row (dct + 8)
498 :
499 :			// * Saving old Loop-Counter (LC) and Predicate Registers (PR) *
500 :			.save ar.lc, r2
501 :			mov oldLC = ar.lc
502 :			mov oldPR = pr
503 :
504 :	ia64p	305
505 :			.body
506 :
507 :	ia64p	256	// * Allocating new register frame: 4 inputs + 92 locals, all 96 registers rotating *
508 :			alloc r9 = ar.pfs, 4, 92, 0, 96 // r9 receives the old ar.pfs; it is not read again in this procedure
509 :
510 :			// * Saving Parameters (r32+ are reused as rotating registers inside the loop) *
511 :			mov dct = r32
512 :			mov cur = r33
513 :			// mov ref = r34: ref is unaligned, get aligned ref below...
514 :			mov stride = r35
515 :
516 :			and ref_a1 = -8, ref // address of the first aligned 64-bit block containing ref (ref rounded down to a multiple of 8)
517 :			dep offset = ref, zero, 3, 3 // offset = (ref & 7) << 3, i.e. the misalignment of ref in bits
518 :	ia64p	205	;;
519 :	ia64p	256	add ref_a2 = 8, ref_a1
520 :			sub aoffset = 64, offset // counterpart of offset
521 :			add _dct = 8, dct // address of the second dct block: 8 bytes (= 64 bit) above the first block
522 :
523 :			// * init loop: set loop counter, epilog counter, predicates *
524 :			mov ar.lc = 7 // 8 rows
525 :			mov ar.ec = LL + SHL + OL + UL + PSL + 1 // one count per pipeline stage so the last rows drain through the pipeline
526 :			mov pr.rot = 1 << 16 // only bit 16 is set: the first stage predicate starts enabled, the others cleared
527 :	ia64p	205	;;
528 :	ia64p	256
529 :			// * define register arrays and predicate array for software pipeline *
530 :			.rotr c[LL+1], ref_v1[LL+1], ref_v2[LL+1], c16_1[SHL+OL+UL+1], c16_2[SHL+OL+UL+1], ref_shdr[SHL+1], ref_shdl[SHL+1], r[OL+1], r16_1[UL+1], r16_2[UL+1], dct_1[PSL+1], dct_2[PSL+1], _cur[LL+SHL+OL+UL+1]
531 :			.rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1]
532 :
533 :
534 :	ia64p	305	// Software pipelined loop:
535 :			// s1_p: The values of ref and cur are loaded, the address of the cur row is saved.
536 :			// s2_p: cur is converted to 16-bit and the misaligned values of ref are
537 :			// shifted...
538 :			// s3_p: ... and ORed together.
539 :			// s4_p: This ref-value is converted to 16-bit and also stored at the saved
540 :			// cur-address (the cur row is overwritten with the ref row).
541 :			// s5_p: the ref-values are subtracted from the cur-values...
542 :			// s6_p: ...and the result is stored at the dct-addresses.
543 :
544 :	ia64p	256
545 :			.Loop_8to16sub:
546 :			{.mii
547 :			(s1_p[0]) ld8 ref_v1[0] = [ref_a1], stride // load the first 64-bit block holding part of the ref row, then ref_a1 += stride
548 :			(s1_p[0]) mov _cur[0] = cur // remember the address of this cur row for the store in stage s4_p
549 :			(s2_p[0]) shr.u ref_shdr[0] = ref_v1[LL], offset // shift the part from the first block into place
550 :			}
551 :			{.mii
552 :			(s1_p[0]) ld8 ref_v2[0] = [ref_a2], stride // load the second 64-bit block, then ref_a2 += stride
553 :			(s2_p[0]) shl ref_shdl[0] = ref_v2[LL], aoffset // shift the part from the second block into place
554 :			(s3_p[0]) or r[0] = ref_shdr[SHL], ref_shdl[SHL] // join the shifted parts into the realigned ref row r
555 :			}
556 :			{.mii
557 :			(s1_p[0]) ld8 c[0] = [cur], stride // load the complete row j of cur, then cur += stride
558 :			(s2_p[0]) unpack1.l c16_1[0] = zero, c[LL]; // c is widened to 16 bit for i = 0..3
559 :			(s2_p[0]) unpack1.h c16_2[0] = zero, c[LL]; // c is widened to 16 bit for i = 4..7
560 :			}
561 :			{.mii
562 :			(s4_p[0]) st8 [_cur[LL+SHL+OL]] = r[OL] // the cur row is overwritten with the value of r
563 :			// widen the 8-bit values of r to 16 bit
564 :			(s4_p[0]) unpack1.l r16_1[0] = zero, r[OL]; // r is widened to 16 bit for i = 0..3
565 :			(s4_p[0]) unpack1.h r16_2[0] = zero, r[OL]; // r is widened to 16 bit for i = 4..7
566 :			}
567 :			{.mii
568 :			(s5_p[0]) psub2.sss dct_1[0] = c16_1[SHL+OL+UL], r16_1[UL] // c - r for the first half of row j
569 :			(s5_p[0]) psub2.sss dct_2[0] = c16_2[SHL+OL+UL], r16_2[UL] // c - r for the second half
570 :			}
571 :			{.mmb
572 :			(s6_p[0]) st8 [dct] = dct_1[PSL], 16 // store the first 64-bit block, then dct += 16 for the next row
573 :			(s6_p[0]) st8 [_dct] = dct_2[PSL], 16 // store the second 64-bit block, then _dct += 16 for the next row
574 :			br.ctop.sptk.few .Loop_8to16sub // next pipeline step
575 :			;;
576 :			}
577 :
578 :			// * Restore old LC and PRs *
579 :			mov ar.lc = oldLC
580 :			mov pr = oldPR, -1 // mask -1: restore all predicate registers from oldPR
581 :
582 :	ia64p	205	br.ret.sptk.many b0
583 :			.endp transfer_8to16sub_ia64#
584 :	ia64p	256
585 :
586 :
587 :
588 :
589 :	ia64p	305	///////////////////////////////////////////////////////////////////////////////
590 :			//
591 :			// transfer_8to16sub2_ia64
592 :			// In: r32 = dct, r33 = cur, r34 = ref1, r35 = ref2, r36 = stride.
593 :			// At the time this function was written, it was not yet in use.
594 :			// We assume that the values of ref1/2 are misaligned.
595 :			//
596 :			// The values of ref1/2 and cur are loaded, the ref-values need misalignment-
597 :			// treatment. The average of ref1 and ref2 is computed with pavg on the 8-bit
598 :			// values; the average and cur are then converted to 16-bit using unpack and
599 :			// the average is subtracted from cur. The results are stored at the dct-addresses.
600 :			// pavg1.raz is used to get the same results as the C-code-function.
601 :			//
602 :			///////////////////////////////////////////////////////////////////////////////
603 :	ia64p	256
604 :			.text
605 :	ia64p	205	.align 16
606 :			.global transfer_8to16sub2_ia64#
607 :			.proc transfer_8to16sub2_ia64#
608 :	ia64p	256
609 :	ia64p	205	transfer_8to16sub2_ia64:
610 :			.prologue
611 :	ia64p	256
612 :			// * register renaming *
613 :			// We've tried to keep the C-Code names as often as possible, at least as
614 :			// part of register-names
615 :			oldLC = r2 // keeps the caller's ar.lc until the end of the procedure
616 :			oldPR = r3 // keeps the caller's predicate registers until the end of the procedure
617 :
618 :			zero = r0 // constant-zero operand for dep and unpack
619 :
620 :			dct_al = r14 // dct: address of the "left" block of one line; initialised to dct + 8 below
621 :			dct_ar = r15 // dct: address of the "right" block of one line; initialised to dct below
622 :			cur = r16
623 :			ref1_al = r17 // ref1: aligned address of lower part
624 :			ref1_ah = r18 // ref1: aligned address of higher part
625 :			ref2_al = r19 // ref2: aligned address of lower part
626 :			ref2_ah = r20 // ref2: aligned address of higher part
627 :			stride = r21 // byte step per row for cur, ref1 and ref2
628 :
629 :			offset_1 = r22 // misalignment of ref1 in bits (shift right count)
630 :			offset_2 = r23 // misalignment of ref2 in bits (shift right count)
631 :			aoffset_1 = r24 // 64 - offset_1 (shift left count)
632 :			aoffset_2 = r25 // 64 - offset_2 (shift left count)
633 :
634 :			// * Saving old Loop-Counter (LC) and Predicate Registers (PR) *
635 :	ia64p	205	.save ar.lc, r2
636 :	ia64p	256	mov oldLC = ar.lc
637 :			mov oldPR = pr
638 :
639 :	ia64p	305
640 :			.body
641 :
642 :	ia64p	256	// * Saving Parameters *
643 :			// * (as input registers r32 + are needed for register-rotation) *
644 :			mov dct_ar = r32
645 :			add dct_al = 8, r32
646 :			mov cur = r33
647 :
648 :			and ref1_al = -8, r34 // ref1 aligned address of lower part
649 :			and ref2_al = -8, r35 // ref2 aligned address of lower part
650 :
651 :			mov stride = r36
652 :
653 :			// * Calculations for Misalignment-Handling *
654 :			dep offset_1 = r34, zero, 3, 3 // offset_1 = (ref1 & 7) << 3
655 :			dep offset_2 = r35, zero, 3, 3 // offset_2 = (ref2 & 7) << 3
656 :	ia64p	205	;;
657 :	ia64p	256	add ref1_ah = 8, ref1_al
658 :			add ref2_ah = 8, ref2_al
659 :			sub aoffset_1 = 64, offset_1
660 :			sub aoffset_2 = 64, offset_2
661 :	ia64p	205	;;
662 :	ia64p	256
663 :			// * Allocating new register frame: 5 inputs + 91 locals, all 96 registers rotating *
664 :			alloc r9 = ar.pfs, 5, 91, 0, 96 // r9 receives the old ar.pfs; it is not read again in this procedure
665 :
666 :			// * init loop: set loop counter, epilog counter, predicates *
667 :			mov ar.lc = 7 // 8 rows
668 :			mov ar.ec = LL + SHL + OL + PAVGL + UL +PSL + 1 // one count per pipeline stage so the last rows drain through the pipeline
669 :			mov pr.rot = 1 << 16 // only bit 16 is set: the first stage predicate starts enabled, the others cleared
670 :	ia64p	205	;;
671 :	ia64p	256
672 :			// * define register arrays and predicate array for software pipeline *
673 :			.rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1]
674 :			.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1]
675 :	ia64p	305
676 :	ia64p	256
677 :	ia64p	305	// software pipelined loop:
678 :			// ld_stage: The values of ref1, ref2, cur are loaded
679 :			// sh_stage: The misaligned values of ref1/2 are shifted...
680 :			// or_stage: ...and ORed together.
681 :			// pavg_stage: The average of ref1 and ref2 is computed.
682 :			// up_stage: The result and the cur-values are converted to 16-bit.
683 :			// psub_stage: Those values are subtracted...
684 :			// st_stage: ...and stored at the dct-addresses.
685 :
686 :	ia64p	256
687 :			.Loop_8to16sub2:
688 :			{.mii
689 :			(ld_stage[0]) ld8 c[0] = [cur], stride // load one row of cur, then cur += stride
690 :			(sh_stage[0]) shr.u ref1_l[0] = ref1_vl[LL], offset_1 // lower ref1 word shifted right into place
691 :			(sh_stage[0]) shl ref1_h[0] = ref1_vh[LL], aoffset_1 // higher ref1 word shifted left into place
692 :			}
693 :			{.mii
694 :			(ld_stage[0]) ld8 ref1_vl[0] = [ref1_al], stride // load lower aligned word of ref1, then advance by stride
695 :			(sh_stage[0]) shr.u ref2_l[0] = ref2_vl[LL], offset_2 // lower ref2 word shifted right into place
696 :			(sh_stage[0]) shl ref2_h[0] = ref2_vh[LL], aoffset_2 // higher ref2 word shifted left into place
697 :			}
698 :			{.mii
699 :			(ld_stage[0]) ld8 ref1_vh[0] = [ref1_ah], stride // load higher aligned word of ref1, then advance by stride
700 :			(or_stage[0]) or ref1_aligned[0] = ref1_h[SHL], ref1_l[SHL] // realigned ref1 row
701 :			(or_stage[0]) or ref2_aligned[0] = ref2_h[SHL], ref2_l[SHL] // realigned ref2 row
702 :			}
703 :			{.mii
704 :			(ld_stage[0]) ld8 ref2_vl[0] = [ref2_al], stride // load lower aligned word of ref2, then advance by stride
705 :			(pavg_stage[0]) pavg1.raz r[0] = ref1_aligned[OL], ref2_aligned[OL] // bytewise average of the two realigned rows
706 :			(up_stage[0]) unpack1.l r16_r[0] = zero, r[PAVGL] // widen one half of the average to 16 bit
707 :			}
708 :			{.mii
709 :			(ld_stage[0]) ld8 ref2_vh[0] = [ref2_ah], stride // load higher aligned word of ref2, then advance by stride
710 :			(up_stage[0]) unpack1.h r16_l[0] = zero, r[PAVGL] // widen the other half of the average to 16 bit
711 :			(up_stage[0]) unpack1.l c16_r[0] = zero, c[LL+SHL+OL+PAVGL] // widen one half of the cur row loaded in ld_stage
712 :			}
713 :			{.mii
714 :			(st_stage[0]) st8 [dct_ar] = dct16_r[PSL], 16 // store the "right" result block, then dct_ar += 16
715 :			(up_stage[0]) unpack1.h c16_l[0] = zero, c[LL+SHL+OL+PAVGL] // widen the other half of the cur row
716 :			(psub_stage[0]) psub2.sss dct16_l[0] = c16_l[UL], r16_l[UL] // cur - average ("left" block)
717 :			}
718 :			{.mib
719 :			(st_stage[0]) st8 [dct_al] = dct16_l[PSL], 16 // store the "left" result block, then dct_al += 16
720 :			(psub_stage[0]) psub2.sss dct16_r[0] = c16_r[UL], r16_r[UL] // cur - average ("right" block)
721 :			br.ctop.sptk.few .Loop_8to16sub2 // next pipeline step
722 :			;;
723 :			}
724 :
725 :			// * Restore old LC and PRs *
726 :			mov ar.lc = oldLC
727 :			mov pr = oldPR, -1 // mask -1: restore all predicate registers from oldPR
728 :
729 :	ia64p	205	br.ret.sptk.many b0
730 :			.endp transfer_8to16sub2_ia64#

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4