[svn] / trunk / xvidcore / src / motion / ia64_asm / sad_ia64.s Repository: Repository Listing svn

Thu Jun 20 14:25:44 2002 UTC (20 years, 11 months ago) by ia64p
File size: 12017 byte(s)
```dev16 is optimized, now.
```
```.text
// 16-iteration software-pipelined loop over two byte buffers (IA-64, GNU as syntax).
// Inputs as used below: r32 and r33 are the two source addresses (the load
// comments call them Cur and Ref), r34 is the post-increment applied to every
// load address (the row stride), r35 is copied to r17 but not read again in the
// visible code.
// NOTE(review): no entry label / .proc / .global is visible for this routine,
// and the stage predicates (_aldp, _shp, _orp), the address registers
// r18/r19/r26/r27 and any psad/add stage are not defined in the visible text --
// presumably lines were lost; confirm against the original file.
.align 16

// Pipeline stage distances. _LL is the distance between a ld8 and the shift
// that consumes it (_ald*[_LL]); _SL is the distance between a shift and the
// `or` that consumes it (_shru1[_SL]). _OL, _PL and _AL only appear in the
// .rotr sizes and the ar.ec sum in the visible code.
_LL=3
_SL=1
_OL=1
_PL=1
_AL=1

// Register frame: 4 inputs, 44 locals, 0 outputs, 48 rotating registers.
alloc r9=ar.pfs,4,44,0,48

mov r8 = r0 // r8 = 0 (presumably the result accumulator -- TODO confirm)

// Save the loop count and predicate registers; both are restored before returning.
mov r20 = ar.lc
mov r21 = pr

dep.z r22		= r32, 3, 3 // multiply the low 3 bits (byte misalignment) by 8
dep.z r23		= r33, 3, 3 // r22 and r23 then hold the shift amounts in bits

and r14		= -8, r32 // copy the parameters into lower registers
and r15		= -8, r33 // AND Ref/Cur with 111...1000, i.e. round down to 8 bytes
mov r16		= r34
mov r17		= r35
;;

sub r24		= 64, r22 // compute the complementary shift amounts
sub r25		= 64, r23

// Initialise the loop counters
mov ar.lc = 15			// run the loop 16 times
mov ar.ec = _LL + _SL + _OL + _PL + _AL + _AL			// epilogue stage count; evaluates to 8 with the values above

// Reset the rotating predicate registers and set p16 to 1
mov pr.rot = 1 << 16
;;

// Declare the rotating register arrays.
// Several arrays have a single element (_shru2, _shl2, _shru3, _shl3, _shru4,
// _or1.._or3). Because the names are allocated in list order, after one
// rotation a value written to such an array shows up under the next array's
// name; the `or` instructions below rely on this.
.rotr _ald1[_LL+1], _ald2[_LL+1], _ald3[_LL+1], _ald4[_LL+1], _ald5[_LL+1], _ald6[_LL+1], _shru1[_SL+1], _shl1[_SL+1], _shru2[_SL], _shl2[_SL], _shru3[_SL], _shl3[_SL], _shru4[_SL], _shl4[_SL+1], _or1[_OL], _or2[_OL], _or3[_OL], _or4[_OL+1], _psadr1[_PL+1], _psadr2[_PL+1], _addr1[_AL+1]

// Loop body: six aligned 8-byte loads per iteration (three for Cur, three for
// Ref), each advancing its address by r16; shr.u/shl pairs followed by `or`
// combine neighbouring words using the misalignment shifts computed above.
.L_loop_16:
{.mmi
(_aldp[0]) ld8 _ald1[0] = [r14], r16	// Cur, first 8 bytes
(_aldp[0]) ld8 _ald2[0] = [r18], r16    // Cur, second 8 bytes
}
{.mmi
(_aldp[0]) ld8 _ald3[0] = [r26], r16    // Cur, third 8 bytes
(_aldp[0]) ld8 _ald4[0] = [r15], r16	// Ref, first 8 bytes
}
{.mmi
(_aldp[0]) ld8 _ald5[0] = [r19], r16    // Ref, second 8 bytes
(_aldp[0]) ld8 _ald6[0] = [r27], r16    // Ref, third 8 bytes
(_shp[0]) shr.u _shru1[0] = _ald1[_LL], r22
}
{.mii
(_orp[0]) or _or1[0]     = _shl2[0], _shru3[0] // i.e. _shru2 + 1 and _shl2 + 1 (previous stage's values)
(_shp[0]) shl _shl1[0]   = _ald2[_LL], r24
(_shp[0]) shr.u _shru2[0] = _ald2[_LL], r22
}
{.mii
(_orp[0]) or _or2[0]  = _shl3[0], _shru4[0]  // i.e. _shru3 + 1 and _shl3 + 1
(_shp[0]) shl _shl2[0] = _ald3[_LL], r24
(_shp[0]) shr.u _shru3[0] = _ald4[_LL], r23
}
{.mii
(_orp[0]) or _or3[0]  = _shl4[0], _shl4[_SL] // i.e. _shru4 + 1 and _shl4 + 1
(_shp[0]) shl _shl3[0] = _ald5[_LL], r25
(_shp[0]) shr.u _shru4[0] = _ald5[_LL], r23
}
{.mmi
(_orp[0]) or _or4[0]  = _shru1[_SL], _shl1[_SL]
(_shp[0]) shl _shl4[0]= _ald6[_LL], r25
}
{.mmb
// Counted, rotating loop branch: decrements ar.lc, then ar.ec during the epilogue.
br.ctop.sptk.few .L_loop_16
;;
}
// Restore the saved registers
mov ar.lc = r20
mov pr = r21,-1
// NOTE(review): r8 is still 0 here as far as the visible code shows.
br.ret.sptk.many rp

// Software-pipelined loop over two byte buffers, two 8-byte loads per buffer
// per iteration (IA-64, GNU as syntax).
// Inputs as used below: r32 and r33 are the two source addresses (Cur and Ref
// in the load comments), r34 is the post-increment applied to every load
// address (the row stride).
// NOTE(review): no entry label / .proc / .global is visible, and the stage
// predicates (aldp, shp, orp), the address registers r18/r19 and any psad/add
// stage are not defined in the visible text -- presumably lines were lost;
// confirm against the original file.
.align 16

// Pipeline stage distances. LL is the distance between a ld8 and the shift
// that consumes it (ald*[LL]); SL is the distance between a shift and the `or`
// that consumes it (shru*[SL]). OL, PL and AL only appear in the .rotr sizes
// and the ar.ec sum in the visible code.
LL=3
SL=1
OL=1
PL=1
AL=1

// Register frame: 3 inputs, 29 locals, 0 outputs, 32 rotating registers.
alloc r9=ar.pfs,3,29,0,32
// Save the loop count and predicate registers; both are restored before returning.
mov r20 = ar.lc
mov r21 = pr

dep.z r22		= r32, 3, 3 // multiply the low 3 bits (byte misalignment) by 8
dep.z r23		= r33, 3, 3 // r22 and r23 then hold the shift amounts in bits

mov r8 = r0		     // r8 = 0 (presumably the result accumulator -- TODO confirm)
and r14		= -8, r32 // 0xFFFFFFFFFFFFFFF8, r32
and r15		= -8, r33 // 0xFFFFFFFFFFFFFFF8, r33
mov r16		= r34
//	mov r17		= r35
;;

// Complementary shift amounts (64 - shift) for the left shifts.
sub r24		= 64, r22
sub r25		= 64, r23

// Initialise the loop counters
mov ar.lc = 7			// ar.lc = 7, i.e. 8 counted iterations
mov ar.ec = LL + SL + OL + PL + AL			// epilogue stage count; evaluates to 7 with the values above

// Reset the rotating predicate registers and set p16 to 1
mov pr.rot = 1 << 16
;;
// Declare the rotating register arrays used by the pipeline stages.
.rotr ald1[LL+1], ald2[LL+1], ald3[LL+1], ald4[LL+1], shru1[SL+1], shl1[SL+1], shru2[SL+1], shl2[SL+1], or1[OL+1], or2[OL+1], psadr[PL+1], addr[AL+1]
.L_loop_8:
{.mmi
(aldp[0]) ld8 ald1[0] = [r14], r16	// load Cur
(aldp[0]) ld8 ald2[0] = [r18], r16
(shp[0]) shr.u shru1[0] = ald1[LL], r22	// merge
}
{.mii
(orp[0]) or or1[0] = shru1[SL], shl1[SL] // combine the two shifted Cur words
(shp[0]) shl shl1[0] = ald2[LL], r24
(shp[0]) shr.u shru2[0] = ald3[LL], r23	// merge
}
{.mmi
(aldp[0]) ld8 ald3[0] = [r15], r16	// load Ref
(aldp[0]) ld8 ald4[0] = [r19], r16
(shp[0]) shl shl2[0]  = ald4[LL], r25
}
{.mmi
(orp[0]) or or2[0] = shru2[SL], shl2[SL] // combine the two shifted Ref words
}
{.mbb
// Counted, rotating loop branch: decrements ar.lc, then ar.ec during the epilogue.
br.ctop.sptk.few .L_loop_8
;;
}

// Restore the saved registers and return.
// NOTE(review): r8 is still 0 here as far as the visible code shows.
mov ar.lc = r20
mov pr = r21,-1
br.ret.sptk.many b0

// Byte-wise loop that accumulates absolute differences into r8 (IA-64).
// Inputs as used below: r32, r33 and r34 are byte pointers, r35 is
// zero-extended from 32 bits but not read again in the visible code.
// NOTE(review): no entry label / .proc is visible, and several registers are
// read without being written in the visible text (r14 as the initial ar.lc,
// r22 as the upper clamp bound); r23 is tested but never incremented and no
// pointer is advanced between outer passes -- presumably lines were lost;
// confirm against the original file.
.align 16
.prologue
.save ar.lc, r2
mov r2 = ar.lc // preserve the caller's loop count; restored before returning
.body
zxt4 r35 = r35 // zero-extend the low 32 bits of the fourth argument
mov r8 = r0 // r8 = 0: running sum
mov r23 = r0 // r23 = 0: outer-loop counter tested against 15 below
// Outer loop: reload the working pointers from the arguments.
.L21:
mov r19 = r32
mov r21 = r34
mov r20 = r33
;;
mov ar.lc = r14 // inner loop count (r14 is not set in the visible code)
;;
// Inner loop: two clamp-and-accumulate steps per iteration.
.L105:
mov r17 = r20
mov r18 = r21
;;
ld1 r14 = [r17], 1 // load one byte, then advance r17 by 1
ld1 r15 = [r18], 1 // load one byte, then advance r18 by 1 (r15 is overwritten before use)
;;
;;
;;
shr.u r16 = r14, 1 // r16 = r14 / 2
;;
// Clamp r16: negative (as a signed 32-bit value) -> 0; greater than r22 -> 255.
cmp4.le p6, p7 = r0, r16
;;
(p7) mov r16 = r0
(p7) br.cond.dpnt .L96
;;
cmp4.ge p6, p7 = r22, r16
;;
(p7) addl r16 = 255, r0
.L96:
ld1 r14 = [r19] // byte to compare against the clamped value
;;
sub r15 = r14, r16
;;
// r8 += |r14 - r16|: p7 takes the positive difference, p6 the negated one.
cmp4.ge p6, p7 = 0, r15
;;
(p6) sub r14 = r16, r14
(p7) add r8 = r8, r15
;;
(p6) add r8 = r8, r14
// Second step: same computation on the bytes at the already-advanced r17/r18.
ld1 r15 = [r18]
ld1 r14 = [r17]
;;
;;
;;
shr.u r16 = r14, 1 // r16 = r14 / 2
;;
// Clamp r16 exactly as in the first step.
cmp4.le p6, p7 = r0, r16
;;
(p7) mov r16 = r0
(p7) br.cond.dpnt .L102
;;
cmp4.ge p6, p7 = r22, r16
;;
(p7) addl r16 = 255, r0
.L102:
ld1 r14 = [r17] // NOTE(review): the first step loads from [r19] here; this one reloads [r17] -- confirm
;;
sub r15 = r14, r16
;;
// r8 += |r14 - r16|
cmp4.ge p6, p7 = 0, r15
;;
(p7) add r8 = r8, r15
(p6) sub r14 = r16, r14
;;
(p6) add r8 = r8, r14
br.cloop.sptk.few .L105 // repeat while ar.lc != 0 (decrementing it)
;;
// Repeat the outer loop while r23 <= 15 (unsigned 32-bit compare).
cmp4.geu p6, p7 = 15, r23
(p6) br.cond.dptk .L21
mov ar.lc = r2 // restore the caller's loop count
br.ret.sptk.many b0 // return with the sum in r8

.text
.align 16
.global dev16_ia64#
.proc dev16_ia64#
.auto
dev16_ia64:
stride = r18
pfs = r19			//for saving previous function state
cura0 = r20			//address of first 8-byte block of cur
cura1 = r21			//address of second 8-byte block of cur
mean0 = r22			//registers for calculating the sum in parallel
mean1 = r23
mean2 = r24
mean3 = r25
dev0 = r26			//same for the deviation
dev1 = r27
dev2 = r28
dev3 = r29

.body
alloc pfs = ar.pfs, 2, 38, 0, 40

mov cura0  = in0
mov stride = in1

.rotr c[32], psad[8] 		// just using rotating registers to get an array ;-)

.explicit
{.mmi
ld8 c[0] = [cura0], stride	// load them ...
ld8 c[1] = [cura1], stride
;;
}
{.mmi
ld8 c[2] = [cura0], stride
ld8 c[3] = [cura1], stride
;;
}
{.mmi
ld8 c[4] = [cura0], stride
ld8 c[5] = [cura1], stride
;;
}
{.mmi
ld8 c[6] = [cura0], stride
ld8 c[7] = [cura1], stride
;;
}
{.mmi
ld8 c[8] = [cura0], stride
ld8 c[9] = [cura1], stride
;;
}
{.mmi
ld8 c[10] = [cura0], stride
ld8 c[11] = [cura1], stride
;;
}
{.mii
ld8 c[12] = [cura0], stride
psad1 mean0 = c[0], r0		// get the sum of them ...
}
{.mmi
ld8 c[13] = [cura1], stride
;;
ld8 c[14] = [cura0], stride
}
{.mii
ld8 c[15] = [cura1], stride
;;
}
{.mmi
ld8 c[16] = [cura0], stride
ld8 c[17] = [cura1], stride
;;
}
{.mii
ld8 c[18] = [cura0], stride
}
{.mmi
ld8 c[19] = [cura1], stride
;;
ld8 c[20] = [cura0], stride
}
{.mii
ld8 c[21] = [cura1], stride
;;
}
{.mmi
ld8 c[22] = [cura0], stride
ld8 c[23] = [cura1], stride
;;
}
{.mii
ld8 c[24] = [cura0], stride
}
{.mmi
ld8 c[25] = [cura1], stride
;;
ld8 c[26] = [cura0], stride
}
{.mii
ld8 c[27] = [cura1], stride
;;
}
{.mmi
ld8 c[28] = [cura0], stride
ld8 c[29] = [cura1], stride
;;
}
{.mii
ld8 c[30] = [cura0]
}
{.mmi
ld8 c[31] = [cura1]
;;
}
{.mii
;;
}
{.mmi
;;
}
{.mii
}
{.mmi
;;
}
{.mii
;;
}
{.mmi
;;
}
{.mii
}
{.mmi
;;
}
{.mii
;;
}
{.mmi
;;
}
{.mii
}
{.mmi
;;
}
{.mbb
nop.b 1
nop.b 1
;;
}
{.mib
nop.b 1
;;
}
{.mib
// add mean2 = 127, mean2	// this could make our division more exact, but does not help much
;;
}
{.mib
;;
}

{.mib
shr.u mean0 = mean0, 8		// divide them ...
;;
}
{.mib
mux1 mean0 = mean0, @brcst
;;
}
{.mii
nop.m 0
psad1 dev0 = c[0], mean0	// and do a sad again ...
}
{.mii
nop.m 0
}
{.mii
nop.m 0
}
{.mii
nop.m 0
}
{.mii
nop.m 0
;;
}
{.mii
}
{.mmi

}
{.mii
;;
}
{.mmi
}
{.mii
}
{.mmi
;;
}
{.mii

}
{.mmi
;;
}
{.mii
}
{.mmi

}
{.mii
;;
}
{.mmi
}
{.mii
}
{.mmi
;;
}
{.mii
;;
}
{.mmi
;;
}
{.mii
;;
}
{.mib