25 |
and r14 = -8,r33 // align src |
and r14 = -8,r33 // align src |
26 |
mov r15 = r32 // get dest |
mov r15 = r32 // get dest |
27 |
mov r16 = r34 // stride |
mov r16 = r34 // stride |
28 |
sub r17 = 1,r35 // 1-rounding |
// sub r17 = 0,r0 // 1-rounding |
29 |
|
|
30 |
;; |
;; |
31 |
|
|
32 |
add r18 = 8,r14 |
add r18 = 8,r14 |
33 |
mux1 r17 = r17, @brcst // broadcast 1-rounding |
// mux1 r17 = r17, @brcst // broadcast 1-rounding |
34 |
|
|
35 |
sub r24 = 64,r22 // lshift of src |
sub r24 = 64,r22 // lshift of src |
36 |
add r26 = 8,r22 // rshift of src+1 |
add r26 = 8,r22 // rshift of src+1 |
37 |
sub r27 = 56,r22 // lshift of src+1 |
sub r27 = 56,r22 // lshift of src+1 |
38 |
|
|
39 |
mov ar.lc = 7 // loopcounter |
mov ar.lc = 7 // loopcounter |
40 |
mov ar.ec = LL + SL +OL + AVL + AL + STL // sum of latencies |
mov ar.ec = LL + SL +OL + AVL + STL // sum of latencies |
41 |
mov pr.rot = 1 << 16 // init pr regs for sw-pipelining |
mov pr.rot = 1 << 16 // init pr regs for sw-pipelining |
42 |
|
|
43 |
;; |
;; |
44 |
.rotr ald1[LL+1],ald2[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],add1[AL+1],avg[AVL+1] |
.rotr ald1[LL+1],ald2[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1] |
45 |
.rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL], pavg1p[AVL],stp[STL] |
.rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL] |
46 |
|
|
47 |
|
|
48 |
.Lloop_interpolate: |
.Lloop_interpolate: |
57 |
(or1p[0]) or or1[0] = shru1[SL],shl2[SL] // merge things |
(or1p[0]) or or1[0] = shru1[SL],shl2[SL] // merge things |
58 |
(or1p[0]) or or2[0] = shru2[SL],shl1[SL] |
(or1p[0]) or or2[0] = shru2[SL],shl1[SL] |
59 |
|
|
60 |
(addp[0]) padd1.uus add1[0] = or1[OL],r17 // add 1-rounding |
// (addp[0]) padd1.uus add1[0] = or1[OL],r17 // add 1-rounding |
61 |
|
|
62 |
(pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL] // parallel average |
(pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] // parallel average |
63 |
|
|
64 |
(stp[0]) st8 [r15] = avg[AVL] // store results |
(stp[0]) st8 [r15] = avg[AVL] // store results |
65 |
(stp[0]) add r15 = r15,r16 |
(stp[0]) add r15 = r15,r16 |
97 |
and r14 = -8,r33 |
and r14 = -8,r33 |
98 |
mov r15 = r32 |
mov r15 = r32 |
99 |
mov r16 = r34 |
mov r16 = r34 |
100 |
sub r17 = 1,r35 |
// sub r17 = 0,r0 |
101 |
;; |
;; |
102 |
|
|
103 |
add r18 = 8,r14 |
add r18 = 8,r14 |
104 |
add r19 = r14,r16 // src + stride |
add r19 = r14,r16 // src + stride |
105 |
mux1 r17 = r17, @brcst |
// mux1 r17 = r17, @brcst |
106 |
|
|
107 |
sub r24 = 64,r22 |
sub r24 = 64,r22 |
108 |
;; |
;; |
109 |
add r26 = 8,r19 // src + stride + 8 |
add r26 = 8,r19 // src + stride + 8 |
110 |
|
|
111 |
mov ar.lc = 7 |
mov ar.lc = 7 |
112 |
mov ar.ec = LL + SL +OL + AVL + AL + STL |
mov ar.ec = LL + SL +OL + AVL + STL |
113 |
mov pr.rot = 1 << 16 |
mov pr.rot = 1 << 16 |
114 |
|
|
115 |
;; |
;; |
116 |
.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],add1[AL+1],avg[AVL+1] |
.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1] |
117 |
.rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL], pavg1p[AVL],stp[STL] |
.rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL] |
118 |
|
|
119 |
|
|
120 |
.Lloop_interpolate2: |
.Lloop_interpolate2: |
131 |
(or1p[0]) or or1[0] = shru1[SL],shl1[SL] |
(or1p[0]) or or1[0] = shru1[SL],shl1[SL] |
132 |
(or1p[0]) or or2[0] = shru2[SL],shl2[SL] |
(or1p[0]) or or2[0] = shru2[SL],shl2[SL] |
133 |
|
|
134 |
(addp[0]) padd1.uus add1[0] = or1[OL],r17 |
// (addp[0]) padd1.uus add1[0] = or1[OL],r17 |
135 |
|
|
136 |
(pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL] |
(pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] |
137 |
|
|
138 |
(stp[0]) st8 [r15] = avg[AVL] |
(stp[0]) st8 [r15] = avg[AVL] |
139 |
(stp[0]) add r15 = r15,r16 |
(stp[0]) add r15 = r15,r16 |
171 |
and r14 = -8,r33 |
and r14 = -8,r33 |
172 |
mov r15 = r32 |
mov r15 = r32 |
173 |
mov r16 = r34 |
mov r16 = r34 |
174 |
sub r17 = 1,r35 |
// sub r17 = 0,r0 |
175 |
;; |
;; |
176 |
|
|
177 |
add r18 = 8,r14 |
add r18 = 8,r14 |
178 |
add r19 = r14,r16 |
add r19 = r14,r16 |
179 |
mux1 r17 = r17, @brcst |
// mux1 r17 = r17, @brcst |
180 |
|
|
181 |
add r27 = 8,r22 |
add r27 = 8,r22 |
182 |
sub r28 = 56,r22 |
sub r28 = 56,r22 |
185 |
add r26 = 8,r19 |
add r26 = 8,r19 |
186 |
|
|
187 |
mov ar.lc = 7 |
mov ar.lc = 7 |
188 |
mov ar.ec = LL + SL +OL + 2*AVL + AL + STL |
mov ar.ec = LL + SL +OL + 2*AVL + STL |
189 |
mov pr.rot = 1 << 16 |
mov pr.rot = 1 << 16 |
190 |
|
|
191 |
;; |
;; |
192 |
.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],shl3[SL+1],shru3[SL+1],shl4[SL+1],shru4[SL+1],or1[OL+1],or2[OL+1+AL],or3[OL+AL+1],or4[OL+AL+1],add1[AL+1],avg[AVL+1],avg1[AVL+1],avg2[AVL+1] |
.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],shl3[SL+1],shru3[SL+1],shl4[SL+1],shru4[SL+1],or1[OL+1],or2[OL+1+AL],or3[OL+AL+1],or4[OL+AL+1],avg[AVL+1],avg1[AVL+1],avg2[AVL+1] |
193 |
.rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL],pavg1p[AVL],pavg2p[AVL],stp[STL] |
.rotp aldp[LL], sh1p[SL], or1p[OL],pavg1p[AVL],pavg2p[AVL],stp[STL] |
194 |
|
|
195 |
|
|
196 |
.Lloop_interpolate3: |
.Lloop_interpolate3: |
214 |
(or1p[0]) or or3[0] = shru3[SL],shl3[SL] |
(or1p[0]) or or3[0] = shru3[SL],shl3[SL] |
215 |
(or1p[0]) or or4[0] = shru4[SL],shl4[SL] |
(or1p[0]) or or4[0] = shru4[SL],shl4[SL] |
216 |
|
|
217 |
(addp[0]) padd1.uus add1[0] = or1[OL],r17 |
// (addp[0]) padd1.uus add1[0] = or1[OL],r17 |
218 |
|
|
219 |
(pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL] |
(pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] |
220 |
(pavg1p[0]) pavg1 avg1[0] = or3[OL+AL],or4[OL+AL] |
(pavg1p[0]) pavg1 avg1[0] = or3[OL],or4[OL] |
221 |
|
|
222 |
(pavg2p[0]) pavg1 avg2[0] = avg[AVL],avg1[AVL] |
(pavg2p[0]) pavg1 avg2[0] = avg[AVL],avg1[AVL] |
223 |
|
|