25 |
and r14 = -8,r33 // align src |
and r14 = -8,r33 // align src |
26 |
mov r15 = r32 // get dest |
mov r15 = r32 // get dest |
27 |
mov r16 = r34 // stride |
mov r16 = r34 // stride |
28 |
sub r17 = 1,r35 // 1-rounding |
// sub r17 = 0,r0 // 1-rounding |
29 |
|
|
30 |
;; |
;; |
31 |
|
|
32 |
add r18 = 8,r14 |
add r18 = 8,r14 |
33 |
mux1 r17 = r17, @brcst // broadcast 1-rounding |
// mux1 r17 = r17, @brcst // broadcast 1-rounding |
34 |
|
|
35 |
sub r24 = 64,r22 // lshift of src |
sub r24 = 64,r22 // lshift of src |
36 |
add r26 = 8,r22 // rshift of src+1 |
add r26 = 8,r22 // rshift of src+1 |
37 |
sub r27 = 56,r22 // lshift of src+1 |
sub r27 = 56,r22 // lshift of src+1 |
38 |
|
|
39 |
mov ar.lc = 7 // loop counter (LC = 7) |
mov ar.lc = 7 // loop counter (LC = 7) |
40 |
mov ar.ec = LL + SL +OL + AVL + AL + STL // epilog count = sum of the stage latencies |
mov ar.ec = LL + SL +OL + AVL + STL // epilog count = sum of the stage latencies |
41 |
mov pr.rot = 1 << 16 // init rotating pr regs for sw-pipelining (only bit 16 set) |
mov pr.rot = 1 << 16 // init rotating pr regs for sw-pipelining (only bit 16 set) |
42 |
|
|
43 |
;; |
;; |
44 |
.rotr ald1[LL+1],ald2[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],add1[AL+1],avg[AVL+1] |
.rotr ald1[LL+1],ald2[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1] |
45 |
.rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL], pavg1p[AVL],stp[STL] |
.rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL] |
46 |
|
|
47 |
|
|
48 |
loop_interpolate: |
.Lloop_interpolate: |
49 |
(aldp[0]) ld8 ald1[0] = [r14],r16 // load aligned src |
(aldp[0]) ld8 ald1[0] = [r14],r16 // load aligned src |
50 |
(aldp[0]) ld8 ald2[0] = [r18],r16 // and aligned src+8 |
(aldp[0]) ld8 ald2[0] = [r18],r16 // and aligned src+8 |
51 |
|
|
57 |
(or1p[0]) or or1[0] = shru1[SL],shl2[SL] // merge things |
(or1p[0]) or or1[0] = shru1[SL],shl2[SL] // merge things |
58 |
(or1p[0]) or or2[0] = shru2[SL],shl1[SL] |
(or1p[0]) or or2[0] = shru2[SL],shl1[SL] |
59 |
|
|
60 |
(addp[0]) padd1.uus add1[0] = or1[OL],r17 // add 1-rounding |
// (addp[0]) padd1.uus add1[0] = or1[OL],r17 // add 1-rounding |
61 |
|
|
62 |
(pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL] // parallel average |
(pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] // parallel average |
63 |
|
|
64 |
(stp[0]) st8 [r15] = avg[AVL] // store results |
(stp[0]) st8 [r15] = avg[AVL] // store results |
65 |
(stp[0]) add r15 = r15,r16 |
(stp[0]) add r15 = r15,r16 |
67 |
|
|
68 |
|
|
69 |
|
|
70 |
br.ctop.sptk.few loop_interpolate |
br.ctop.sptk.few .Lloop_interpolate |
71 |
;; |
;; |
72 |
mov ar.lc = r20 |
mov ar.lc = r20 |
73 |
mov pr = r21,-1 |
mov pr = r21,-1 |
97 |
and r14 = -8,r33 |
and r14 = -8,r33 |
98 |
mov r15 = r32 |
mov r15 = r32 |
99 |
mov r16 = r34 |
mov r16 = r34 |
100 |
sub r17 = 1,r35 |
// sub r17 = 0,r0 |
101 |
;; |
;; |
102 |
|
|
103 |
add r18 = 8,r14 |
add r18 = 8,r14 |
104 |
add r19 = r14,r16 // src + stride |
add r19 = r14,r16 // src + stride |
105 |
mux1 r17 = r17, @brcst |
// mux1 r17 = r17, @brcst |
106 |
|
|
107 |
sub r24 = 64,r22 |
sub r24 = 64,r22 |
108 |
;; |
;; |
109 |
add r26 = 8,r19 // src + stride + 8 |
add r26 = 8,r19 // src + stride + 8 |
110 |
|
|
111 |
mov ar.lc = 7 |
mov ar.lc = 7 |
112 |
mov ar.ec = LL + SL +OL + AVL + AL + STL |
mov ar.ec = LL + SL +OL + AVL + STL |
113 |
mov pr.rot = 1 << 16 |
mov pr.rot = 1 << 16 |
114 |
|
|
115 |
;; |
;; |
116 |
.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],add1[AL+1],avg[AVL+1] |
.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],or1[OL+1],or2[OL+1+AL],avg[AVL+1] |
117 |
.rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL], pavg1p[AVL],stp[STL] |
.rotp aldp[LL], sh1p[SL], or1p[OL], pavg1p[AVL],stp[STL] |
118 |
|
|
119 |
|
|
120 |
loop_interpolate2: |
.Lloop_interpolate2: |
121 |
(aldp[0]) ld8 ald1[0] = [r14],r16 |
(aldp[0]) ld8 ald1[0] = [r14],r16 |
122 |
(aldp[0]) ld8 ald2[0] = [r18],r16 |
(aldp[0]) ld8 ald2[0] = [r18],r16 |
123 |
(aldp[0]) ld8 ald3[0] = [r19],r16 |
(aldp[0]) ld8 ald3[0] = [r19],r16 |
131 |
(or1p[0]) or or1[0] = shru1[SL],shl1[SL] |
(or1p[0]) or or1[0] = shru1[SL],shl1[SL] |
132 |
(or1p[0]) or or2[0] = shru2[SL],shl2[SL] |
(or1p[0]) or or2[0] = shru2[SL],shl2[SL] |
133 |
|
|
134 |
(addp[0]) padd1.uus add1[0] = or1[OL],r17 |
// (addp[0]) padd1.uus add1[0] = or1[OL],r17 |
135 |
|
|
136 |
(pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL] |
(pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] |
137 |
|
|
138 |
(stp[0]) st8 [r15] = avg[AVL] |
(stp[0]) st8 [r15] = avg[AVL] |
139 |
(stp[0]) add r15 = r15,r16 |
(stp[0]) add r15 = r15,r16 |
141 |
|
|
142 |
|
|
143 |
|
|
144 |
br.ctop.sptk.few loop_interpolate2 |
br.ctop.sptk.few .Lloop_interpolate2 |
145 |
;; |
;; |
146 |
mov ar.lc = r20 |
mov ar.lc = r20 |
147 |
mov pr = r21,-1 |
mov pr = r21,-1 |
171 |
and r14 = -8,r33 |
and r14 = -8,r33 |
172 |
mov r15 = r32 |
mov r15 = r32 |
173 |
mov r16 = r34 |
mov r16 = r34 |
174 |
sub r17 = 1,r35 |
// sub r17 = 0,r0 |
175 |
;; |
;; |
176 |
|
|
177 |
add r18 = 8,r14 |
add r18 = 8,r14 |
178 |
add r19 = r14,r16 |
add r19 = r14,r16 |
179 |
mux1 r17 = r17, @brcst |
// mux1 r17 = r17, @brcst |
180 |
|
|
181 |
add r27 = 8,r22 |
add r27 = 8,r22 |
182 |
sub r28 = 56,r22 |
sub r28 = 56,r22 |
185 |
add r26 = 8,r19 |
add r26 = 8,r19 |
186 |
|
|
187 |
mov ar.lc = 7 |
mov ar.lc = 7 |
188 |
mov ar.ec = LL + SL +OL + 2*AVL + AL + STL |
mov ar.ec = LL + SL +OL + 2*AVL + STL |
189 |
mov pr.rot = 1 << 16 |
mov pr.rot = 1 << 16 |
190 |
|
|
191 |
;; |
;; |
192 |
.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],shl3[SL+1],shru3[SL+1],shl4[SL+1],shru4[SL+1],or1[OL+1],or2[OL+1+AL],or3[OL+AL+1],or4[OL+AL+1],add1[AL+1],avg[AVL+1],avg1[AVL+1],avg2[AVL+1] |
.rotr ald1[LL+1],ald2[LL+1],ald3[LL+1],ald4[LL+1],shru1[SL+1],shl1[SL+1],shru2[SL+1],shl2[SL+1],shl3[SL+1],shru3[SL+1],shl4[SL+1],shru4[SL+1],or1[OL+1],or2[OL+1+AL],or3[OL+AL+1],or4[OL+AL+1],avg[AVL+1],avg1[AVL+1],avg2[AVL+1] |
193 |
.rotp aldp[LL], sh1p[SL], or1p[OL], addp[AL],pavg1p[AVL],pavg2p[AVL],stp[STL] |
.rotp aldp[LL], sh1p[SL], or1p[OL],pavg1p[AVL],pavg2p[AVL],stp[STL] |
194 |
|
|
195 |
|
|
196 |
loop_interpolate3: |
.Lloop_interpolate3: |
197 |
(aldp[0]) ld8 ald1[0] = [r14],r16 |
(aldp[0]) ld8 ald1[0] = [r14],r16 |
198 |
(aldp[0]) ld8 ald2[0] = [r18],r16 |
(aldp[0]) ld8 ald2[0] = [r18],r16 |
199 |
(aldp[0]) ld8 ald3[0] = [r19],r16 |
(aldp[0]) ld8 ald3[0] = [r19],r16 |
214 |
(or1p[0]) or or3[0] = shru3[SL],shl3[SL] |
(or1p[0]) or or3[0] = shru3[SL],shl3[SL] |
215 |
(or1p[0]) or or4[0] = shru4[SL],shl4[SL] |
(or1p[0]) or or4[0] = shru4[SL],shl4[SL] |
216 |
|
|
217 |
(addp[0]) padd1.uus add1[0] = or1[OL],r17 |
// (addp[0]) padd1.uus add1[0] = or1[OL],r17 |
218 |
|
|
219 |
(pavg1p[0]) pavg1 avg[0] = add1[AL],or2[OL+AL] |
(pavg1p[0]) pavg1 avg[0] = or1[OL],or2[OL] |
220 |
(pavg1p[0]) pavg1 avg1[0] = or3[OL+AL],or4[OL+AL] |
(pavg1p[0]) pavg1 avg1[0] = or3[OL],or4[OL] |
221 |
|
|
222 |
(pavg2p[0]) pavg1 avg2[0] = avg[AVL],avg1[AVL] |
(pavg2p[0]) pavg1 avg2[0] = avg[AVL],avg1[AVL] |
223 |
|
|
227 |
|
|
228 |
|
|
229 |
|
|
230 |
br.ctop.sptk.few loop_interpolate3 |
br.ctop.sptk.few .Lloop_interpolate3 |
231 |
;; |
;; |
232 |
mov ar.lc = r20 |
mov ar.lc = r20 |
233 |
mov pr = r21,-1 |
mov pr = r21,-1 |