20 |
; * along with this program ; if not, write to the Free Software |
; * along with this program ; if not, write to the Free Software |
21 |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
22 |
; * |
; * |
23 |
; * $Id: quantize_h263_mmx.asm,v 1.1.2.2 2003-10-09 18:50:22 edgomez Exp $ |
; * $Id: quantize_h263_mmx.asm,v 1.1.2.3 2003-10-28 22:23:03 edgomez Exp $ |
24 |
; * |
; * |
25 |
; ****************************************************************************/ |
; ****************************************************************************/ |
26 |
|
|
27 |
; enable dequant saturate [-2048,2047], test purposes only. |
; enable dequant saturate [-2048,2047], test purposes only. |
28 |
%define SATURATE |
%define SATURATE |
29 |
|
|
30 |
; data/text alignment |
BITS 32 |
|
%define ALIGN 8 |
|
|
|
|
|
bits 32 |
|
31 |
|
|
32 |
%macro cglobal 1 |
%macro cglobal 1 |
33 |
%ifdef PREFIX |
%ifdef PREFIX |
38 |
%endif |
%endif |
39 |
%endmacro |
%endmacro |
40 |
|
|
41 |
;*************************************************************************** |
;============================================================================= |
42 |
; Local data |
; Read only Local data |
43 |
;*************************************************************************** |
;============================================================================= |
44 |
|
|
45 |
%ifdef FORMAT_COFF |
SECTION .rodata |
|
section .data data |
|
|
%else |
|
|
section .data data align=16 |
|
|
%endif |
|
46 |
|
|
47 |
align 16 |
ALIGN 16 |
48 |
plus_one: |
plus_one: |
49 |
times 8 dw 1 |
times 8 dw 1 |
50 |
|
|
51 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
52 |
; |
; |
53 |
; subtract by Q/2 table |
; subtract by Q/2 table |
54 |
; |
; |
55 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
56 |
|
|
57 |
align 16 |
ALIGN 16 |
58 |
mmx_sub: |
mmx_sub: |
59 |
%assign quant 1 |
%assign quant 1 |
60 |
%rep 31 |
%rep 31 |
62 |
%assign quant quant+1 |
%assign quant quant+1 |
63 |
%endrep |
%endrep |
64 |
|
|
65 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
66 |
; |
; |
67 |
; divide by 2Q table |
; divide by 2Q table |
68 |
; |
; |
70 |
; for q=1, _pmulhw_ will overflow so it is treated separately |
; for q=1, _pmulhw_ will overflow so it is treated separately |
71 |
; (3dnow2 provides _pmulhuw_ which won't cause overflow) |
; (3dnow2 provides _pmulhuw_ which won't cause overflow) |
72 |
; |
; |
73 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
74 |
|
|
75 |
align 16 |
ALIGN 16 |
76 |
mmx_div: |
mmx_div: |
77 |
%assign quant 1 |
%assign quant 1 |
78 |
%rep 31 |
%rep 31 |
80 |
%assign quant quant+1 |
%assign quant quant+1 |
81 |
%endrep |
%endrep |
82 |
|
|
83 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
84 |
; |
; |
85 |
; add by (odd(Q) ? Q : Q - 1) table |
; add by (odd(Q) ? Q : Q - 1) table |
86 |
; |
; |
87 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
88 |
|
|
89 |
align 16 |
ALIGN 16 |
90 |
mmx_add: |
mmx_add: |
91 |
%assign quant 1 |
%assign quant 1 |
92 |
%rep 31 |
%rep 31 |
98 |
%assign quant quant+1 |
%assign quant quant+1 |
99 |
%endrep |
%endrep |
100 |
|
|
101 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
102 |
; |
; |
103 |
; multiply by 2Q table |
; multiply by 2Q table |
104 |
; |
; |
105 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
106 |
|
|
107 |
align 16 |
ALIGN 16 |
108 |
mmx_mul: |
mmx_mul: |
109 |
%assign quant 1 |
%assign quant 1 |
110 |
%rep 31 |
%rep 31 |
112 |
%assign quant quant+1 |
%assign quant quant+1 |
113 |
%endrep |
%endrep |
114 |
|
|
115 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
116 |
; |
; |
117 |
; saturation limits |
; saturation limits |
118 |
; |
; |
119 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
120 |
|
|
121 |
align 16 |
ALIGN 16 |
122 |
sse2_2047: |
sse2_2047: |
123 |
times 8 dw 2047 |
times 8 dw 2047 |
124 |
|
|
125 |
align 16 |
ALIGN 16 |
126 |
mmx_2047: |
mmx_2047: |
127 |
times 4 dw 2047 |
times 4 dw 2047 |
128 |
|
|
129 |
align 8 |
ALIGN 8 |
130 |
mmx_32768_minus_2048: |
mmx_32768_minus_2048: |
131 |
times 4 dw (32768-2048) |
times 4 dw (32768-2048) |
132 |
|
|
134 |
times 4 dw (32767-2047) |
times 4 dw (32767-2047) |
135 |
|
|
136 |
|
|
137 |
;*************************************************************************** |
;============================================================================= |
138 |
; Code |
; Code |
139 |
;*************************************************************************** |
;============================================================================= |
140 |
|
|
141 |
section .text |
SECTION .text |
142 |
|
|
143 |
|
cglobal quant_h263_intra_mmx |
144 |
|
cglobal quant_h263_intra_sse2 |
145 |
|
cglobal quant_h263_inter_mmx |
146 |
|
cglobal quant_h263_inter_sse2 |
147 |
|
cglobal dequant_h263_intra_mmx |
148 |
|
cglobal dequant_h263_intra_xmm |
149 |
|
cglobal dequant_h263_intra_sse2 |
150 |
|
cglobal dequant_h263_inter_mmx |
151 |
|
cglobal dequant_h263_inter_xmm |
152 |
|
cglobal dequant_h263_inter_sse2 |
153 |
|
|
154 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
155 |
; |
; |
156 |
; uint32_t quant_h263_intra_mmx(int16_t * coeff, |
; uint32_t quant_h263_intra_mmx(int16_t * coeff, |
157 |
; const int16_t * const data, |
; const int16_t * const data, |
158 |
; const uint32_t quant, |
; const uint32_t quant, |
159 |
; const uint32_t dcscalar); |
; const uint32_t dcscalar); |
160 |
; |
; |
161 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
162 |
|
|
163 |
align ALIGN |
ALIGN 16 |
|
cglobal quant_h263_intra_mmx |
|
164 |
quant_h263_intra_mmx: |
quant_h263_intra_mmx: |
165 |
|
|
166 |
push ecx |
push ecx |
177 |
|
|
178 |
movq mm7, [mmx_div + eax * 8 - 8] |
movq mm7, [mmx_div + eax * 8 - 8] |
179 |
|
|
180 |
align ALIGN |
ALIGN 16 |
181 |
.loop |
.loop |
182 |
movq mm0, [esi + 8*ecx] ; mm0 = [1st] |
movq mm0, [esi + 8*ecx] ; mm0 = [1st] |
183 |
movq mm3, [esi + 8*ecx + 8] |
movq mm3, [esi + 8*ecx + 8] |
229 |
|
|
230 |
ret |
ret |
231 |
|
|
232 |
align ALIGN |
ALIGN 16 |
233 |
.q1loop |
.q1loop |
234 |
movq mm0, [esi + 8*ecx] ; mm0 = [1st] |
movq mm0, [esi + 8*ecx] ; mm0 = [1st] |
235 |
movq mm3, [esi + 8*ecx + 8] |
movq mm3, [esi + 8*ecx + 8] |
257 |
|
|
258 |
|
|
259 |
|
|
260 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
261 |
; |
; |
262 |
; uint32_t quant_h263_intra_sse2(int16_t * coeff, |
; uint32_t quant_h263_intra_sse2(int16_t * coeff, |
263 |
; const int16_t * const data, |
; const int16_t * const data, |
264 |
; const uint32_t quant, |
; const uint32_t quant, |
265 |
; const uint32_t dcscalar); |
; const uint32_t dcscalar); |
266 |
; |
; |
267 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
268 |
|
|
269 |
align ALIGN |
ALIGN 16 |
|
cglobal quant_h263_intra_sse2 |
|
270 |
quant_h263_intra_sse2: |
quant_h263_intra_sse2: |
271 |
|
|
272 |
push esi |
push esi |
285 |
movq2dq xmm7, mm7 |
movq2dq xmm7, mm7 |
286 |
movlhps xmm7, xmm7 |
movlhps xmm7, xmm7 |
287 |
|
|
288 |
align 16 |
ALIGN 16 |
289 |
.qas2_loop |
.qas2_loop |
290 |
movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st] |
movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st] |
291 |
movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd] |
movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd] |
336 |
|
|
337 |
ret |
ret |
338 |
|
|
339 |
align 16 |
ALIGN 16 |
340 |
.qas2_q1loop |
.qas2_q1loop |
341 |
movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st] |
movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st] |
342 |
movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd] |
movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd] |
364 |
|
|
365 |
|
|
366 |
|
|
367 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
368 |
; |
; |
369 |
; uint32_t quant_h263_inter_mmx(int16_t * coeff, |
; uint32_t quant_h263_inter_mmx(int16_t * coeff, |
370 |
; const int16_t * const data, |
; const int16_t * const data, |
371 |
; const uint32_t quant); |
; const uint32_t quant); |
372 |
; |
; |
373 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
374 |
|
|
375 |
align ALIGN |
ALIGN 16 |
|
cglobal quant_h263_inter_mmx |
|
376 |
quant_h263_inter_mmx: |
quant_h263_inter_mmx: |
377 |
|
|
378 |
push ecx |
push ecx |
393 |
|
|
394 |
movq mm7, [mmx_div + eax * 8 - 8] ; divider |
movq mm7, [mmx_div + eax * 8 - 8] ; divider |
395 |
|
|
396 |
align ALIGN |
ALIGN 8 |
397 |
.loop |
.loop |
398 |
movq mm0, [esi + 8*ecx] ; mm0 = [1st] |
movq mm0, [esi + 8*ecx] ; mm0 = [1st] |
399 |
movq mm3, [esi + 8*ecx + 8] |
movq mm3, [esi + 8*ecx + 8] |
435 |
|
|
436 |
ret |
ret |
437 |
|
|
438 |
align ALIGN |
ALIGN 8 |
439 |
.q1loop |
.q1loop |
440 |
movq mm0, [esi + 8*ecx] ; mm0 = [1st] |
movq mm0, [esi + 8*ecx] ; mm0 = [1st] |
441 |
movq mm3, [esi + 8*ecx+ 8] ; |
movq mm3, [esi + 8*ecx+ 8] ; |
468 |
|
|
469 |
|
|
470 |
|
|
471 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
472 |
; |
; |
473 |
; uint32_t quant_h263_inter_sse2(int16_t * coeff, |
; uint32_t quant_h263_inter_sse2(int16_t * coeff, |
474 |
; const int16_t * const data, |
; const int16_t * const data, |
475 |
; const uint32_t quant); |
; const uint32_t quant); |
476 |
; |
; |
477 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
478 |
|
|
479 |
align 16 |
ALIGN 16 |
|
cglobal quant_h263_inter_sse2 |
|
480 |
quant_h263_inter_sse2: |
quant_h263_inter_sse2: |
481 |
|
|
482 |
push esi |
push esi |
502 |
movq2dq xmm7, mm0 |
movq2dq xmm7, mm0 |
503 |
movlhps xmm7, xmm7 |
movlhps xmm7, xmm7 |
504 |
|
|
505 |
align 16 |
ALIGN 16 |
506 |
.qes2_loop |
.qes2_loop |
507 |
movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st] |
movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st] |
508 |
movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd] |
movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd] |
549 |
|
|
550 |
ret |
ret |
551 |
|
|
552 |
align 16 |
ALIGN 16 |
553 |
.qes2_q1loop |
.qes2_q1loop |
554 |
movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st] |
movdqa xmm0, [esi + ecx*8] ; xmm0 = [1st] |
555 |
movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd] |
movdqa xmm3, [esi + ecx*8 + 16] ; xmm3 = [2nd] |
580 |
jmp .qes2_done |
jmp .qes2_done |
581 |
|
|
582 |
|
|
583 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
584 |
; |
; |
585 |
; uint32_t dequant_h263_intra_mmx(int16_t *data, |
; uint32_t dequant_h263_intra_mmx(int16_t *data, |
586 |
; const int16_t * const coeff, |
; const int16_t * const coeff, |
587 |
; const uint32_t quant, |
; const uint32_t quant, |
588 |
; const uint32_t dcscalar); |
; const uint32_t dcscalar); |
589 |
; |
; |
590 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
591 |
|
|
592 |
; note: we only saturate to +2047 *before* restoring the sign. |
; note: we only saturate to +2047 *before* restoring the sign. |
593 |
; Hence, final clamp really is [-2048,2047] |
; Hence, final clamp really is [-2048,2047] |
594 |
|
|
595 |
align ALIGN |
ALIGN 16 |
|
cglobal dequant_h263_intra_mmx |
|
596 |
dequant_h263_intra_mmx: |
dequant_h263_intra_mmx: |
597 |
|
|
598 |
mov edx, [esp+ 4] ; data |
mov edx, [esp+ 4] ; data |
602 |
movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant |
movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant |
603 |
mov eax, -16 |
mov eax, -16 |
604 |
|
|
605 |
align ALIGN |
ALIGN 16 |
606 |
.loop |
.loop |
607 |
movq mm0, [ecx+8*eax+8*16] ; c = coeff[i] |
movq mm0, [ecx+8*eax+8*16] ; c = coeff[i] |
608 |
movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1] |
movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1] |
656 |
xor eax, eax ; return(0); |
xor eax, eax ; return(0); |
657 |
ret |
ret |
658 |
|
|
659 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
660 |
; |
; |
661 |
; uint32_t dequant_h263_intra_xmm(int16_t *data, |
; uint32_t dequant_h263_intra_xmm(int16_t *data, |
662 |
; const int16_t * const coeff, |
; const int16_t * const coeff, |
663 |
; const uint32_t quant, |
; const uint32_t quant, |
664 |
; const uint32_t dcscalar); |
; const uint32_t dcscalar); |
665 |
; |
; |
666 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
667 |
|
|
668 |
; this is the same as dequant_h263_intra_mmx, except that we're |
; this is the same as dequant_h263_intra_mmx, except that we're |
669 |
; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster) |
; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster) |
670 |
|
|
671 |
align ALIGN |
ALIGN 16 |
|
cglobal dequant_h263_intra_xmm |
|
672 |
dequant_h263_intra_xmm: |
dequant_h263_intra_xmm: |
673 |
|
|
674 |
mov edx, [esp+ 4] ; data |
mov edx, [esp+ 4] ; data |
678 |
movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant |
movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant |
679 |
mov eax, -16 |
mov eax, -16 |
680 |
|
|
681 |
align ALIGN |
ALIGN 16 |
682 |
.loop |
.loop |
683 |
movq mm0, [ecx+8*eax+8*16] ; c = coeff[i] |
movq mm0, [ecx+8*eax+8*16] ; c = coeff[i] |
684 |
movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1] |
movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1] |
731 |
ret |
ret |
732 |
|
|
733 |
|
|
734 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
735 |
; |
; |
736 |
; uint32_t dequant_h263_intra_sse2(int16_t *data, |
; uint32_t dequant_h263_intra_sse2(int16_t *data, |
737 |
; const int16_t * const coeff, |
; const int16_t * const coeff, |
738 |
; const uint32_t quant, |
; const uint32_t quant, |
739 |
; const uint32_t dcscalar); |
; const uint32_t dcscalar); |
740 |
; |
; |
741 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
742 |
|
|
743 |
align ALIGN |
ALIGN 16 |
|
cglobal dequant_h263_intra_sse2 |
|
744 |
dequant_h263_intra_sse2: |
dequant_h263_intra_sse2: |
745 |
mov edx, [esp+ 4] ; data |
mov edx, [esp+ 4] ; data |
746 |
mov ecx, [esp+ 8] ; coeff |
mov ecx, [esp+ 8] ; coeff |
753 |
movlhps xmm7, xmm7 |
movlhps xmm7, xmm7 |
754 |
mov eax, -16 |
mov eax, -16 |
755 |
|
|
756 |
align ALIGN |
ALIGN 16 |
757 |
.loop |
.loop |
758 |
movdqa xmm0, [ecx + 8*16 + 8*eax] ; c = coeff[i] |
movdqa xmm0, [ecx + 8*16 + 8*eax] ; c = coeff[i] |
759 |
movdqa xmm3, [ecx + 8*16 + 8*eax+ 16] |
movdqa xmm3, [ecx + 8*16 + 8*eax+ 16] |
805 |
xor eax, eax |
xor eax, eax |
806 |
ret |
ret |
807 |
|
|
808 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
809 |
; |
; |
810 |
; uint32_t dequant_h263_inter_mmx(int16_t * data, |
; uint32_t dequant_h263_inter_mmx(int16_t * data, |
811 |
; const int16_t * const coeff, |
; const int16_t * const coeff, |
812 |
; const uint32_t quant); |
; const uint32_t quant); |
813 |
; |
; |
814 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
815 |
|
|
816 |
align ALIGN |
ALIGN 16 |
|
cglobal dequant_h263_inter_mmx |
|
817 |
dequant_h263_inter_mmx: |
dequant_h263_inter_mmx: |
818 |
|
|
819 |
mov edx, [esp+ 4] ; data |
mov edx, [esp+ 4] ; data |
823 |
movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant |
movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant |
824 |
mov eax, -16 |
mov eax, -16 |
825 |
|
|
826 |
align ALIGN |
ALIGN 16 |
827 |
.loop |
.loop |
828 |
movq mm0, [ecx+8*eax+8*16] ; c = coeff[i] |
movq mm0, [ecx+8*eax+8*16] ; c = coeff[i] |
829 |
movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1] |
movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1] |
864 |
xor eax, eax |
xor eax, eax |
865 |
ret |
ret |
866 |
|
|
867 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
868 |
; |
; |
869 |
; uint32_t dequant_h263_inter_xmm(int16_t * data, |
; uint32_t dequant_h263_inter_xmm(int16_t * data, |
870 |
; const int16_t * const coeff, |
; const int16_t * const coeff, |
871 |
; const uint32_t quant); |
; const uint32_t quant); |
872 |
; |
; |
873 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
874 |
|
|
875 |
; this is the same as dequant_inter_mmx, |
; this is the same as dequant_inter_mmx, |
876 |
; except that we're saturating using 'pminsw' (saves 2 cycles/loop) |
; except that we're saturating using 'pminsw' (saves 2 cycles/loop) |
877 |
|
|
878 |
align ALIGN |
ALIGN 16 |
|
cglobal dequant_h263_inter_xmm |
|
879 |
dequant_h263_inter_xmm: |
dequant_h263_inter_xmm: |
880 |
|
|
881 |
mov edx, [esp+ 4] ; data |
mov edx, [esp+ 4] ; data |
885 |
movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant |
movq mm7, [mmx_mul + eax*8 - 8] ; 2*quant |
886 |
mov eax, -16 |
mov eax, -16 |
887 |
|
|
888 |
align ALIGN |
ALIGN 16 |
889 |
.loop |
.loop |
890 |
movq mm0, [ecx+8*eax+8*16] ; c = coeff[i] |
movq mm0, [ecx+8*eax+8*16] ; c = coeff[i] |
891 |
movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1] |
movq mm3, [ecx+8*eax+8*16 + 8] ; c' = coeff[i+1] |
924 |
xor eax, eax |
xor eax, eax |
925 |
ret |
ret |
926 |
|
|
927 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
928 |
; |
; |
929 |
; uint32_t dequant_h263_inter_sse2(int16_t * data, |
; uint32_t dequant_h263_inter_sse2(int16_t * data, |
930 |
; const int16_t * const coeff, |
; const int16_t * const coeff, |
931 |
; const uint32_t quant); |
; const uint32_t quant); |
932 |
; |
; |
933 |
;=========================================================================== |
;----------------------------------------------------------------------------- |
934 |
|
|
935 |
align ALIGN |
ALIGN 16 |
|
cglobal dequant_h263_inter_sse2 |
|
936 |
dequant_h263_inter_sse2: |
dequant_h263_inter_sse2: |
937 |
mov edx, [esp + 4] ; data |
mov edx, [esp + 4] ; data |
938 |
mov ecx, [esp + 8] ; coeff |
mov ecx, [esp + 8] ; coeff |
945 |
movlhps xmm7, xmm7 |
movlhps xmm7, xmm7 |
946 |
mov eax, -16 |
mov eax, -16 |
947 |
|
|
948 |
align ALIGN |
ALIGN 16 |
949 |
.loop |
.loop |
950 |
movdqa xmm0, [ecx + 8*16 + 8*eax] ; c = coeff[i] |
movdqa xmm0, [ecx + 8*16 + 8*eax] ; c = coeff[i] |
951 |
movdqa xmm3, [ecx + 8*16 + 8*eax + 16] |
movdqa xmm3, [ecx + 8*16 + 8*eax + 16] |