30 |
%ifdef MARK_FUNCS |
%ifdef MARK_FUNCS |
31 |
global _%1:function %1.endfunc-%1 |
global _%1:function %1.endfunc-%1 |
32 |
%define %1 _%1:function %1.endfunc-%1 |
%define %1 _%1:function %1.endfunc-%1 |
33 |
|
%define ENDFUNC .endfunc |
34 |
%else |
%else |
35 |
global _%1 |
global _%1 |
36 |
%define %1 _%1 |
%define %1 _%1 |
37 |
|
%define ENDFUNC |
38 |
%endif |
%endif |
39 |
%else |
%else |
40 |
%ifdef MARK_FUNCS |
%ifdef MARK_FUNCS |
41 |
global %1:function %1.endfunc-%1 |
global %1:function %1.endfunc-%1 |
42 |
|
%define ENDFUNC .endfunc |
43 |
%else |
%else |
44 |
global %1 |
global %1 |
45 |
|
%define ENDFUNC |
46 |
%endif |
%endif |
47 |
%endif |
%endif |
48 |
%endmacro |
%endmacro |
131 |
COPY_H_SSE_RND0 |
COPY_H_SSE_RND0 |
132 |
ret |
ret |
133 |
|
|
134 |
.rounding1 |
.rounding1: |
135 |
; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 |
; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 |
136 |
movq mm7, [mmx_one wrt rip] |
movq mm7, [mmx_one wrt rip] |
137 |
COPY_H_SSE_RND1 |
COPY_H_SSE_RND1 |
142 |
lea rcx,[rcx+2*rdx] |
lea rcx,[rcx+2*rdx] |
143 |
COPY_H_SSE_RND1 |
COPY_H_SSE_RND1 |
144 |
ret |
ret |
145 |
.endfunc |
ENDFUNC |
146 |
|
|
147 |
;=========================================================================== |
;=========================================================================== |
148 |
; |
; |
202 |
COPY_V_SSE_RND0 |
COPY_V_SSE_RND0 |
203 |
ret |
ret |
204 |
|
|
205 |
.rounding1 |
.rounding1: |
206 |
; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 |
; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 |
207 |
movq mm7, [mmx_one wrt rip] |
movq mm7, [mmx_one wrt rip] |
208 |
movq mm2, [rax] ; loop invariant |
movq mm2, [rax] ; loop invariant |
216 |
lea rcx,[rcx+2*rdx] |
lea rcx,[rcx+2*rdx] |
217 |
COPY_V_SSE_RND1 |
COPY_V_SSE_RND1 |
218 |
ret |
ret |
219 |
.endfunc |
ENDFUNC |
220 |
|
|
221 |
;=========================================================================== |
;=========================================================================== |
222 |
; |
; |
346 |
COPY_HV_SSE_RND0 |
COPY_HV_SSE_RND0 |
347 |
ret |
ret |
348 |
|
|
349 |
.rounding1 |
.rounding1: |
350 |
COPY_HV_SSE_RND1 |
COPY_HV_SSE_RND1 |
351 |
add rcx, rdx |
add rcx, rdx |
352 |
COPY_HV_SSE_RND1 |
COPY_HV_SSE_RND1 |
355 |
add rcx, rdx |
add rcx, rdx |
356 |
COPY_HV_SSE_RND1 |
COPY_HV_SSE_RND1 |
357 |
ret |
ret |
358 |
.endfunc |
ENDFUNC |
359 |
|
|
360 |
;=========================================================================== |
;=========================================================================== |
361 |
; |
; |
415 |
lea rcx,[rcx+2*rdx] |
lea rcx,[rcx+2*rdx] |
416 |
ADD_FF 0, rdx |
ADD_FF 0, rdx |
417 |
EPILOG |
EPILOG |
418 |
.endfunc |
ENDFUNC |
419 |
|
|
420 |
;=========================================================================== |
;=========================================================================== |
421 |
; |
; |
477 |
ADD_FH_RND0 0, rdx |
ADD_FH_RND0 0, rdx |
478 |
EPILOG |
EPILOG |
479 |
|
|
480 |
.Loop1 |
.Loop1: |
481 |
; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 |
; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 |
482 |
; movq mm7, [mmx_one wrt rip] |
; movq mm7, [mmx_one wrt rip] |
483 |
ADD_FH_RND1 0, rdx |
ADD_FH_RND1 0, rdx |
491 |
lea rcx,[rcx+2*rdx] |
lea rcx,[rcx+2*rdx] |
492 |
ADD_FH_RND1 0, rdx |
ADD_FH_RND1 0, rdx |
493 |
EPILOG |
EPILOG |
494 |
.endfunc |
ENDFUNC |
495 |
|
|
496 |
|
|
497 |
;=========================================================================== |
;=========================================================================== |
552 |
ADD_8_HF_RND0 |
ADD_8_HF_RND0 |
553 |
EPILOG |
EPILOG |
554 |
|
|
555 |
.Loop1 |
.Loop1: |
556 |
movq mm0, [rax] ; loop invariant |
movq mm0, [rax] ; loop invariant |
557 |
movq mm7, [mmx_one wrt rip] |
movq mm7, [mmx_one wrt rip] |
558 |
|
|
567 |
lea rcx,[rcx+2*rdx] |
lea rcx,[rcx+2*rdx] |
568 |
ADD_8_HF_RND1 |
ADD_8_HF_RND1 |
569 |
EPILOG |
EPILOG |
570 |
.endfunc |
ENDFUNC |
571 |
|
|
572 |
; The trick is to correct the result of 'pavgb' with some combination of the |
; The trick is to correct the result of 'pavgb' with some combination of the |
573 |
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t). |
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t). |
697 |
ADD_HH_RND0 |
ADD_HH_RND0 |
698 |
EPILOG |
EPILOG |
699 |
|
|
700 |
.Loop1 |
.Loop1: |
701 |
ADD_HH_RND1 |
ADD_HH_RND1 |
702 |
add rcx, rdx |
add rcx, rdx |
703 |
ADD_HH_RND1 |
ADD_HH_RND1 |
707 |
ADD_HH_RND1 |
ADD_HH_RND1 |
708 |
|
|
709 |
EPILOG |
EPILOG |
710 |
.endfunc |
ENDFUNC |
711 |
|
|
712 |
%ifidn __OUTPUT_FORMAT__,elf |
%ifidn __OUTPUT_FORMAT__,elf |
713 |
section ".note.GNU-stack" noalloc noexec nowrite progbits |
section ".note.GNU-stack" noalloc noexec nowrite progbits |