6 |
; * Copyright(C) 2004 Peter Ross <pross@xvid.org> |
; * Copyright(C) 2004 Peter Ross <pross@xvid.org> |
7 |
; * 2004 Dcoder <dcoder@alexandria.cc> |
; * 2004 Dcoder <dcoder@alexandria.cc> |
8 |
; * |
; * |
9 |
; * XviD is free software; you can redistribute it and/or modify it |
; * Xvid is free software; you can redistribute it and/or modify it |
10 |
; * under the terms of the GNU General Public License as published by |
; * under the terms of the GNU General Public License as published by |
11 |
; * the Free Software Foundation; either version 2 of the License, or |
; * the Free Software Foundation; either version 2 of the License, or |
12 |
; * (at your option) any later version. |
; * (at your option) any later version. |
22 |
; * |
; * |
23 |
; *************************************************************************/ |
; *************************************************************************/ |
24 |
|
|
25 |
BITS 32 |
%include "nasm.inc" |
26 |
|
|
27 |
; cglobal <name>
;   Exports <name> as a global symbol.
;     PREFIX defined     -> the exported symbol is _<name>, and <name> is
;                           %define'd so later uses expand to the prefixed form.
;     MARK_FUNCS defined -> the `:function` type specifier is appended to the
;                           global declaration.
;   NOTE(review): in the PREFIX + MARK_FUNCS branch the %define replacement
;   text also carries `:function`, so every later use of <name> (including the
;   label definition) would expand with that suffix -- confirm this
;   combination is never built, or that it is intended.
;   NOTE(review): the separator/"read only data" comment rows inside this macro
;   look like rows from another revision interleaved by the diff view.
%macro cglobal 1 |
;=========================================================================== |
28 |
%ifdef PREFIX |
; read only data |
29 |
%ifdef MARK_FUNCS |
;=========================================================================== |
|
global _%1:function |
|
|
%define %1 _%1:function |
|
|
%else |
|
|
global _%1 |
|
|
%define %1 _%1 |
|
|
%endif |
|
|
%else |
|
|
%ifdef MARK_FUNCS |
|
|
global %1:function |
|
|
%else |
|
|
global %1 |
|
|
%endif |
|
|
%endif |
|
|
%endmacro |
|
30 |
|
|
31 |
; FILLBYTES <addr>, <src>  /  CREATE_OFFSET_VECTOR <addr>, <src>
;   Both macros store <src> (%2) to consecutive byte offsets starting at
;   address <addr> (%1): [%1 + 0], [%1 + 1], ... [%1 + 15].
;   Only the stores for offsets 0..3 and 13..15 are visible in this view; the
;   rows for offsets 4..12 are elided (the embedded numbering jumps 52 -> 62).
;   The call sites visible further down pass `al` as <src>, i.e. one byte per
;   store, producing a 16-byte vector filled with that byte.
;   NOTE(review): this span interleaves two revisions -- the FILLBYTES macro
;   header/body with the other revision's DATA / xmm_0x80 / TEXT / cglobal rows
;   and its CREATE_OFFSET_VECTOR macro. Confirm which revision is current
;   before assembling.
%macro FILLBYTES 2 |
DATA |
32 |
|
|
33 |
mov [%1], %2 |
xmm_0x80: |
34 |
|
times 16 db 0x80 |
35 |
|
|
36 |
|
;============================================================================= |
37 |
|
; Code |
38 |
|
;============================================================================= |
39 |
|
|
40 |
|
TEXT |
41 |
|
|
42 |
|
cglobal image_brightness_sse2 |
43 |
|
|
44 |
|
;////////////////////////////////////////////////////////////////////// |
45 |
|
;// image_brightness_sse2 |
46 |
|
;////////////////////////////////////////////////////////////////////// |
47 |
|
|
48 |
|
%macro CREATE_OFFSET_VECTOR 2 |
49 |
|
mov [%1 + 0], %2 |
50 |
mov [%1 + 1], %2 |
mov [%1 + 1], %2 |
51 |
mov [%1 + 2], %2 |
mov [%1 + 2], %2 |
52 |
mov [%1 + 3], %2 |
mov [%1 + 3], %2 |
62 |
mov [%1 + 13], %2 |
mov [%1 + 13], %2 |
63 |
mov [%1 + 14], %2 |
mov [%1 + 14], %2 |
64 |
mov [%1 + 15], %2 |
mov [%1 + 15], %2 |
|
|
|
65 |
%endmacro |
%endmacro |
66 |
|
|
67 |
|
; image_brightness_sse2
;   Adds a brightness offset to every byte of an image plane, in place.
;
;   Arguments, in the order the code reads them (prm1..prm5, or the 32-bit
;   stack slots [esp+8+4] .. [esp+8+20] after the two pushes):
;     Dst, stride, width, height, brightness offset (only the low byte, al,
;     is replicated into the offset vector).
;
;   Per 32-byte step of a row:
;     load two 16-byte vectors from Dst with movdqa,
;     paddb  with the 16 x 0x80 constant  (unsigned -> signed domain),
;     paddsb with the 16 x offset vector  (signed saturating add),
;     psubb  with the 16 x 0x80 constant  (signed -> unsigned domain),
;     store both vectors back with movdqa.
;   The inner loop advances the byte index by 32 while it is (signed) less than
;   width; the outer loop adds stride to Dst and counts height down to zero.
;
;   movdqa is used for every load/store of Dst, so each row address must be
;   16-byte aligned -- presumably Dst and stride are multiples of 16; TODO
;   confirm against callers. Rows are processed in whole 32-byte steps, so up
;   to 31 bytes past `width` may be read and rewritten on each row.
;
;   NOTE(review): this span interleaves two revisions of the routine. One uses
;   fixed 32-bit register names, xmm6/xmm7 and a static offset_xmm buffer in
;   .rodata; the other uses the _EAX/_ESI/_EDI/TMP0/TMP1/prmN names, xmm2/xmm3
;   and a 16-byte-aligned scratch area carved out of 32 bytes of stack. Confirm
;   which revision is current before assembling.
ALIGN SECTION_ALIGN |
68 |
;=========================================================================== |
image_brightness_sse2: |
69 |
; read only data |
%ifdef ARCH_IS_X86_64 |
70 |
;=========================================================================== |
XVID_MOVSXD _EAX, prm5d |
|
|
|
|
%ifdef FORMAT_COFF |
|
|
SECTION .rodata |
|
71 |
%else |
%else |
72 |
SECTION .rodata align=16 |
mov eax, prm5 ; brightness offset value |
73 |
%endif |
%endif |
74 |
|
mov TMP1, prm1 ; Dst |
75 |
|
mov TMP0, prm2 ; stride |
76 |
|
|
77 |
xmm_0x80: |
push _ESI |
78 |
times 16 db 0x80 |
push _EDI ; 8 bytes offset for push |
79 |
|
sub _ESP, 32 ; 32 bytes for local data (16 bytes will be used, 16 bytes more to align correctly mod 16) |
80 |
offset_xmm: |
|
81 |
times 16 db 0x00 |
movdqa xmm2, [xmm_0x80] |
82 |
|
|
83 |
;============================================================================= |
; Create an offset...offset vector (the offset byte replicated across 16 bytes) |
84 |
; Code |
mov _ESI, _ESP ; _ESI will be esp aligned mod 16 |
85 |
;============================================================================= |
add _ESI, 15 ; _ESI = esp + 15 |
86 |
|
and _ESI, ~15 ; _ESI = (esp + 15)&(~15) |
87 |
SECTION .text |
CREATE_OFFSET_VECTOR _ESI, al |
88 |
|
movdqa xmm3, [_ESI] |
89 |
cglobal image_brightness_sse2 |
|
90 |
|
%ifdef ARCH_IS_X86_64 |
91 |
|
mov _ESI, prm3 |
92 |
;////////////////////////////////////////////////////////////////////// |
mov _EDI, prm4 |
93 |
;// image_brightness_sse2 |
%else |
94 |
;////////////////////////////////////////////////////////////////////// |
mov _ESI, [_ESP+8+32+12] ; width |
95 |
|
mov _EDI, [_ESP+8+32+16] ; height |
96 |
align 16 |
%endif |
|
image_brightness_sse2: |
|
|
|
|
|
push esi |
|
|
push edi |
|
|
|
|
|
movdqa xmm6, [xmm_0x80] |
|
|
|
|
|
mov eax, [esp+8+20] ; offset |
|
|
|
|
|
FILLBYTES offset_xmm, al |
|
|
|
|
|
movdqa xmm7, [offset_xmm] |
|
|
|
|
|
mov edx, [esp+8+4] ; Dst |
|
|
mov ecx, [esp+8+8] ; stride |
|
|
mov esi, [esp+8+12] ; width |
|
|
mov edi, [esp+8+16] ; height |
|
|
|
|
|
.yloop |
|
|
xor eax, eax |
|
97 |
|
|
98 |
.xloop |
.yloop: |
99 |
movdqa xmm0, [edx + eax] |
xor _EAX, _EAX |
|
movdqa xmm1, [edx + eax + 16] ; xmm0:xmm1 = 32 bytes of [dst] |
|
100 |
|
|
101 |
paddb xmm0, xmm6 ; unsigned -> signed domain |
.xloop: |
102 |
paddb xmm1, xmm6 |
movdqa xmm0, [TMP1 + _EAX] |
103 |
paddsb xmm0, xmm7 |
movdqa xmm1, [TMP1 + _EAX + 16] ; xmm0:xmm1 = 32 bytes of [dst] |
104 |
paddsb xmm1, xmm7 ; xmm0 += offset |
|
105 |
psubb xmm0, xmm6 |
paddb xmm0, xmm2 ; unsigned -> signed domain |
106 |
psubb xmm1, xmm6 ; signed -> unsigned domain |
paddb xmm1, xmm2 |
107 |
|
paddsb xmm0, xmm3 |
108 |
|
paddsb xmm1, xmm3 ; xmm0 += offset |
109 |
|
psubb xmm0, xmm2 |
110 |
|
psubb xmm1, xmm2 ; signed -> unsigned domain |
111 |
|
|
112 |
movdqa [edx + eax], xmm0 |
movdqa [TMP1 + _EAX], xmm0 |
113 |
movdqa [edx + eax + 16], xmm1 ; [dst] = xmm0 |
movdqa [TMP1 + _EAX + 16], xmm1 ; [dst] = xmm0 |
114 |
|
|
115 |
add eax,32 |
add _EAX,32 |
116 |
cmp eax,esi |
cmp _EAX,_ESI |
117 |
jl .xloop |
jl .xloop |
118 |
|
|
119 |
add edx, ecx ; dst += stride |
add TMP1, TMP0 ; dst += stride |
120 |
sub edi, 1 |
dec _EDI |
121 |
jg .yloop |
jg .yloop |
122 |
|
|
123 |
pop edi |
add _ESP, 32 |
124 |
pop esi |
pop _EDI |
125 |
|
pop _ESI |
126 |
|
|
127 |
ret |
ret |
128 |
|
ENDFUNC |
129 |
;////////////////////////////////////////////////////////////////////// |
;////////////////////////////////////////////////////////////////////// |
130 |
|
|
131 |
|
NON_EXEC_STACK |