;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - optimized SSIM routines -
; *
; *  Copyright(C) 2006 Johannes Reinhardt
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; ***************************************************************************/

BITS 32

; Declare an exported function, optionally underscore-prefixed (PREFIX)
; and/or carrying ELF size information (MARK_FUNCS).
%macro cglobal 1
  %ifdef PREFIX
    %ifdef MARK_FUNCS
      global _%1:function %1.endfunc-%1
      %define %1 _%1:function %1.endfunc-%1
    %else
      global _%1
      %define %1 _%1
    %endif
  %else
    %ifdef MARK_FUNCS
      global %1:function %1.endfunc-%1
    %else
      global %1
    %endif
  %endif
%endmacro

; Sum the pixels of two consecutive rows into %1 (psadbw against the zero
; register mm0) and advance ecx by two strides; %2 is a scratch register.
%macro ACC_ROW 2
  movq %1,[ecx]
  movq %2,[ecx+edx]
  psadbw %1,mm0
  psadbw %2,mm0
  lea ecx,[ecx+2*edx]
  paddw %1,%2
%endmacro

; Accumulate one 8-pixel row: sum of squared original pixels in xmm5,
; sum of squared compressed pixels in xmm6, cross products in xmm7.
%macro CONSIM_1x8_SSE2 0
  movdqu xmm0,[ecx]
  movdqu xmm1,[edx]

  ;unpack to words
  punpcklbw xmm0,xmm2
  punpcklbw xmm1,xmm2

  movaps xmm3,xmm0
  movaps xmm4,xmm1
  pmaddwd xmm0,xmm0 ;orig
  pmaddwd xmm1,xmm1 ;comp
  pmaddwd xmm3,xmm4 ;corr
  paddd xmm5,xmm0
  paddd xmm6,xmm1
  paddd xmm7,xmm3
%endmacro

; MMX version of CONSIM_1x8_SSE2: processes the row as two 4-pixel halves.
%macro CONSIM_1x8_MMX 0
  movq mm0,[ecx] ;orig
  movq mm1,[edx] ;comp

  ;unpack low half of qw to words
  punpcklbw mm0,mm2
  punpcklbw mm1,mm2

  movq mm3,mm0
  pmaddwd mm3,mm0
  paddd mm5,mm3

  movq mm4,mm1
  pmaddwd mm4,mm1
  paddd mm6,mm4

  pmaddwd mm1,mm0
  paddd mm7,mm1

  movq mm0,[ecx] ;orig
  movq mm1,[edx] ;comp

  ;unpack high half of qw to words
  punpckhbw mm0,mm2
  punpckhbw mm1,mm2

  movq mm3,mm0
  pmaddwd mm3,mm0
  paddd mm5,mm3

  movq mm4,mm1
  pmaddwd mm4,mm1
  paddd mm6,mm4

  pmaddwd mm1,mm0
  paddd mm7,mm1
%endmacro

; Subtract the rounded mean terms from the accumulated sums and store the
; results through the pdevo/pdevc/pcorr pointers. lumo and lumc are the
; 8x8 pixel sums, so lumo*lumo/64 equals 64 times the squared mean.
%macro CONSIM_WRITEOUT 3
  mov eax,[esp + 16] ;lumo
  mul eax            ;lumo^2
  add eax, 32
  shr eax, 6         ;(lumo^2 + 32)/64
  movd ecx,%1
  sub ecx,eax
  mov edx,[esp + 24] ;pdevo
  mov [edx],ecx

  mov eax,[esp + 20] ;lumc
  mul eax            ;lumc^2
  add eax, 32
  shr eax, 6         ;(lumc^2 + 32)/64
  movd ecx,%2
  sub ecx,eax
  mov edx,[esp + 28] ;pdevc
  mov [edx],ecx

  mov eax,[esp + 16]   ;lumo
  mul dword [esp + 20] ;lumo*lumc, should fit in eax
  add eax, 32
  shr eax, 6           ;(lumo*lumc + 32)/64
  movd ecx,%3
  sub ecx,eax
  mov edx,[esp + 32] ;pcorr
  mov [edx],ecx
%endmacro

SECTION .text

cglobal lum_8x8_mmx
cglobal consim_sse2
cglobal consim_mmx

;int lum_8x8_mmx(uint8_t* ptr, uint32_t stride)
;
;MMX implementation of lum_8x8_c: returns the sum of all 64 pixels
;of an 8x8 block.
ALIGN 16
lum_8x8_mmx:
  mov ecx,[esp + 4] ;ptr
  mov edx,[esp + 8] ;stride

  pxor mm0,mm0 ;zero register for psadbw

  ACC_ROW mm1, mm2 ;rows 1-2
  ACC_ROW mm3, mm4 ;rows 3-4
  ACC_ROW mm5, mm6 ;rows 5-6
  ACC_ROW mm7, mm4 ;rows 7-8

  paddw mm1, mm3
  paddw mm5, mm7
  paddw mm1, mm5

  movd eax,mm1
  ret
.endfunc

;void consim_sse2(uint8_t* ptro, uint8_t* ptrc, uint32_t stride,
;                 uint32_t lumo, uint32_t lumc,
;                 uint32_t* pdevo, uint32_t* pdevc, uint32_t* pcorr)
;(signature inferred from the stack-offset comments below)
ALIGN 16
consim_sse2:
  mov ecx,[esp+4]  ;ptro
  mov edx,[esp+8]  ;ptrc
  mov eax,[esp+12] ;stride
  pxor xmm2,xmm2   ;zero vector
  pxor xmm5,xmm5   ;devo
  pxor xmm6,xmm6   ;devc
  pxor xmm7,xmm7   ;corr

  ;broadcast lumo/c
  punpcklbw xmm6,xmm6
  punpcklwd xmm6,xmm6
  pshufd xmm6,xmm6,00000000b ;or shufps
  punpcklbw xmm7,xmm7
  punpcklwd xmm7,xmm7
  pshufd xmm7,xmm7,00000000b

  CONSIM_1x8_SSE2 ;row 1
  add ecx,eax
  add edx,eax
  CONSIM_1x8_SSE2 ;row 2
  add ecx,eax
  add edx,eax
  CONSIM_1x8_SSE2 ;row 3
  add ecx,eax
  add edx,eax
  CONSIM_1x8_SSE2 ;row 4
  add ecx,eax
  add edx,eax
  CONSIM_1x8_SSE2 ;row 5
  add ecx,eax
  add edx,eax
  CONSIM_1x8_SSE2 ;row 6
  add ecx,eax
  add edx,eax
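  ;two rows remain; no pointer advance is needed after the last one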
  CONSIM_1x8_SSE2 ;row 7
  add ecx,eax
  add edx,eax
  CONSIM_1x8_SSE2 ;row 8

  ;accumulate xmm5-7: fold the four dwords of each register into its
  ;low dword (high qword onto low, then element 1 onto element 0)
  pshufd xmm0, xmm5, 0EH
  paddd xmm5, xmm0
  pshufd xmm0, xmm5, 01H
  paddd xmm5, xmm0

  pshufd xmm1, xmm6, 0EH
  paddd xmm6, xmm1
  pshufd xmm1, xmm6, 01H
  paddd xmm6, xmm1

  pshufd xmm2, xmm7, 0EH
  paddd xmm7, xmm2
  pshufd xmm2, xmm7, 01H
  paddd xmm7, xmm2

  CONSIM_WRITEOUT xmm5,xmm6,xmm7
  ret
.endfunc

;void consim_mmx(uint8_t* ptro, uint8_t* ptrc, uint32_t stride,
;                uint32_t lumo, uint32_t lumc,
;                uint32_t* pdevo, uint32_t* pdevc, uint32_t* pcorr)
;(same signature as consim_sse2)
ALIGN 16
consim_mmx:
  mov ecx,[esp+4]  ;ptro
  mov edx,[esp+8]  ;ptrc
  mov eax,[esp+12] ;stride
  pxor mm2,mm2     ;zero register
  pxor mm5,mm5     ;devo
  pxor mm6,mm6     ;devc
  pxor mm7,mm7     ;corr

  CONSIM_1x8_MMX ;row 1
  add ecx,eax
  add edx,eax
  CONSIM_1x8_MMX ;row 2
  add ecx,eax
  add edx,eax
  CONSIM_1x8_MMX ;row 3
  add ecx,eax
  add edx,eax
  CONSIM_1x8_MMX ;row 4
  add ecx,eax
  add edx,eax
  CONSIM_1x8_MMX ;row 5
  add ecx,eax
  add edx,eax
  CONSIM_1x8_MMX ;row 6
  add ecx,eax
  add edx,eax
  CONSIM_1x8_MMX ;row 7
  add ecx,eax
  add edx,eax
  CONSIM_1x8_MMX ;row 8

  ;fold the two dwords of each accumulator into its low dword
  movq mm0,mm5
  psrlq mm0,32
  paddd mm5,mm0
  movq mm1,mm6
  psrlq mm1,32
  paddd mm6,mm1
  movq mm2,mm7
  psrlq mm2,32
  paddd mm7,mm2

  CONSIM_WRITEOUT mm5,mm6,mm7
  ret
.endfunc
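
; Reference semantics of consim_mmx/consim_sse2 (a sketch; argument names
; are taken from the stack-offset comments above, and lumo/lumc are assumed
; to be the 8x8 pixel sums as returned by lum_8x8_mmx):
;
;   sumo  = sum over the 8x8 block of ptro[i]^2
;   sumc  = sum over the 8x8 block of ptrc[i]^2
;   sumoc = sum over the 8x8 block of ptro[i]*ptrc[i]
;
;   *pdevo = sumo  - (lumo*lumo + 32)/64  ;~64x the variance of the original
;   *pdevc = sumc  - (lumc*lumc + 32)/64  ;~64x the variance of the compressed
;   *pcorr = sumoc - (lumo*lumc + 32)/64  ;~64x the covariance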