--- trunk/xvidcore/src/motion/x86_asm/sad_mmx.asm 2004/03/22 22:36:25 1382
+++ trunk/xvidcore/src/motion/x86_asm/sad_mmx.asm 2004/04/12 15:49:56 1424
@@ -20,7 +20,7 @@
 ; * along with this program; if not, write to the Free Software
 ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ; *
-; * $Id: sad_mmx.asm,v 1.12 2004-03-22 22:36:24 edgomez Exp $
+; * $Id: sad_mmx.asm,v 1.13 2004-04-12 15:49:56 edgomez Exp $
 ; *
 ; ***************************************************************************/
 
@@ -270,6 +270,7 @@
 cglobal sad8bi_mmx
 cglobal dev16_mmx
 cglobal sse8_16bit_mmx
+cglobal sse8_8bit_mmx
 
 ;-----------------------------------------------------------------------------
 ;
@@ -630,7 +631,7 @@
 ;
 ;-----------------------------------------------------------------------------
 
-%macro ROW_SSE_MMX 2
+%macro ROW_SSE_16bit_MMX 2
   movq mm0, [%1]
   movq mm1, [%1+8]
   psubw mm0, [%2]
@@ -639,7 +640,7 @@
   pmaddwd mm1, mm1
   paddd mm2, mm0
   paddd mm2, mm1
-%endmacro
+%endmacro
 
 sse8_16bit_mmx:
   push esi
@@ -654,30 +655,11 @@
   pxor mm2, mm2
 
   ;; Let's go
-  ROW_SSE_MMX esi, edi
-  lea esi, [esi+edx]
-  lea edi, [edi+edx]
-  ROW_SSE_MMX esi, edi
-  lea esi, [esi+edx]
-  lea edi, [edi+edx]
-  ROW_SSE_MMX esi, edi
-  lea esi, [esi+edx]
-  lea edi, [edi+edx]
-  ROW_SSE_MMX esi, edi
-  lea esi, [esi+edx]
-  lea edi, [edi+edx]
-  ROW_SSE_MMX esi, edi
-  lea esi, [esi+edx]
-  lea edi, [edi+edx]
-  ROW_SSE_MMX esi, edi
-  lea esi, [esi+edx]
-  lea edi, [edi+edx]
-  ROW_SSE_MMX esi, edi
-  lea esi, [esi+edx]
-  lea edi, [edi+edx]
-  ROW_SSE_MMX esi, edi
+%rep 8
+  ROW_SSE_16bit_MMX esi, edi
   lea esi, [esi+edx]
   lea edi, [edi+edx]
+%endrep
 
   ;; Finish adding each dword of the accumulator
   movq mm3, mm2
@@ -689,3 +671,67 @@
   pop edi
   pop esi
   ret
+
+;-----------------------------------------------------------------------------
+; Sum of squared byte differences between b1 and b2: 8 rows of 8 bytes each, consecutive rows 'stride' bytes apart.
+; uint32_t sse8_8bit_mmx(const int8_t *b1,
+;                        const int8_t *b2,
+;                        const uint32_t stride);
+; NOTE(review): bytes are zero-extended below (punpck*bw against a zeroed mm7), i.e. treated as unsigned despite the int8_t above -- confirm against the C declaration.
+;-----------------------------------------------------------------------------
+
+%macro ROW_SSE_8bit_MMX 2
+  movq mm0, [%1] ; load the 8 bytes of the row at %1
+  movq mm2, [%2] ; load the 8 bytes of the row at %2
+
+  movq mm1, mm0 ; keep a copy of the %1 row for its high half
+  movq mm3, mm2 ; keep a copy of the %2 row for its high half
+
+  punpcklbw mm0, mm7 ; %1 row: interleave the 4 low bytes with mm7 (zero) -> 4 16bit words
+  punpckhbw mm1, mm7 ; %1 row: interleave the 4 high bytes with mm7 (zero) -> 4 16bit words
+
+  punpcklbw mm2, mm7 ; %2 row: interleave the 4 low bytes with mm7 (zero) -> 4 16bit words
+  punpckhbw mm3, mm7 ; %2 row: interleave the 4 high bytes with mm7 (zero) -> 4 16bit words
+
+  psubw mm0, mm2 ; low half: 4 word differences (%1 - %2)
+  psubw mm1, mm3 ; high half: 4 word differences (%1 - %2)
+
+  pmaddwd mm0, mm0 ; square each word difference and add adjacent pairs -> 2 dwords
+  pmaddwd mm1, mm1 ; square each word difference and add adjacent pairs -> 2 dwords
+
+  paddd mm6, mm0 ; add the low-half partial sums to the mm6 accumulator
+  paddd mm6, mm1 ; add the high-half partial sums to the mm6 accumulator
+%endmacro
+
+sse8_8bit_mmx:
+  push esi
+  push edi
+
+  ;; Load the function params (the +8 accounts for the two registers pushed above)
+  mov esi, [esp+8+4]
+  mov edi, [esp+8+8]
+  mov edx, [esp+8+12]
+
+  ;; Reset the sse accumulator (two dword partial sums live in mm6)
+  pxor mm6, mm6
+
+  ;; Used to interleave 8bit data with 0x00 values (must stay zero while the rows are processed)
+  pxor mm7, mm7
+
+  ;; Let's go: the macro is expanded 8 times, once per row
+%rep 8
+  ROW_SSE_8bit_MMX esi, edi
+  lea esi, [esi+edx] ; advance the first pointer by edx (third parameter, the stride)
+  lea edi, [edi+edx] ; advance the second pointer by edx (third parameter, the stride)
+%endrep
+
+  ;; Finish adding each dword of the accumulator (mm7 is no longer needed as zero, so it is reused as scratch)
+  movq mm7, mm6 ; copy both partial sums
+  psrlq mm6, 32 ; move the high dword down into the low dword
+  paddd mm6, mm7 ; low dword of mm6 = high partial sum + low partial sum
+  movd eax, mm6 ; low dword goes to eax
+
+  ;; All done (NOTE(review): no emms is issued here -- presumably left to the caller, confirm)
+  pop edi
+  pop esi
+  ret