--- trunk/xvidcore/src/utils/x86_asm/mem_transfer_mmx.asm	2002/11/17 00:51:11	654
+++ trunk/xvidcore/src/utils/x86_asm/mem_transfer_mmx.asm	2003/02/15 15:22:19	851
@@ -3,52 +3,28 @@
 ; * XVID MPEG-4 VIDEO CODEC
 ; * mmx 8bit<->16bit transfers
 ; *
-; * This file is part of XviD, a free MPEG-4 video encoder/decoder
-; *
-; * XviD is free software; you can redistribute it and/or modify it
-; * under the terms of the GNU General Public License as published by
-; * the Free Software Foundation; either version 2 of the License, or
-; * (at your option) any later version.
-; *
-; * This program is distributed in the hope that it will be useful,
-; * but WITHOUT ANY WARRANTY; without even the implied warranty of
-; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; * GNU General Public License for more details.
-; *
-; * You should have received a copy of the GNU General Public License
-; * along with this program; if not, write to the Free Software
-; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-; *
-; * Under section 8 of the GNU General Public License, the copyright
-; * holders of XVID explicitly forbid distribution in the following
-; * countries:
-; *
-; * - Japan
-; * - United States of America
-; *
-; * Linking XviD statically or dynamically with other modules is making a
-; * combined work based on XviD. Thus, the terms and conditions of the
-; * GNU General Public License cover the whole combination.
-; *
-; * As a special exception, the copyright holders of XviD give you
-; * permission to link XviD with independent modules that communicate with
-; * XviD solely through the VFW1.1 and DShow interfaces, regardless of the
-; * license terms of these independent modules, and to copy and distribute
-; * the resulting combined work under terms of your choice, provided that
-; * every copy of the combined work is accompanied by a complete copy of
-; * the source code of XviD (the version of XviD used to produce the
-; * combined work), being distributed under the terms of the GNU General
-; * Public License plus this exception. An independent module is a module
-; * which is not derived from or based on XviD.
-; *
-; * Note that people who make modified versions of XviD are not obligated
-; * to grant this special exception for their modified versions; it is
-; * their choice whether to do so. The GNU General Public License gives
-; * permission to release a modified version without this exception; this
-; * exception also makes it possible to release a modified version which
-; * carries forward this exception.
-; *
-; * $Id: mem_transfer_mmx.asm,v 1.8 2002-11-17 00:51:11 edgomez Exp $
+; * This program is an implementation of a part of one or more MPEG-4
+; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
+; * to use this software module in hardware or software products are
+; * advised that its use may infringe existing patents or copyrights, and
+; * any such use would be at such party's own risk. The original
+; * developer of this software module and his/her company, and subsequent
+; * editors and their companies, will have no liability for use of this
+; * software or modifications or derivatives thereof.
+; *
+; * This program is free software; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 2 of the License, or
+; * (at your option) any later version.
+; *
+; * This program is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 ; *
 ; *************************************************************************/
@@ -59,7 +35,7 @@
 ; * 04.06.2002 speed enhancement (unroll+overlap). -Skal-
 ; *             + added transfer_8to16sub2_mmx/xmm
 ; * 07.01.2002 merge functions from compensate_mmx; rename functions
-; * 07.11.2001 initial version; (c)2001 peter ross 
+; * 07.11.2001 initial version; (c)2001 peter ross
 ; *
 ; *************************************************************************/
@@ -81,6 +57,7 @@
 cglobal transfer_8to16copy_mmx
 cglobal transfer_16to8copy_mmx
 cglobal transfer_8to16sub_mmx
+cglobal transfer_8to16subro_mmx
 cglobal transfer_8to16sub2_mmx
 cglobal transfer_8to16sub2_xmm
 cglobal transfer_16to8add_mmx
@@ -175,11 +152,12 @@
 ; * 02.12.2001 loop unrolled, code runs 10% faster now (Isibaar)
 ; * 30.11.2001 16 pixels are processed per iteration (Isibaar)
 ; * 30.11.2001 .text missing
-; * 06.11.2001 initial version; (c)2001 peter ross 
+; * 06.11.2001 initial version; (c)2001 peter ross
 ; *
 ; *************************************************************************/
 
-%macro COPY_8_TO_16_SUB 1
+; when second argument == 1, reference (ebx) block is copied to current (eax)
+%macro COPY_8_TO_16_SUB 2
   movq mm0, [eax]      ; cur
   movq mm2, [eax+edx]
   movq mm1, mm0
@@ -188,13 +166,15 @@
   punpcklbw mm0, mm7
   punpcklbw mm2, mm7
   movq mm4, [ebx]      ; ref
-  punpckhbw mm1, mm7
-  punpckhbw mm3, mm7
+  punpckhbw mm1, mm7
+  punpckhbw mm3, mm7
   movq mm5, [ebx+edx]  ; ref
 
   movq mm6, mm4
+%if %2 == 1
   movq [eax], mm4
   movq [eax+edx], mm5
+%endif
   punpcklbw mm4, mm7
   punpckhbw mm6, mm7
   psubsw mm0, mm4
@@ -208,9 +188,9 @@
   lea ebx, [ebx+2*edx]
 
   movq [ecx+%1*32+ 0], mm0 ; dst
-  movq [ecx+%1*32+ 8], mm1
-  movq [ecx+%1*32+16], mm2
-  movq [ecx+%1*32+24], mm3
+  movq [ecx+%1*32+ 8], mm1
+  movq [ecx+%1*32+16], mm2
+  movq [ecx+%1*32+24], mm3
 %endmacro
 
 align 16
@@ -222,14 +202,33 @@
   mov edx, [esp+4+16] ; Stride
   pxor mm7, mm7
 
-  COPY_8_TO_16_SUB 0
-  COPY_8_TO_16_SUB 1
-  COPY_8_TO_16_SUB 2
-  COPY_8_TO_16_SUB 3
+  COPY_8_TO_16_SUB 0, 1
+  COPY_8_TO_16_SUB 1, 1
+  COPY_8_TO_16_SUB 2, 1
+  COPY_8_TO_16_SUB 3, 1
+
+  pop ebx
+  ret
+
+
+align 16
+transfer_8to16subro_mmx:
+  mov ecx, [esp + 4]  ; Dst
+  mov eax, [esp + 8]  ; Cur
+  push ebx
+  mov ebx, [esp+4+12] ; Ref
+  mov edx, [esp+4+16] ; Stride
+  pxor mm7, mm7
+
+  COPY_8_TO_16_SUB 0, 0
+  COPY_8_TO_16_SUB 1, 0
+  COPY_8_TO_16_SUB 2, 0
+  COPY_8_TO_16_SUB 3, 0
 
   pop ebx
   ret
 
+
 ;===========================================================================
 ;
 ; void transfer_8to16sub2_mmx(int16_t * const dct,
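
What the patch does, in short: COPY_8_TO_16_SUB gains a second argument, transfer_8to16sub_mmx keeps its old behaviour by passing 1, and the new transfer_8to16subro_mmx ("ro" = read-only) passes 0 so the two movq stores into [eax] are skipped. A plain-C sketch of the two routines follows. It is an illustration only: the prototypes are an assumption modelled on the "void transfer_8to16sub2_mmx(int16_t * const dct, ..." comment visible in the last hunk (8x8 block, dct stored row-major as 64 int16_t), and the _c-suffixed names are hypothetical, not XviD's actual C reference code.

#include <stdint.h>

/* dct = cur - ref for an 8x8 block, and ref is copied over cur:
 * the %if %2 == 1 path of COPY_8_TO_16_SUB. */
static void transfer_8to16sub_c(int16_t *const dct, uint8_t *const cur,
                                const uint8_t *ref, const uint32_t stride)
{
  for (int i = 0; i < 8; i++)
    for (int j = 0; j < 8; j++) {
      const uint8_t r = ref[i * stride + j];
      dct[i * 8 + j] = (int16_t)(cur[i * stride + j] - r); /* difference block */
      cur[i * stride + j] = r;   /* write the reference back into cur */
    }
}

/* Read-only variant added by this patch: same difference block,
 * cur is left untouched (second macro argument == 0). */
static void transfer_8to16subro_c(int16_t *const dct, const uint8_t *cur,
                                  const uint8_t *ref, const uint32_t stride)
{
  for (int i = 0; i < 8; i++)
    for (int j = 0; j < 8; j++)
      dct[i * 8 + j] = (int16_t)(cur[i * stride + j] - ref[i * stride + j]);
}

The asm takes the difference with psubsw (saturating 16-bit subtract), but since both operands are zero-extended bytes the result always fits in 16 bits, so the plain subtraction above is equivalent. The read-only variant is presumably for callers that still need the unmodified current block after extracting the residue, whereas transfer_8to16sub doubles as the compensation step by leaving the prediction in cur.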