Parent Directory | Revision Log
Revision 1586 - (view) (download)
1 : | edgomez | 1586 | ;/**************************************************************************** |
2 : | ; * | ||
3 : | ; * XVID MPEG-4 VIDEO CODEC | ||
4 : | ; * - x86_64 MMX sum of squared errors (SSE) operators - | ||
5 : | ; * | ||
6 : | ; * Copyright(C) 2001 Peter Ross <pross@xvid.org> | ||
7 : | ; * 2002 Pascal Massimino <skal@planet-d.net> | ||
8 : | ; * 2004 Andre Werthmann <wertmann@aei.mpg.de> | ||
9 : | ; * | ||
10 : | ; * This program is free software; you can redistribute it and/or modify it | ||
11 : | ; * under the terms of the GNU General Public License as published by | ||
12 : | ; * the Free Software Foundation; either version 2 of the License, or | ||
13 : | ; * (at your option) any later version. | ||
14 : | ; * | ||
15 : | ; * This program is distributed in the hope that it will be useful, | ||
16 : | ; * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 : | ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 : | ; * GNU General Public License for more details. | ||
19 : | ; * | ||
20 : | ; * You should have received a copy of the GNU General Public License | ||
21 : | ; * along with this program; if not, write to the Free Software | ||
22 : | ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
23 : | ; * | ||
24 : | ; * $Id: sad_mmx.asm,v 1.1 2005-01-05 23:02:15 edgomez Exp $ | ||
25 : | ; * | ||
26 : | ; ***************************************************************************/ | ||
27 : | |||
28 : | BITS 64 | ||
29 : | |||
;-----------------------------------------------------------------------------
; cglobal <name>
;
; Exports <name> as a global symbol.
;
;   PREFIX defined     : the exported symbol is _<name>, and <name> is
;                        redefined so later uses of the plain name in this
;                        file expand to the exported spelling.
;   MARK_FUNCS defined : the symbol is exported with the ":function" type
;                        and a size of "<name>.endfunc - <name>", so each
;                        exported routine must end with a local ".endfunc"
;                        label.
;
; NOTE(review): when PREFIX and MARK_FUNCS are both defined, the replacement
; text of <name> also carries the ":function ..." suffix into every later
; use of the name, including the label that defines the routine. Confirm
; that this combination is actually built anywhere.
;-----------------------------------------------------------------------------
%macro cglobal 1
  %ifndef PREFIX
    %ifndef MARK_FUNCS
      global %1
    %else
      global %1:function %1.endfunc-%1
    %endif
  %else
    %ifndef MARK_FUNCS
      global _%1
      %define %1 _%1
    %else
      global _%1:function %1.endfunc-%1
      %define %1 _%1:function %1.endfunc-%1
    %endif
  %endif
%endmacro
47 : | |||
48 : | ;============================================================================= | ||
49 : | ; Read only data | ||
50 : | ;============================================================================= | ||
51 : | |||
; The read-only data section is declared here; the alignment qualifier is
; left out when FORMAT_COFF is defined (presumably because that output
; format rejects it - confirm against the build scripts).
%ifndef FORMAT_COFF
SECTION .rodata align=16
%else
SECTION .rodata
%endif

;=============================================================================
; Code
;=============================================================================

SECTION .text align=16

; Entry points exported from this file.
cglobal sse8_16bit_x86_64
cglobal sse8_8bit_x86_64
66 : | |||
67 : | ;----------------------------------------------------------------------------- | ||
68 : | ; | ||
69 : | ; uint32_t sse8_16bit_x86_64(const int16_t *b1, | ||
70 : | ; const int16_t *b2, | ||
71 : | ; const uint32_t stride); | ||
72 : | ; | ||
73 : | ;----------------------------------------------------------------------------- | ||
74 : | |||
;-----------------------------------------------------------------------------
; ROW_SSE_16Bit_MMX <ptrA>, <ptrB>
;
; Adds to mm2 the squared differences of one row of eight 16-bit values
; (16 bytes) read from [ptrA] and [ptrB].
;
; mm2 holds two 32-bit partial sums and must be initialised by the caller.
; mm0 and mm1 are scratch and are overwritten.
;-----------------------------------------------------------------------------
%macro ROW_SSE_16Bit_MMX 2
  ; words 0..3 of the row
  movq    mm0, [%1]
  psubw   mm0, [%2]         ; per-word difference
  pmaddwd mm0, mm0          ; square and add adjacent pairs -> 2 dwords
  paddd   mm2, mm0

  ; words 4..7 of the row
  movq    mm1, [%1+8]
  psubw   mm1, [%2+8]
  pmaddwd mm1, mm1
  paddd   mm2, mm1
%endmacro
85 : | |||
;-----------------------------------------------------------------------------
; sse8_16bit_x86_64
;
; Returns the sum of squared differences between two blocks of 8 rows by
; eight 16-bit values.
;
; In:
;   rdi = b1
;   rsi = b2
;   edx = stride, the distance in bytes from one row to the next (it is
;         added directly to the byte addresses)
; Out:
;   eax = sum of (b2[i] - b1[i])^2 over the 64 values
;
; Clobbers rdx, rsi, rdi and mm0-mm3.
; NOTE(review): no emms is executed here; presumably the caller restores
; the x87/MMX state - confirm.
;-----------------------------------------------------------------------------
sse8_16bit_x86_64:

  ;; stride is a 32-bit argument, so the upper half of rdx is not
  ;; guaranteed to be zero on entry. Writing edx zero-extends into rdx,
  ;; which makes the 64-bit pointer arithmetic below safe.
  mov edx, edx

  ;; Reset the sse accumulator (two 32-bit partial sums)
  pxor mm2, mm2

  ;; One row per repetition, then step both pointers to the next row
%rep 8
  ROW_SSE_16Bit_MMX rsi, rdi
  lea rsi, [rsi+rdx]
  lea rdi, [rdi+rdx]
%endrep

  ;; Fold the high dword of the accumulator onto the low dword
  movq  mm3, mm2
  psrlq mm2, 32
  paddd mm2, mm3
  movd  eax, mm2

  ;; All done
  ret
.endfunc
111 : | |||
112 : | ;----------------------------------------------------------------------------- | ||
113 : | ; | ||
114 : | ; uint32_t sse8_8bit_x86_64(const uint8_t *b1, | ||
115 : | ; const uint8_t *b2, | ||
116 : | ; const uint32_t stride); | ||
117 : | ; | ||
118 : | ;----------------------------------------------------------------------------- | ||
119 : | |||
;-----------------------------------------------------------------------------
; ROW_SSE_8bit_MMX <ptrA>, <ptrB>
;
; Adds to mm6 the squared differences of one row of eight bytes read from
; [ptrA] and [ptrB].
;
; Expects mm7 to be all zeroes: it is interleaved with the bytes to widen
; them to 16 bits, so the bytes are treated as unsigned.
; mm6 holds two 32-bit partial sums and must be initialised by the caller.
; mm0-mm3 are scratch and are overwritten.
;-----------------------------------------------------------------------------
%macro ROW_SSE_8bit_MMX 2
  movq      mm0, [%1]       ; row from the first block
  movq      mm2, [%2]       ; row from the second block
  movq      mm1, mm0        ; keep copies for the upper four bytes
  movq      mm3, mm2

  ; bytes 0..3
  punpcklbw mm0, mm7        ; widen to 16 bits
  punpcklbw mm2, mm7
  psubw     mm0, mm2        ; difference
  pmaddwd   mm0, mm0        ; square and add adjacent pairs
  paddd     mm6, mm0        ; accumulate

  ; bytes 4..7
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  psubw     mm1, mm3
  pmaddwd   mm1, mm1
  paddd     mm6, mm1
%endmacro
142 : | |||
;-----------------------------------------------------------------------------
; sse8_8bit_x86_64
;
; Returns the sum of squared differences between two blocks of 8 rows by
; eight bytes. The bytes are widened by interleaving with zero, so they are
; treated as unsigned values.
;
; In:
;   rdi, rsi = the two block pointers (the result does not depend on which
;              is which, because each difference is squared)
;   edx      = stride, the distance in bytes from one row to the next
; Out:
;   eax = sum of the 64 squared differences
;
; Clobbers rdx, rsi, rdi, mm0-mm3, mm6 and mm7.
; NOTE(review): no emms is executed here; presumably the caller restores
; the x87/MMX state - confirm.
;-----------------------------------------------------------------------------
sse8_8bit_x86_64:

  ;; stride is a 32-bit argument, so the upper half of rdx is not
  ;; guaranteed to be zero on entry. Writing edx zero-extends into rdx,
  ;; which makes the 64-bit pointer arithmetic below safe.
  mov edx, edx

  ;; Reset the sse accumulator (two 32-bit partial sums)
  pxor mm6, mm6

  ;; Used to interleave 8bit data with 0x00 values
  pxor mm7, mm7

  ;; One row per repetition, then step both pointers to the next row
%rep 8
  ROW_SSE_8bit_MMX rsi, rdi
  lea rsi, [rsi+rdx]
  lea rdi, [rdi+rdx]
%endrep

  ;; Fold the high dword of the accumulator onto the low dword
  ;; (mm7 is no longer needed as the zero register and is reused as scratch)
  movq  mm7, mm6
  psrlq mm6, 32
  paddd mm6, mm7
  movd  eax, mm6

  ;; All done
  ret
.endfunc
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |