• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /asuswrt-rt-n18u-9.0.0.4.380.2695/release/src-rt-6.x.4708/router/ffmpeg/libavcodec/x86/
/*
 * DSP utils : average functions are compiled twice for 3dnow/mmx2
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
26
/* XXX: explicit registers are used to avoid a gcc 2.95.2 register asm
   clobber bug - with them the code works with 2.95.2 and also with -fPIC
 */
30static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
31{
32    __asm__ volatile(
33        "lea (%3, %3), %%"REG_a"        \n\t"
34        "1:                             \n\t"
35        "movq (%1), %%mm0               \n\t"
36        "movq (%1, %3), %%mm1           \n\t"
37        PAVGB" 1(%1), %%mm0             \n\t"
38        PAVGB" 1(%1, %3), %%mm1         \n\t"
39        "movq %%mm0, (%2)               \n\t"
40        "movq %%mm1, (%2, %3)           \n\t"
41        "add %%"REG_a", %1              \n\t"
42        "add %%"REG_a", %2              \n\t"
43        "movq (%1), %%mm0               \n\t"
44        "movq (%1, %3), %%mm1           \n\t"
45        PAVGB" 1(%1), %%mm0             \n\t"
46        PAVGB" 1(%1, %3), %%mm1         \n\t"
47        "add %%"REG_a", %1              \n\t"
48        "movq %%mm0, (%2)               \n\t"
49        "movq %%mm1, (%2, %3)           \n\t"
50        "add %%"REG_a", %2              \n\t"
51        "subl $4, %0                    \n\t"
52        "jnz 1b                         \n\t"
53        :"+g"(h), "+S"(pixels), "+D"(block)
54        :"r" ((x86_reg)line_size)
55        :"%"REG_a, "memory");
56}
57
58static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
59{
60    __asm__ volatile(
61        "testl $1, %0                   \n\t"
62            " jz 1f                     \n\t"
63        "movd   (%1), %%mm0             \n\t"
64        "movd   (%2), %%mm1             \n\t"
65        "add    %4, %1                  \n\t"
66        "add    $4, %2                  \n\t"
67        PAVGB" %%mm1, %%mm0             \n\t"
68        "movd   %%mm0, (%3)             \n\t"
69        "add    %5, %3                  \n\t"
70        "decl   %0                      \n\t"
71        "1:                             \n\t"
72        "movd   (%1), %%mm0             \n\t"
73        "add    %4, %1                  \n\t"
74        "movd   (%1), %%mm1             \n\t"
75        "movd   (%2), %%mm2             \n\t"
76        "movd   4(%2), %%mm3            \n\t"
77        "add    %4, %1                  \n\t"
78        PAVGB" %%mm2, %%mm0             \n\t"
79        PAVGB" %%mm3, %%mm1             \n\t"
80        "movd   %%mm0, (%3)             \n\t"
81        "add    %5, %3                  \n\t"
82        "movd   %%mm1, (%3)             \n\t"
83        "add    %5, %3                  \n\t"
84        "movd   (%1), %%mm0             \n\t"
85        "add    %4, %1                  \n\t"
86        "movd   (%1), %%mm1             \n\t"
87        "movd   8(%2), %%mm2            \n\t"
88        "movd   12(%2), %%mm3           \n\t"
89        "add    %4, %1                  \n\t"
90        PAVGB" %%mm2, %%mm0             \n\t"
91        PAVGB" %%mm3, %%mm1             \n\t"
92        "movd   %%mm0, (%3)             \n\t"
93        "add    %5, %3                  \n\t"
94        "movd   %%mm1, (%3)             \n\t"
95        "add    %5, %3                  \n\t"
96        "add    $16, %2                 \n\t"
97        "subl   $4, %0                  \n\t"
98        "jnz    1b                      \n\t"
99#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
100        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
101#else
102        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
103#endif
104        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
105        :"memory");
106}
107
108
109static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
110{
111    __asm__ volatile(
112        "testl $1, %0                   \n\t"
113            " jz 1f                     \n\t"
114        "movq   (%1), %%mm0             \n\t"
115        "movq   (%2), %%mm1             \n\t"
116        "add    %4, %1                  \n\t"
117        "add    $8, %2                  \n\t"
118        PAVGB" %%mm1, %%mm0             \n\t"
119        "movq   %%mm0, (%3)             \n\t"
120        "add    %5, %3                  \n\t"
121        "decl   %0                      \n\t"
122        "1:                             \n\t"
123        "movq   (%1), %%mm0             \n\t"
124        "add    %4, %1                  \n\t"
125        "movq   (%1), %%mm1             \n\t"
126        "add    %4, %1                  \n\t"
127        PAVGB" (%2), %%mm0              \n\t"
128        PAVGB" 8(%2), %%mm1             \n\t"
129        "movq   %%mm0, (%3)             \n\t"
130        "add    %5, %3                  \n\t"
131        "movq   %%mm1, (%3)             \n\t"
132        "add    %5, %3                  \n\t"
133        "movq   (%1), %%mm0             \n\t"
134        "add    %4, %1                  \n\t"
135        "movq   (%1), %%mm1             \n\t"
136        "add    %4, %1                  \n\t"
137        PAVGB" 16(%2), %%mm0            \n\t"
138        PAVGB" 24(%2), %%mm1            \n\t"
139        "movq   %%mm0, (%3)             \n\t"
140        "add    %5, %3                  \n\t"
141        "movq   %%mm1, (%3)             \n\t"
142        "add    %5, %3                  \n\t"
143        "add    $32, %2                 \n\t"
144        "subl   $4, %0                  \n\t"
145        "jnz    1b                      \n\t"
146#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
147        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
148#else
149        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
150#endif
151        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
152        :"memory");
153//the following should be used, though better not with gcc ...
154/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
155        :"r"(src1Stride), "r"(dstStride)
156        :"memory");*/
157}
158
159static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
160{
161    __asm__ volatile(
162        "pcmpeqb %%mm6, %%mm6           \n\t"
163        "testl $1, %0                   \n\t"
164            " jz 1f                     \n\t"
165        "movq   (%1), %%mm0             \n\t"
166        "movq   (%2), %%mm1             \n\t"
167        "add    %4, %1                  \n\t"
168        "add    $8, %2                  \n\t"
169        "pxor %%mm6, %%mm0              \n\t"
170        "pxor %%mm6, %%mm1              \n\t"
171        PAVGB" %%mm1, %%mm0             \n\t"
172        "pxor %%mm6, %%mm0              \n\t"
173        "movq   %%mm0, (%3)             \n\t"
174        "add    %5, %3                  \n\t"
175        "decl   %0                      \n\t"
176        "1:                             \n\t"
177        "movq   (%1), %%mm0             \n\t"
178        "add    %4, %1                  \n\t"
179        "movq   (%1), %%mm1             \n\t"
180        "add    %4, %1                  \n\t"
181        "movq   (%2), %%mm2             \n\t"
182        "movq   8(%2), %%mm3            \n\t"
183        "pxor %%mm6, %%mm0              \n\t"
184        "pxor %%mm6, %%mm1              \n\t"
185        "pxor %%mm6, %%mm2              \n\t"
186        "pxor %%mm6, %%mm3              \n\t"
187        PAVGB" %%mm2, %%mm0             \n\t"
188        PAVGB" %%mm3, %%mm1             \n\t"
189        "pxor %%mm6, %%mm0              \n\t"
190        "pxor %%mm6, %%mm1              \n\t"
191        "movq   %%mm0, (%3)             \n\t"
192        "add    %5, %3                  \n\t"
193        "movq   %%mm1, (%3)             \n\t"
194        "add    %5, %3                  \n\t"
195        "movq   (%1), %%mm0             \n\t"
196        "add    %4, %1                  \n\t"
197        "movq   (%1), %%mm1             \n\t"
198        "add    %4, %1                  \n\t"
199        "movq   16(%2), %%mm2           \n\t"
200        "movq   24(%2), %%mm3           \n\t"
201        "pxor %%mm6, %%mm0              \n\t"
202        "pxor %%mm6, %%mm1              \n\t"
203        "pxor %%mm6, %%mm2              \n\t"
204        "pxor %%mm6, %%mm3              \n\t"
205        PAVGB" %%mm2, %%mm0             \n\t"
206        PAVGB" %%mm3, %%mm1             \n\t"
207        "pxor %%mm6, %%mm0              \n\t"
208        "pxor %%mm6, %%mm1              \n\t"
209        "movq   %%mm0, (%3)             \n\t"
210        "add    %5, %3                  \n\t"
211        "movq   %%mm1, (%3)             \n\t"
212        "add    %5, %3                  \n\t"
213        "add    $32, %2                 \n\t"
214        "subl   $4, %0                  \n\t"
215        "jnz    1b                      \n\t"
216#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
217        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
218#else
219        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
220#endif
221        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
222        :"memory");
223//the following should be used, though better not with gcc ...
224/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
225        :"r"(src1Stride), "r"(dstStride)
226        :"memory");*/
227}
228
229static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
230{
231    __asm__ volatile(
232        "testl $1, %0                   \n\t"
233            " jz 1f                     \n\t"
234        "movd   (%1), %%mm0             \n\t"
235        "movd   (%2), %%mm1             \n\t"
236        "add    %4, %1                  \n\t"
237        "add    $4, %2                  \n\t"
238        PAVGB" %%mm1, %%mm0             \n\t"
239        PAVGB" (%3), %%mm0              \n\t"
240        "movd   %%mm0, (%3)             \n\t"
241        "add    %5, %3                  \n\t"
242        "decl   %0                      \n\t"
243        "1:                             \n\t"
244        "movd   (%1), %%mm0             \n\t"
245        "add    %4, %1                  \n\t"
246        "movd   (%1), %%mm1             \n\t"
247        "add    %4, %1                  \n\t"
248        PAVGB" (%2), %%mm0              \n\t"
249        PAVGB" 4(%2), %%mm1             \n\t"
250        PAVGB" (%3), %%mm0              \n\t"
251        "movd   %%mm0, (%3)             \n\t"
252        "add    %5, %3                  \n\t"
253        PAVGB" (%3), %%mm1              \n\t"
254        "movd   %%mm1, (%3)             \n\t"
255        "add    %5, %3                  \n\t"
256        "movd   (%1), %%mm0             \n\t"
257        "add    %4, %1                  \n\t"
258        "movd   (%1), %%mm1             \n\t"
259        "add    %4, %1                  \n\t"
260        PAVGB" 8(%2), %%mm0             \n\t"
261        PAVGB" 12(%2), %%mm1            \n\t"
262        PAVGB" (%3), %%mm0              \n\t"
263        "movd   %%mm0, (%3)             \n\t"
264        "add    %5, %3                  \n\t"
265        PAVGB" (%3), %%mm1              \n\t"
266        "movd   %%mm1, (%3)             \n\t"
267        "add    %5, %3                  \n\t"
268        "add    $16, %2                 \n\t"
269        "subl   $4, %0                  \n\t"
270        "jnz    1b                      \n\t"
271#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
272        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
273#else
274        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
275#endif
276        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
277        :"memory");
278}
279
280
281static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
282{
283    __asm__ volatile(
284        "testl $1, %0                   \n\t"
285            " jz 1f                     \n\t"
286        "movq   (%1), %%mm0             \n\t"
287        "movq   (%2), %%mm1             \n\t"
288        "add    %4, %1                  \n\t"
289        "add    $8, %2                  \n\t"
290        PAVGB" %%mm1, %%mm0             \n\t"
291        PAVGB" (%3), %%mm0              \n\t"
292        "movq   %%mm0, (%3)             \n\t"
293        "add    %5, %3                  \n\t"
294        "decl   %0                      \n\t"
295        "1:                             \n\t"
296        "movq   (%1), %%mm0             \n\t"
297        "add    %4, %1                  \n\t"
298        "movq   (%1), %%mm1             \n\t"
299        "add    %4, %1                  \n\t"
300        PAVGB" (%2), %%mm0              \n\t"
301        PAVGB" 8(%2), %%mm1             \n\t"
302        PAVGB" (%3), %%mm0              \n\t"
303        "movq   %%mm0, (%3)             \n\t"
304        "add    %5, %3                  \n\t"
305        PAVGB" (%3), %%mm1              \n\t"
306        "movq   %%mm1, (%3)             \n\t"
307        "add    %5, %3                  \n\t"
308        "movq   (%1), %%mm0             \n\t"
309        "add    %4, %1                  \n\t"
310        "movq   (%1), %%mm1             \n\t"
311        "add    %4, %1                  \n\t"
312        PAVGB" 16(%2), %%mm0            \n\t"
313        PAVGB" 24(%2), %%mm1            \n\t"
314        PAVGB" (%3), %%mm0              \n\t"
315        "movq   %%mm0, (%3)             \n\t"
316        "add    %5, %3                  \n\t"
317        PAVGB" (%3), %%mm1              \n\t"
318        "movq   %%mm1, (%3)             \n\t"
319        "add    %5, %3                  \n\t"
320        "add    $32, %2                 \n\t"
321        "subl   $4, %0                  \n\t"
322        "jnz    1b                      \n\t"
323#if !HAVE_EBX_AVAILABLE  //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
324        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
325#else
326        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
327#endif
328        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
329        :"memory");
330//the following should be used, though better not with gcc ...
331/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
332        :"r"(src1Stride), "r"(dstStride)
333        :"memory");*/
334}
335
336static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
337{
338    __asm__ volatile(
339        "lea (%3, %3), %%"REG_a"        \n\t"
340        "1:                             \n\t"
341        "movq (%1), %%mm0               \n\t"
342        "movq (%1, %3), %%mm1           \n\t"
343        "movq 8(%1), %%mm2              \n\t"
344        "movq 8(%1, %3), %%mm3          \n\t"
345        PAVGB" 1(%1), %%mm0             \n\t"
346        PAVGB" 1(%1, %3), %%mm1         \n\t"
347        PAVGB" 9(%1), %%mm2             \n\t"
348        PAVGB" 9(%1, %3), %%mm3         \n\t"
349        "movq %%mm0, (%2)               \n\t"
350        "movq %%mm1, (%2, %3)           \n\t"
351        "movq %%mm2, 8(%2)              \n\t"
352        "movq %%mm3, 8(%2, %3)          \n\t"
353        "add %%"REG_a", %1              \n\t"
354        "add %%"REG_a", %2              \n\t"
355        "movq (%1), %%mm0               \n\t"
356        "movq (%1, %3), %%mm1           \n\t"
357        "movq 8(%1), %%mm2              \n\t"
358        "movq 8(%1, %3), %%mm3          \n\t"
359        PAVGB" 1(%1), %%mm0             \n\t"
360        PAVGB" 1(%1, %3), %%mm1         \n\t"
361        PAVGB" 9(%1), %%mm2             \n\t"
362        PAVGB" 9(%1, %3), %%mm3         \n\t"
363        "add %%"REG_a", %1              \n\t"
364        "movq %%mm0, (%2)               \n\t"
365        "movq %%mm1, (%2, %3)           \n\t"
366        "movq %%mm2, 8(%2)              \n\t"
367        "movq %%mm3, 8(%2, %3)          \n\t"
368        "add %%"REG_a", %2              \n\t"
369        "subl $4, %0                    \n\t"
370        "jnz 1b                         \n\t"
371        :"+g"(h), "+S"(pixels), "+D"(block)
372        :"r" ((x86_reg)line_size)
373        :"%"REG_a, "memory");
374}
375
376static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
377{
378    __asm__ volatile(
379        "testl $1, %0                   \n\t"
380            " jz 1f                     \n\t"
381        "movq   (%1), %%mm0             \n\t"
382        "movq   8(%1), %%mm1            \n\t"
383        PAVGB" (%2), %%mm0              \n\t"
384        PAVGB" 8(%2), %%mm1             \n\t"
385        "add    %4, %1                  \n\t"
386        "add    $16, %2                 \n\t"
387        "movq   %%mm0, (%3)             \n\t"
388        "movq   %%mm1, 8(%3)            \n\t"
389        "add    %5, %3                  \n\t"
390        "decl   %0                      \n\t"
391        "1:                             \n\t"
392        "movq   (%1), %%mm0             \n\t"
393        "movq   8(%1), %%mm1            \n\t"
394        "add    %4, %1                  \n\t"
395        PAVGB" (%2), %%mm0              \n\t"
396        PAVGB" 8(%2), %%mm1             \n\t"
397        "movq   %%mm0, (%3)             \n\t"
398        "movq   %%mm1, 8(%3)            \n\t"
399        "add    %5, %3                  \n\t"
400        "movq   (%1), %%mm0             \n\t"
401        "movq   8(%1), %%mm1            \n\t"
402        "add    %4, %1                  \n\t"
403        PAVGB" 16(%2), %%mm0            \n\t"
404        PAVGB" 24(%2), %%mm1            \n\t"
405        "movq   %%mm0, (%3)             \n\t"
406        "movq   %%mm1, 8(%3)            \n\t"
407        "add    %5, %3                  \n\t"
408        "add    $32, %2                 \n\t"
409        "subl   $2, %0                  \n\t"
410        "jnz    1b                      \n\t"
411#if !HAVE_EBX_AVAILABLE  //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
412        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
413#else
414        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
415#endif
416        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
417        :"memory");
418//the following should be used, though better not with gcc ...
419/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
420        :"r"(src1Stride), "r"(dstStride)
421        :"memory");*/
422}
423
424static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
425{
426    __asm__ volatile(
427        "testl $1, %0                   \n\t"
428            " jz 1f                     \n\t"
429        "movq   (%1), %%mm0             \n\t"
430        "movq   8(%1), %%mm1            \n\t"
431        PAVGB" (%2), %%mm0              \n\t"
432        PAVGB" 8(%2), %%mm1             \n\t"
433        "add    %4, %1                  \n\t"
434        "add    $16, %2                 \n\t"
435        PAVGB" (%3), %%mm0              \n\t"
436        PAVGB" 8(%3), %%mm1             \n\t"
437        "movq   %%mm0, (%3)             \n\t"
438        "movq   %%mm1, 8(%3)            \n\t"
439        "add    %5, %3                  \n\t"
440        "decl   %0                      \n\t"
441        "1:                             \n\t"
442        "movq   (%1), %%mm0             \n\t"
443        "movq   8(%1), %%mm1            \n\t"
444        "add    %4, %1                  \n\t"
445        PAVGB" (%2), %%mm0              \n\t"
446        PAVGB" 8(%2), %%mm1             \n\t"
447        PAVGB" (%3), %%mm0              \n\t"
448        PAVGB" 8(%3), %%mm1             \n\t"
449        "movq   %%mm0, (%3)             \n\t"
450        "movq   %%mm1, 8(%3)            \n\t"
451        "add    %5, %3                  \n\t"
452        "movq   (%1), %%mm0             \n\t"
453        "movq   8(%1), %%mm1            \n\t"
454        "add    %4, %1                  \n\t"
455        PAVGB" 16(%2), %%mm0            \n\t"
456        PAVGB" 24(%2), %%mm1            \n\t"
457        PAVGB" (%3), %%mm0              \n\t"
458        PAVGB" 8(%3), %%mm1             \n\t"
459        "movq   %%mm0, (%3)             \n\t"
460        "movq   %%mm1, 8(%3)            \n\t"
461        "add    %5, %3                  \n\t"
462        "add    $32, %2                 \n\t"
463        "subl   $2, %0                  \n\t"
464        "jnz    1b                      \n\t"
465#if !HAVE_EBX_AVAILABLE  //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
466        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
467#else
468        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
469#endif
470        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
471        :"memory");
472//the following should be used, though better not with gcc ...
473/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
474        :"r"(src1Stride), "r"(dstStride)
475        :"memory");*/
476}
477
478static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
479{
480    __asm__ volatile(
481        "pcmpeqb %%mm6, %%mm6           \n\t"
482        "testl $1, %0                   \n\t"
483            " jz 1f                     \n\t"
484        "movq   (%1), %%mm0             \n\t"
485        "movq   8(%1), %%mm1            \n\t"
486        "movq   (%2), %%mm2             \n\t"
487        "movq   8(%2), %%mm3            \n\t"
488        "pxor %%mm6, %%mm0              \n\t"
489        "pxor %%mm6, %%mm1              \n\t"
490        "pxor %%mm6, %%mm2              \n\t"
491        "pxor %%mm6, %%mm3              \n\t"
492        PAVGB" %%mm2, %%mm0             \n\t"
493        PAVGB" %%mm3, %%mm1             \n\t"
494        "pxor %%mm6, %%mm0              \n\t"
495        "pxor %%mm6, %%mm1              \n\t"
496        "add    %4, %1                  \n\t"
497        "add    $16, %2                 \n\t"
498        "movq   %%mm0, (%3)             \n\t"
499        "movq   %%mm1, 8(%3)            \n\t"
500        "add    %5, %3                  \n\t"
501        "decl   %0                      \n\t"
502        "1:                             \n\t"
503        "movq   (%1), %%mm0             \n\t"
504        "movq   8(%1), %%mm1            \n\t"
505        "add    %4, %1                  \n\t"
506        "movq   (%2), %%mm2             \n\t"
507        "movq   8(%2), %%mm3            \n\t"
508        "pxor %%mm6, %%mm0              \n\t"
509        "pxor %%mm6, %%mm1              \n\t"
510        "pxor %%mm6, %%mm2              \n\t"
511        "pxor %%mm6, %%mm3              \n\t"
512        PAVGB" %%mm2, %%mm0             \n\t"
513        PAVGB" %%mm3, %%mm1             \n\t"
514        "pxor %%mm6, %%mm0              \n\t"
515        "pxor %%mm6, %%mm1              \n\t"
516        "movq   %%mm0, (%3)             \n\t"
517        "movq   %%mm1, 8(%3)            \n\t"
518        "add    %5, %3                  \n\t"
519        "movq   (%1), %%mm0             \n\t"
520        "movq   8(%1), %%mm1            \n\t"
521        "add    %4, %1                  \n\t"
522        "movq   16(%2), %%mm2           \n\t"
523        "movq   24(%2), %%mm3           \n\t"
524        "pxor %%mm6, %%mm0              \n\t"
525        "pxor %%mm6, %%mm1              \n\t"
526        "pxor %%mm6, %%mm2              \n\t"
527        "pxor %%mm6, %%mm3              \n\t"
528        PAVGB" %%mm2, %%mm0             \n\t"
529        PAVGB" %%mm3, %%mm1             \n\t"
530        "pxor %%mm6, %%mm0              \n\t"
531        "pxor %%mm6, %%mm1              \n\t"
532        "movq   %%mm0, (%3)             \n\t"
533        "movq   %%mm1, 8(%3)            \n\t"
534        "add    %5, %3                  \n\t"
535        "add    $32, %2                 \n\t"
536        "subl   $2, %0                  \n\t"
537        "jnz    1b                      \n\t"
538#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
539        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
540#else
541        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
542#endif
543        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
544        :"memory");
545//the following should be used, though better not with gcc ...
546/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
547        :"r"(src1Stride), "r"(dstStride)
548        :"memory");*/
549}
550
551/* GL: this function does incorrect rounding if overflow */
552static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
553{
554    MOVQ_BONE(mm6);
555    __asm__ volatile(
556        "lea (%3, %3), %%"REG_a"        \n\t"
557        "1:                             \n\t"
558        "movq (%1), %%mm0               \n\t"
559        "movq (%1, %3), %%mm2           \n\t"
560        "movq 1(%1), %%mm1              \n\t"
561        "movq 1(%1, %3), %%mm3          \n\t"
562        "add %%"REG_a", %1              \n\t"
563        "psubusb %%mm6, %%mm0           \n\t"
564        "psubusb %%mm6, %%mm2           \n\t"
565        PAVGB" %%mm1, %%mm0             \n\t"
566        PAVGB" %%mm3, %%mm2             \n\t"
567        "movq %%mm0, (%2)               \n\t"
568        "movq %%mm2, (%2, %3)           \n\t"
569        "movq (%1), %%mm0               \n\t"
570        "movq 1(%1), %%mm1              \n\t"
571        "movq (%1, %3), %%mm2           \n\t"
572        "movq 1(%1, %3), %%mm3          \n\t"
573        "add %%"REG_a", %2              \n\t"
574        "add %%"REG_a", %1              \n\t"
575        "psubusb %%mm6, %%mm0           \n\t"
576        "psubusb %%mm6, %%mm2           \n\t"
577        PAVGB" %%mm1, %%mm0             \n\t"
578        PAVGB" %%mm3, %%mm2             \n\t"
579        "movq %%mm0, (%2)               \n\t"
580        "movq %%mm2, (%2, %3)           \n\t"
581        "add %%"REG_a", %2              \n\t"
582        "subl $4, %0                    \n\t"
583        "jnz 1b                         \n\t"
584        :"+g"(h), "+S"(pixels), "+D"(block)
585        :"r" ((x86_reg)line_size)
586        :"%"REG_a, "memory");
587}
588
589static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
590{
591    __asm__ volatile(
592        "lea (%3, %3), %%"REG_a"        \n\t"
593        "movq (%1), %%mm0               \n\t"
594        "sub %3, %2                     \n\t"
595        "1:                             \n\t"
596        "movq (%1, %3), %%mm1           \n\t"
597        "movq (%1, %%"REG_a"), %%mm2    \n\t"
598        "add %%"REG_a", %1              \n\t"
599        PAVGB" %%mm1, %%mm0             \n\t"
600        PAVGB" %%mm2, %%mm1             \n\t"
601        "movq %%mm0, (%2, %3)           \n\t"
602        "movq %%mm1, (%2, %%"REG_a")    \n\t"
603        "movq (%1, %3), %%mm1           \n\t"
604        "movq (%1, %%"REG_a"), %%mm0    \n\t"
605        "add %%"REG_a", %2              \n\t"
606        "add %%"REG_a", %1              \n\t"
607        PAVGB" %%mm1, %%mm2             \n\t"
608        PAVGB" %%mm0, %%mm1             \n\t"
609        "movq %%mm2, (%2, %3)           \n\t"
610        "movq %%mm1, (%2, %%"REG_a")    \n\t"
611        "add %%"REG_a", %2              \n\t"
612        "subl $4, %0                    \n\t"
613        "jnz 1b                         \n\t"
614        :"+g"(h), "+S"(pixels), "+D" (block)
615        :"r" ((x86_reg)line_size)
616        :"%"REG_a, "memory");
617}
618
619/* GL: this function does incorrect rounding if overflow */
620static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
621{
622    MOVQ_BONE(mm6);
623    __asm__ volatile(
624        "lea (%3, %3), %%"REG_a"        \n\t"
625        "movq (%1), %%mm0               \n\t"
626        "sub %3, %2                     \n\t"
627        "1:                             \n\t"
628        "movq (%1, %3), %%mm1           \n\t"
629        "movq (%1, %%"REG_a"), %%mm2    \n\t"
630        "add %%"REG_a", %1              \n\t"
631        "psubusb %%mm6, %%mm1           \n\t"
632        PAVGB" %%mm1, %%mm0             \n\t"
633        PAVGB" %%mm2, %%mm1             \n\t"
634        "movq %%mm0, (%2, %3)           \n\t"
635        "movq %%mm1, (%2, %%"REG_a")    \n\t"
636        "movq (%1, %3), %%mm1           \n\t"
637        "movq (%1, %%"REG_a"), %%mm0    \n\t"
638        "add %%"REG_a", %2              \n\t"
639        "add %%"REG_a", %1              \n\t"
640        "psubusb %%mm6, %%mm1           \n\t"
641        PAVGB" %%mm1, %%mm2             \n\t"
642        PAVGB" %%mm0, %%mm1             \n\t"
643        "movq %%mm2, (%2, %3)           \n\t"
644        "movq %%mm1, (%2, %%"REG_a")    \n\t"
645        "add %%"REG_a", %2              \n\t"
646        "subl $4, %0                    \n\t"
647        "jnz 1b                         \n\t"
648        :"+g"(h), "+S"(pixels), "+D" (block)
649        :"r" ((x86_reg)line_size)
650        :"%"REG_a, "memory");
651}
652
653static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
654{
655    __asm__ volatile(
656        "lea (%3, %3), %%"REG_a"        \n\t"
657        "1:                             \n\t"
658        "movq (%2), %%mm0               \n\t"
659        "movq (%2, %3), %%mm1           \n\t"
660        PAVGB" (%1), %%mm0              \n\t"
661        PAVGB" (%1, %3), %%mm1          \n\t"
662        "movq %%mm0, (%2)               \n\t"
663        "movq %%mm1, (%2, %3)           \n\t"
664        "add %%"REG_a", %1              \n\t"
665        "add %%"REG_a", %2              \n\t"
666        "movq (%2), %%mm0               \n\t"
667        "movq (%2, %3), %%mm1           \n\t"
668        PAVGB" (%1), %%mm0              \n\t"
669        PAVGB" (%1, %3), %%mm1          \n\t"
670        "add %%"REG_a", %1              \n\t"
671        "movq %%mm0, (%2)               \n\t"
672        "movq %%mm1, (%2, %3)           \n\t"
673        "add %%"REG_a", %2              \n\t"
674        "subl $4, %0                    \n\t"
675        "jnz 1b                         \n\t"
676        :"+g"(h), "+S"(pixels), "+D"(block)
677        :"r" ((x86_reg)line_size)
678        :"%"REG_a, "memory");
679}
680
681static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
682{
683    __asm__ volatile(
684        "lea (%3, %3), %%"REG_a"        \n\t"
685        "1:                             \n\t"
686        "movq (%1), %%mm0               \n\t"
687        "movq (%1, %3), %%mm2           \n\t"
688        PAVGB" 1(%1), %%mm0             \n\t"
689        PAVGB" 1(%1, %3), %%mm2         \n\t"
690        PAVGB" (%2), %%mm0              \n\t"
691        PAVGB" (%2, %3), %%mm2          \n\t"
692        "add %%"REG_a", %1              \n\t"
693        "movq %%mm0, (%2)               \n\t"
694        "movq %%mm2, (%2, %3)           \n\t"
695        "movq (%1), %%mm0               \n\t"
696        "movq (%1, %3), %%mm2           \n\t"
697        PAVGB" 1(%1), %%mm0             \n\t"
698        PAVGB" 1(%1, %3), %%mm2         \n\t"
699        "add %%"REG_a", %2              \n\t"
700        "add %%"REG_a", %1              \n\t"
701        PAVGB" (%2), %%mm0              \n\t"
702        PAVGB" (%2, %3), %%mm2          \n\t"
703        "movq %%mm0, (%2)               \n\t"
704        "movq %%mm2, (%2, %3)           \n\t"
705        "add %%"REG_a", %2              \n\t"
706        "subl $4, %0                    \n\t"
707        "jnz 1b                         \n\t"
708        :"+g"(h), "+S"(pixels), "+D"(block)
709        :"r" ((x86_reg)line_size)
710        :"%"REG_a, "memory");
711}
712
713static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
714{
715    __asm__ volatile(
716        "lea (%3, %3), %%"REG_a"        \n\t"
717        "movq (%1), %%mm0               \n\t"
718        "sub %3, %2                     \n\t"
719        "1:                             \n\t"
720        "movq (%1, %3), %%mm1           \n\t"
721        "movq (%1, %%"REG_a"), %%mm2    \n\t"
722        "add %%"REG_a", %1              \n\t"
723        PAVGB" %%mm1, %%mm0             \n\t"
724        PAVGB" %%mm2, %%mm1             \n\t"
725        "movq (%2, %3), %%mm3           \n\t"
726        "movq (%2, %%"REG_a"), %%mm4    \n\t"
727        PAVGB" %%mm3, %%mm0             \n\t"
728        PAVGB" %%mm4, %%mm1             \n\t"
729        "movq %%mm0, (%2, %3)           \n\t"
730        "movq %%mm1, (%2, %%"REG_a")    \n\t"
731        "movq (%1, %3), %%mm1           \n\t"
732        "movq (%1, %%"REG_a"), %%mm0    \n\t"
733        PAVGB" %%mm1, %%mm2             \n\t"
734        PAVGB" %%mm0, %%mm1             \n\t"
735        "add %%"REG_a", %2              \n\t"
736        "add %%"REG_a", %1              \n\t"
737        "movq (%2, %3), %%mm3           \n\t"
738        "movq (%2, %%"REG_a"), %%mm4    \n\t"
739        PAVGB" %%mm3, %%mm2             \n\t"
740        PAVGB" %%mm4, %%mm1             \n\t"
741        "movq %%mm2, (%2, %3)           \n\t"
742        "movq %%mm1, (%2, %%"REG_a")    \n\t"
743        "add %%"REG_a", %2              \n\t"
744        "subl $4, %0                    \n\t"
745        "jnz 1b                         \n\t"
746        :"+g"(h), "+S"(pixels), "+D"(block)
747        :"r" ((x86_reg)line_size)
748        :"%"REG_a, "memory");
749}
750
751/* Note this is not correctly rounded, but this function is only
752 * used for B-frames so it does not matter. */
753static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
754{
755    MOVQ_BONE(mm6);
756    __asm__ volatile(
757        "lea (%3, %3), %%"REG_a"        \n\t"
758        "movq (%1), %%mm0               \n\t"
759        PAVGB" 1(%1), %%mm0             \n\t"
760         ASMALIGN(3)
761        "1:                             \n\t"
762        "movq (%1, %%"REG_a"), %%mm2    \n\t"
763        "movq (%1, %3), %%mm1           \n\t"
764        "psubusb %%mm6, %%mm2           \n\t"
765        PAVGB" 1(%1, %3), %%mm1         \n\t"
766        PAVGB" 1(%1, %%"REG_a"), %%mm2  \n\t"
767        "add %%"REG_a", %1              \n\t"
768        PAVGB" %%mm1, %%mm0             \n\t"
769        PAVGB" %%mm2, %%mm1             \n\t"
770        PAVGB" (%2), %%mm0              \n\t"
771        PAVGB" (%2, %3), %%mm1          \n\t"
772        "movq %%mm0, (%2)               \n\t"
773        "movq %%mm1, (%2, %3)           \n\t"
774        "movq (%1, %3), %%mm1           \n\t"
775        "movq (%1, %%"REG_a"), %%mm0    \n\t"
776        PAVGB" 1(%1, %3), %%mm1         \n\t"
777        PAVGB" 1(%1, %%"REG_a"), %%mm0  \n\t"
778        "add %%"REG_a", %2              \n\t"
779        "add %%"REG_a", %1              \n\t"
780        PAVGB" %%mm1, %%mm2             \n\t"
781        PAVGB" %%mm0, %%mm1             \n\t"
782        PAVGB" (%2), %%mm2              \n\t"
783        PAVGB" (%2, %3), %%mm1          \n\t"
784        "movq %%mm2, (%2)               \n\t"
785        "movq %%mm1, (%2, %3)           \n\t"
786        "add %%"REG_a", %2              \n\t"
787        "subl $4, %0                    \n\t"
788        "jnz 1b                         \n\t"
789        :"+g"(h), "+S"(pixels), "+D"(block)
790        :"r" ((x86_reg)line_size)
791        :"%"REG_a,  "memory");
792}
793
794static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
795{
796    do {
797        __asm__ volatile(
798            "movd (%1), %%mm0               \n\t"
799            "movd (%1, %2), %%mm1           \n\t"
800            "movd (%1, %2, 2), %%mm2        \n\t"
801            "movd (%1, %3), %%mm3           \n\t"
802            PAVGB" (%0), %%mm0              \n\t"
803            PAVGB" (%0, %2), %%mm1          \n\t"
804            PAVGB" (%0, %2, 2), %%mm2       \n\t"
805            PAVGB" (%0, %3), %%mm3          \n\t"
806            "movd %%mm0, (%1)               \n\t"
807            "movd %%mm1, (%1, %2)           \n\t"
808            "movd %%mm2, (%1, %2, 2)        \n\t"
809            "movd %%mm3, (%1, %3)           \n\t"
810            ::"S"(pixels), "D"(block),
811             "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size)
812            :"memory");
813        block += 4*line_size;
814        pixels += 4*line_size;
815        h -= 4;
816    } while(h > 0);
817}
818
819//FIXME the following could be optimized too ...
820static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
821    DEF(put_no_rnd_pixels8_x2)(block  , pixels  , line_size, h);
822    DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
823}
824static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
825    DEF(put_pixels8_y2)(block  , pixels  , line_size, h);
826    DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
827}
828static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
829    DEF(put_no_rnd_pixels8_y2)(block  , pixels  , line_size, h);
830    DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
831}
832static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
833    DEF(avg_pixels8)(block  , pixels  , line_size, h);
834    DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
835}
836static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
837    DEF(avg_pixels8_x2)(block  , pixels  , line_size, h);
838    DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
839}
840static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
841    DEF(avg_pixels8_y2)(block  , pixels  , line_size, h);
842    DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
843}
844static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
845    DEF(avg_pixels8_xy2)(block  , pixels  , line_size, h);
846    DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
847}
848
/*
 * QPEL_2TAP_L3(OPNAME) defines two functions, <OPNAME>2tap_qpel16_l3 and
 * <OPNAME>2tap_qpel8_l3, that combine three source positions per row:
 *
 *     t = PAVGB(PAVGB(src[off1 + x], src[off2 + x]), src[x])
 *
 * The 16-wide version processes two quadwords per row, the 8-wide version
 * one. STORE_OP must be defined by the user of this macro before expanding
 * it; it is emitted right before the final store and may combine t with the
 * bytes already at the destination, or expand to nothing.
 *
 * The destination is addressed as src + (dst - src), so only src has to be
 * advanced by stride each row. The loop decrements h once per row and exits
 * only when it reaches zero, so h is expected to be positive.
 *
 * Operands: %0 = h, %1 = src, %2 = off1, %3 = off2, %4 = dst - src,
 *           %5 = stride.
 */
#define QPEL_2TAP_L3(OPNAME) \
static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
    __asm__ volatile(\
        "1:                    \n\t"\
        "movq   (%1,%2), %%mm0 \n\t"\
        "movq  8(%1,%2), %%mm1 \n\t"\
        PAVGB"  (%1,%3), %%mm0 \n\t"\
        PAVGB" 8(%1,%3), %%mm1 \n\t"\
        PAVGB"  (%1),    %%mm0 \n\t"\
        PAVGB" 8(%1),    %%mm1 \n\t"\
        STORE_OP( (%1,%4),%%mm0)\
        STORE_OP(8(%1,%4),%%mm1)\
        "movq  %%mm0,  (%1,%4) \n\t"\
        "movq  %%mm1, 8(%1,%4) \n\t"\
        "add   %5, %1          \n\t"\
        "decl  %0              \n\t"\
        "jnz   1b              \n\t"\
        :"+g"(h), "+r"(src)\
        :"r"((x86_reg)off1), "r"((x86_reg)off2),\
         "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
        :"memory"\
    );\
}\
static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
    __asm__ volatile(\
        "1:                    \n\t"\
        "movq   (%1,%2), %%mm0 \n\t"\
        PAVGB"  (%1,%3), %%mm0 \n\t"\
        PAVGB"  (%1),    %%mm0 \n\t"\
        STORE_OP((%1,%4),%%mm0)\
        "movq  %%mm0,  (%1,%4) \n\t"\
        "add   %5, %1          \n\t"\
        "decl  %0              \n\t"\
        "jnz   1b              \n\t"\
        :"+g"(h), "+r"(src)\
        :"r"((x86_reg)off1), "r"((x86_reg)off2),\
         "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
        :"memory"\
    );\
}
889
890#define STORE_OP(a,b) PAVGB" "#a","#b" \n\t"
891QPEL_2TAP_L3(avg_)
892#undef STORE_OP
893#define STORE_OP(a,b)
894QPEL_2TAP_L3(put_)
895#undef STORE_OP
896#undef QPEL_2TAP_L3
897