1/*
2 * DSP utils : average functions are compiled twice for 3dnow/mmx2
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer
5 *
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
8 * and improved by Zdenek Kabelac <kabi@users.sf.net>
9 *
10 * This file is part of Libav.
11 *
12 * Libav is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
16 *
17 * Libav is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 * Lesser General Public License for more details.
21 *
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with Libav; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
28   clobber bug - now it will work with 2.95.2 and also with -fPIC
29 */
/**
 * Write 8-byte-wide rows to block, each row being PAVGB of the 8 source
 * bytes at pixels and the 8 source bytes at pixels + 1 (horizontal
 * neighbour averaging).
 *
 * PAVGB is a macro defined outside this block; presumably it expands to the
 * packed-byte average instruction of the target (MMX2 / 3DNow!) -- confirm
 * against the including file.
 *
 * @param block     destination pointer, advanced by line_size per row
 * @param pixels    source pointer, advanced by line_size per row
 * @param line_size stride in bytes, used for both source and destination
 * @param h         row count; each loop pass handles 4 rows and the loop
 *                  only exits when h reaches exactly 0, so h is expected to
 *                  be a positive multiple of 4
 *
 * Uses mm0/mm1 and REG_a (holds 2 * line_size). Only REG_a and "memory" are
 * listed as clobbers.
 */
30static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
31{
32    __asm__ volatile(
        /* REG_a = 2 * line_size: step used to advance two rows at a time */
33        "lea (%3, %3), %%"REG_a"        \n\t"
34        "1:                             \n\t"
        /* rows 0 and 1 of this pass */
35        "movq (%1), %%mm0               \n\t"
36        "movq (%1, %3), %%mm1           \n\t"
37        PAVGB" 1(%1), %%mm0             \n\t"
38        PAVGB" 1(%1, %3), %%mm1         \n\t"
39        "movq %%mm0, (%2)               \n\t"
40        "movq %%mm1, (%2, %3)           \n\t"
41        "add %%"REG_a", %1              \n\t"
42        "add %%"REG_a", %2              \n\t"
        /* rows 2 and 3 of this pass */
43        "movq (%1), %%mm0               \n\t"
44        "movq (%1, %3), %%mm1           \n\t"
45        PAVGB" 1(%1), %%mm0             \n\t"
46        PAVGB" 1(%1, %3), %%mm1         \n\t"
47        "add %%"REG_a", %1              \n\t"
48        "movq %%mm0, (%2)               \n\t"
49        "movq %%mm1, (%2, %3)           \n\t"
50        "add %%"REG_a", %2              \n\t"
        /* four rows done */
51        "subl $4, %0                    \n\t"
52        "jnz 1b                         \n\t"
53        :"+g"(h), "+S"(pixels), "+D"(block)
54        :"r" ((x86_reg)line_size)
55        :"%"REG_a, "memory");
56}
57
/**
 * Write 4-byte-wide rows to dst, each row being PAVGB of a row from src1 and
 * a row from src2.
 *
 * src1 is walked with stride src1Stride; src2 is read as tightly packed
 * 4-byte rows (it advances 4 bytes per row); dst is walked with dstStride.
 *
 * PAVGB is a macro defined outside this block; presumably the packed-byte
 * average instruction of the target -- confirm against the including file.
 *
 * @param dst        destination pointer
 * @param src1       first source, stride src1Stride
 * @param src2       second source, consecutive 4-byte rows
 * @param dstStride  destination stride in bytes
 * @param src1Stride stride of src1 in bytes
 * @param h          row count. If h is odd, one row is processed before the
 *                   main loop. The main loop is then entered unconditionally,
 *                   handles 4 rows per pass and exits only when the counter
 *                   reaches exactly 0, so the count remaining after the
 *                   optional odd row must be a positive multiple of 4.
 *
 * Uses mm0-mm3; only "memory" is listed as a clobber.
 */
58static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
59{
60    __asm__ volatile(
        /* odd h: process a single row first, then fall into the loop */
61        "testl $1, %0                   \n\t"
62            " jz 1f                     \n\t"
63        "movd   (%1), %%mm0             \n\t"
64        "movd   (%2), %%mm1             \n\t"
65        "add    %4, %1                  \n\t"
66        "add    $4, %2                  \n\t"
67        PAVGB" %%mm1, %%mm0             \n\t"
68        "movd   %%mm0, (%3)             \n\t"
69        "add    %5, %3                  \n\t"
70        "decl   %0                      \n\t"
71        "1:                             \n\t"
        /* rows 0 and 1 of this pass (src2 offsets 0 and 4) */
72        "movd   (%1), %%mm0             \n\t"
73        "add    %4, %1                  \n\t"
74        "movd   (%1), %%mm1             \n\t"
75        "movd   (%2), %%mm2             \n\t"
76        "movd   4(%2), %%mm3            \n\t"
77        "add    %4, %1                  \n\t"
78        PAVGB" %%mm2, %%mm0             \n\t"
79        PAVGB" %%mm3, %%mm1             \n\t"
80        "movd   %%mm0, (%3)             \n\t"
81        "add    %5, %3                  \n\t"
82        "movd   %%mm1, (%3)             \n\t"
83        "add    %5, %3                  \n\t"
        /* rows 2 and 3 of this pass (src2 offsets 8 and 12) */
84        "movd   (%1), %%mm0             \n\t"
85        "add    %4, %1                  \n\t"
86        "movd   (%1), %%mm1             \n\t"
87        "movd   8(%2), %%mm2            \n\t"
88        "movd   12(%2), %%mm3           \n\t"
89        "add    %4, %1                  \n\t"
90        PAVGB" %%mm2, %%mm0             \n\t"
91        PAVGB" %%mm3, %%mm1             \n\t"
92        "movd   %%mm0, (%3)             \n\t"
93        "add    %5, %3                  \n\t"
94        "movd   %%mm1, (%3)             \n\t"
95        "add    %5, %3                  \n\t"
        /* four packed 4-byte src2 rows consumed */
96        "add    $16, %2                 \n\t"
97        "subl   $4, %0                  \n\t"
98        "jnz    1b                      \n\t"
        /* h lives in memory when EBX is not available, otherwise in EBX */
99#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
100        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
101#else
102        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
103#endif
104        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
105        :"memory");
106}
107
108
/**
 * Write 8-byte-wide rows to dst, each row being PAVGB of a row from src1 and
 * a row from src2.
 *
 * src1 is walked with stride src1Stride; src2 is read as tightly packed
 * 8-byte rows (it advances 8 bytes per row); dst is walked with dstStride.
 *
 * PAVGB is a macro defined outside this block; presumably the packed-byte
 * average instruction of the target -- confirm against the including file.
 *
 * @param dst        destination pointer
 * @param src1       first source, stride src1Stride
 * @param src2       second source, consecutive 8-byte rows
 * @param dstStride  destination stride in bytes
 * @param src1Stride stride of src1 in bytes
 * @param h          row count. If h is odd, one row is processed before the
 *                   main loop. The main loop is then entered unconditionally,
 *                   handles 4 rows per pass and exits only when the counter
 *                   reaches exactly 0, so the count remaining after the
 *                   optional odd row must be a positive multiple of 4.
 *
 * Uses mm0/mm1; only "memory" is listed as a clobber.
 */
109static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
110{
111    __asm__ volatile(
        /* odd h: process a single row first, then fall into the loop */
112        "testl $1, %0                   \n\t"
113            " jz 1f                     \n\t"
114        "movq   (%1), %%mm0             \n\t"
115        "movq   (%2), %%mm1             \n\t"
116        "add    %4, %1                  \n\t"
117        "add    $8, %2                  \n\t"
118        PAVGB" %%mm1, %%mm0             \n\t"
119        "movq   %%mm0, (%3)             \n\t"
120        "add    %5, %3                  \n\t"
121        "decl   %0                      \n\t"
122        "1:                             \n\t"
        /* rows 0 and 1 of this pass (src2 offsets 0 and 8) */
123        "movq   (%1), %%mm0             \n\t"
124        "add    %4, %1                  \n\t"
125        "movq   (%1), %%mm1             \n\t"
126        "add    %4, %1                  \n\t"
127        PAVGB" (%2), %%mm0              \n\t"
128        PAVGB" 8(%2), %%mm1             \n\t"
129        "movq   %%mm0, (%3)             \n\t"
130        "add    %5, %3                  \n\t"
131        "movq   %%mm1, (%3)             \n\t"
132        "add    %5, %3                  \n\t"
        /* rows 2 and 3 of this pass (src2 offsets 16 and 24) */
133        "movq   (%1), %%mm0             \n\t"
134        "add    %4, %1                  \n\t"
135        "movq   (%1), %%mm1             \n\t"
136        "add    %4, %1                  \n\t"
137        PAVGB" 16(%2), %%mm0            \n\t"
138        PAVGB" 24(%2), %%mm1            \n\t"
139        "movq   %%mm0, (%3)             \n\t"
140        "add    %5, %3                  \n\t"
141        "movq   %%mm1, (%3)             \n\t"
142        "add    %5, %3                  \n\t"
        /* four packed 8-byte src2 rows consumed */
143        "add    $32, %2                 \n\t"
144        "subl   $4, %0                  \n\t"
145        "jnz    1b                      \n\t"
        /* h lives in memory when EBX is not available, otherwise in EBX */
146#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
147        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
148#else
149        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
150#endif
151        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
152        :"memory");
153//the following should be used, though better not with gcc ...
154/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
155        :"r"(src1Stride), "r"(dstStride)
156        :"memory");*/
157}
158
/**
 * "No rounding" variant of the 8-byte-wide two-source average: every input
 * is bitwise inverted (XOR with all-ones in mm6), PAVGB is applied, and the
 * result is inverted again before being stored to dst.
 *
 * Assuming PAVGB (macro defined outside this block) computes the
 * round-up byte average (a + b + 1) >> 1, the double inversion yields
 * (a + b) >> 1 -- confirm PAVGB semantics against the including file.
 *
 * src1 is walked with stride src1Stride; src2 is read as tightly packed
 * 8-byte rows (it advances 8 bytes per row); dst is walked with dstStride.
 *
 * @param dst        destination pointer
 * @param src1       first source, stride src1Stride
 * @param src2       second source, consecutive 8-byte rows
 * @param dstStride  destination stride in bytes
 * @param src1Stride stride of src1 in bytes
 * @param h          row count. If h is odd, one row is processed before the
 *                   main loop. The main loop is then entered unconditionally,
 *                   handles 4 rows per pass and exits only when the counter
 *                   reaches exactly 0, so the count remaining after the
 *                   optional odd row must be a positive multiple of 4.
 *
 * Uses mm0-mm3 and mm6; only "memory" is listed as a clobber.
 */
159static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
160{
161    __asm__ volatile(
        /* mm6 = all bits set, used as the inversion mask */
162        "pcmpeqb %%mm6, %%mm6           \n\t"
        /* odd h: process a single row first, then fall into the loop */
163        "testl $1, %0                   \n\t"
164            " jz 1f                     \n\t"
165        "movq   (%1), %%mm0             \n\t"
166        "movq   (%2), %%mm1             \n\t"
167        "add    %4, %1                  \n\t"
168        "add    $8, %2                  \n\t"
169        "pxor %%mm6, %%mm0              \n\t"
170        "pxor %%mm6, %%mm1              \n\t"
171        PAVGB" %%mm1, %%mm0             \n\t"
172        "pxor %%mm6, %%mm0              \n\t"
173        "movq   %%mm0, (%3)             \n\t"
174        "add    %5, %3                  \n\t"
175        "decl   %0                      \n\t"
176        "1:                             \n\t"
        /* rows 0 and 1 of this pass (src2 offsets 0 and 8) */
177        "movq   (%1), %%mm0             \n\t"
178        "add    %4, %1                  \n\t"
179        "movq   (%1), %%mm1             \n\t"
180        "add    %4, %1                  \n\t"
181        "movq   (%2), %%mm2             \n\t"
182        "movq   8(%2), %%mm3            \n\t"
183        "pxor %%mm6, %%mm0              \n\t"
184        "pxor %%mm6, %%mm1              \n\t"
185        "pxor %%mm6, %%mm2              \n\t"
186        "pxor %%mm6, %%mm3              \n\t"
187        PAVGB" %%mm2, %%mm0             \n\t"
188        PAVGB" %%mm3, %%mm1             \n\t"
189        "pxor %%mm6, %%mm0              \n\t"
190        "pxor %%mm6, %%mm1              \n\t"
191        "movq   %%mm0, (%3)             \n\t"
192        "add    %5, %3                  \n\t"
193        "movq   %%mm1, (%3)             \n\t"
194        "add    %5, %3                  \n\t"
        /* rows 2 and 3 of this pass (src2 offsets 16 and 24) */
195        "movq   (%1), %%mm0             \n\t"
196        "add    %4, %1                  \n\t"
197        "movq   (%1), %%mm1             \n\t"
198        "add    %4, %1                  \n\t"
199        "movq   16(%2), %%mm2           \n\t"
200        "movq   24(%2), %%mm3           \n\t"
201        "pxor %%mm6, %%mm0              \n\t"
202        "pxor %%mm6, %%mm1              \n\t"
203        "pxor %%mm6, %%mm2              \n\t"
204        "pxor %%mm6, %%mm3              \n\t"
205        PAVGB" %%mm2, %%mm0             \n\t"
206        PAVGB" %%mm3, %%mm1             \n\t"
207        "pxor %%mm6, %%mm0              \n\t"
208        "pxor %%mm6, %%mm1              \n\t"
209        "movq   %%mm0, (%3)             \n\t"
210        "add    %5, %3                  \n\t"
211        "movq   %%mm1, (%3)             \n\t"
212        "add    %5, %3                  \n\t"
        /* four packed 8-byte src2 rows consumed */
213        "add    $32, %2                 \n\t"
214        "subl   $4, %0                  \n\t"
215        "jnz    1b                      \n\t"
        /* h lives in memory when EBX is not available, otherwise in EBX */
216#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
217        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
218#else
219        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
220#endif
221        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
222        :"memory");
223//the following should be used, though better not with gcc ...
224/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
225        :"r"(src1Stride), "r"(dstStride)
226        :"memory");*/
227}
228
/**
 * 4-byte-wide two-source average that is additionally averaged into dst:
 * each row is PAVGB(src1 row, src2 row), then PAVGB with the bytes already
 * at dst, and the low 4 bytes of the result are stored back to dst.
 *
 * src1 is walked with stride src1Stride; src2 is read as tightly packed
 * 4-byte rows (it advances 4 bytes per row); dst is walked with dstStride.
 *
 * PAVGB is a macro defined outside this block; presumably the packed-byte
 * average instruction of the target -- confirm against the including file.
 *
 * NOTE(review): PAVGB is used here with memory operands on MMX registers
 * ((%2), 4(%2), (%3), ...). If it is the usual 64-bit MMX instruction, those
 * operands read 8 bytes although only 4 are stored with movd -- confirm that
 * the over-read of src2/dst is acceptable for all callers.
 *
 * @param dst        destination pointer (read and written)
 * @param src1       first source, stride src1Stride
 * @param src2       second source, consecutive 4-byte rows
 * @param dstStride  destination stride in bytes
 * @param src1Stride stride of src1 in bytes
 * @param h          row count. If h is odd, one row is processed before the
 *                   main loop. The main loop is then entered unconditionally,
 *                   handles 4 rows per pass and exits only when the counter
 *                   reaches exactly 0, so the count remaining after the
 *                   optional odd row must be a positive multiple of 4.
 *
 * Uses mm0/mm1; only "memory" is listed as a clobber.
 */
229static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
230{
231    __asm__ volatile(
        /* odd h: process a single row first, then fall into the loop */
232        "testl $1, %0                   \n\t"
233            " jz 1f                     \n\t"
234        "movd   (%1), %%mm0             \n\t"
235        "movd   (%2), %%mm1             \n\t"
236        "add    %4, %1                  \n\t"
237        "add    $4, %2                  \n\t"
238        PAVGB" %%mm1, %%mm0             \n\t"
        /* blend with the existing destination row */
239        PAVGB" (%3), %%mm0              \n\t"
240        "movd   %%mm0, (%3)             \n\t"
241        "add    %5, %3                  \n\t"
242        "decl   %0                      \n\t"
243        "1:                             \n\t"
        /* rows 0 and 1 of this pass (src2 offsets 0 and 4) */
244        "movd   (%1), %%mm0             \n\t"
245        "add    %4, %1                  \n\t"
246        "movd   (%1), %%mm1             \n\t"
247        "add    %4, %1                  \n\t"
248        PAVGB" (%2), %%mm0              \n\t"
249        PAVGB" 4(%2), %%mm1             \n\t"
250        PAVGB" (%3), %%mm0              \n\t"
251        "movd   %%mm0, (%3)             \n\t"
252        "add    %5, %3                  \n\t"
253        PAVGB" (%3), %%mm1              \n\t"
254        "movd   %%mm1, (%3)             \n\t"
255        "add    %5, %3                  \n\t"
        /* rows 2 and 3 of this pass (src2 offsets 8 and 12) */
256        "movd   (%1), %%mm0             \n\t"
257        "add    %4, %1                  \n\t"
258        "movd   (%1), %%mm1             \n\t"
259        "add    %4, %1                  \n\t"
260        PAVGB" 8(%2), %%mm0             \n\t"
261        PAVGB" 12(%2), %%mm1            \n\t"
262        PAVGB" (%3), %%mm0              \n\t"
263        "movd   %%mm0, (%3)             \n\t"
264        "add    %5, %3                  \n\t"
265        PAVGB" (%3), %%mm1              \n\t"
266        "movd   %%mm1, (%3)             \n\t"
267        "add    %5, %3                  \n\t"
        /* four packed 4-byte src2 rows consumed */
268        "add    $16, %2                 \n\t"
269        "subl   $4, %0                  \n\t"
270        "jnz    1b                      \n\t"
        /* h lives in memory when EBX is not available, otherwise in EBX */
271#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
272        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
273#else
274        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
275#endif
276        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
277        :"memory");
278}
279
280
/**
 * 8-byte-wide two-source average that is additionally averaged into dst:
 * each row is PAVGB(src1 row, src2 row), then PAVGB with the 8 bytes already
 * at dst, and the result is stored back to dst.
 *
 * src1 is walked with stride src1Stride; src2 is read as tightly packed
 * 8-byte rows (it advances 8 bytes per row); dst is walked with dstStride.
 *
 * PAVGB is a macro defined outside this block; presumably the packed-byte
 * average instruction of the target -- confirm against the including file.
 *
 * @param dst        destination pointer (read and written)
 * @param src1       first source, stride src1Stride
 * @param src2       second source, consecutive 8-byte rows
 * @param dstStride  destination stride in bytes
 * @param src1Stride stride of src1 in bytes
 * @param h          row count. If h is odd, one row is processed before the
 *                   main loop. The main loop is then entered unconditionally,
 *                   handles 4 rows per pass and exits only when the counter
 *                   reaches exactly 0, so the count remaining after the
 *                   optional odd row must be a positive multiple of 4.
 *
 * Uses mm0/mm1; only "memory" is listed as a clobber.
 */
281static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
282{
283    __asm__ volatile(
        /* odd h: process a single row first, then fall into the loop */
284        "testl $1, %0                   \n\t"
285            " jz 1f                     \n\t"
286        "movq   (%1), %%mm0             \n\t"
287        "movq   (%2), %%mm1             \n\t"
288        "add    %4, %1                  \n\t"
289        "add    $8, %2                  \n\t"
290        PAVGB" %%mm1, %%mm0             \n\t"
        /* blend with the existing destination row */
291        PAVGB" (%3), %%mm0              \n\t"
292        "movq   %%mm0, (%3)             \n\t"
293        "add    %5, %3                  \n\t"
294        "decl   %0                      \n\t"
295        "1:                             \n\t"
        /* rows 0 and 1 of this pass (src2 offsets 0 and 8) */
296        "movq   (%1), %%mm0             \n\t"
297        "add    %4, %1                  \n\t"
298        "movq   (%1), %%mm1             \n\t"
299        "add    %4, %1                  \n\t"
300        PAVGB" (%2), %%mm0              \n\t"
301        PAVGB" 8(%2), %%mm1             \n\t"
302        PAVGB" (%3), %%mm0              \n\t"
303        "movq   %%mm0, (%3)             \n\t"
304        "add    %5, %3                  \n\t"
305        PAVGB" (%3), %%mm1              \n\t"
306        "movq   %%mm1, (%3)             \n\t"
307        "add    %5, %3                  \n\t"
        /* rows 2 and 3 of this pass (src2 offsets 16 and 24) */
308        "movq   (%1), %%mm0             \n\t"
309        "add    %4, %1                  \n\t"
310        "movq   (%1), %%mm1             \n\t"
311        "add    %4, %1                  \n\t"
312        PAVGB" 16(%2), %%mm0            \n\t"
313        PAVGB" 24(%2), %%mm1            \n\t"
314        PAVGB" (%3), %%mm0              \n\t"
315        "movq   %%mm0, (%3)             \n\t"
316        "add    %5, %3                  \n\t"
317        PAVGB" (%3), %%mm1              \n\t"
318        "movq   %%mm1, (%3)             \n\t"
319        "add    %5, %3                  \n\t"
        /* four packed 8-byte src2 rows consumed */
320        "add    $32, %2                 \n\t"
321        "subl   $4, %0                  \n\t"
322        "jnz    1b                      \n\t"
        /* h lives in memory when EBX is not available, otherwise in EBX */
323#if !HAVE_EBX_AVAILABLE  //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
324        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
325#else
326        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
327#endif
328        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
329        :"memory");
330//the following should be used, though better not with gcc ...
331/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
332        :"r"(src1Stride), "r"(dstStride)
333        :"memory");*/
334}
335
/**
 * Write 16-byte-wide rows to block, each row being PAVGB of the 16 source
 * bytes at pixels and the 16 source bytes at pixels + 1 (horizontal
 * neighbour averaging). Each row is handled as two 8-byte halves
 * (offsets 0 and 8, averaged with offsets 1 and 9).
 *
 * PAVGB is a macro defined outside this block; presumably the packed-byte
 * average instruction of the target -- confirm against the including file.
 *
 * @param block     destination pointer, advanced by line_size per row
 * @param pixels    source pointer, advanced by line_size per row
 * @param line_size stride in bytes, used for both source and destination
 * @param h         row count; each loop pass handles 4 rows and the loop
 *                  only exits when h reaches exactly 0, so h is expected to
 *                  be a positive multiple of 4
 *
 * Uses mm0-mm3 and REG_a (holds 2 * line_size). Only REG_a and "memory" are
 * listed as clobbers.
 */
336static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
337{
338    __asm__ volatile(
        /* REG_a = 2 * line_size: step used to advance two rows at a time */
339        "lea (%3, %3), %%"REG_a"        \n\t"
340        "1:                             \n\t"
        /* rows 0 and 1 of this pass, left and right 8-byte halves */
341        "movq (%1), %%mm0               \n\t"
342        "movq (%1, %3), %%mm1           \n\t"
343        "movq 8(%1), %%mm2              \n\t"
344        "movq 8(%1, %3), %%mm3          \n\t"
345        PAVGB" 1(%1), %%mm0             \n\t"
346        PAVGB" 1(%1, %3), %%mm1         \n\t"
347        PAVGB" 9(%1), %%mm2             \n\t"
348        PAVGB" 9(%1, %3), %%mm3         \n\t"
349        "movq %%mm0, (%2)               \n\t"
350        "movq %%mm1, (%2, %3)           \n\t"
351        "movq %%mm2, 8(%2)              \n\t"
352        "movq %%mm3, 8(%2, %3)          \n\t"
353        "add %%"REG_a", %1              \n\t"
354        "add %%"REG_a", %2              \n\t"
        /* rows 2 and 3 of this pass */
355        "movq (%1), %%mm0               \n\t"
356        "movq (%1, %3), %%mm1           \n\t"
357        "movq 8(%1), %%mm2              \n\t"
358        "movq 8(%1, %3), %%mm3          \n\t"
359        PAVGB" 1(%1), %%mm0             \n\t"
360        PAVGB" 1(%1, %3), %%mm1         \n\t"
361        PAVGB" 9(%1), %%mm2             \n\t"
362        PAVGB" 9(%1, %3), %%mm3         \n\t"
363        "add %%"REG_a", %1              \n\t"
364        "movq %%mm0, (%2)               \n\t"
365        "movq %%mm1, (%2, %3)           \n\t"
366        "movq %%mm2, 8(%2)              \n\t"
367        "movq %%mm3, 8(%2, %3)          \n\t"
368        "add %%"REG_a", %2              \n\t"
        /* four rows done */
369        "subl $4, %0                    \n\t"
370        "jnz 1b                         \n\t"
371        :"+g"(h), "+S"(pixels), "+D"(block)
372        :"r" ((x86_reg)line_size)
373        :"%"REG_a, "memory");
374}
375
/**
 * Write 16-byte-wide rows to dst, each row being PAVGB of a row from src1
 * and a row from src2. Each row is handled as two 8-byte halves.
 *
 * src1 is walked with stride src1Stride; src2 is read as tightly packed
 * 16-byte rows (it advances 16 bytes per row); dst is walked with dstStride.
 *
 * PAVGB is a macro defined outside this block; presumably the packed-byte
 * average instruction of the target -- confirm against the including file.
 *
 * @param dst        destination pointer
 * @param src1       first source, stride src1Stride
 * @param src2       second source, consecutive 16-byte rows
 * @param dstStride  destination stride in bytes
 * @param src1Stride stride of src1 in bytes
 * @param h          row count. If h is odd, one row is processed before the
 *                   main loop. The main loop is then entered unconditionally,
 *                   handles 2 rows per pass and exits only when the counter
 *                   reaches exactly 0, so the count remaining after the
 *                   optional odd row must be a positive multiple of 2.
 *
 * Uses mm0/mm1; only "memory" is listed as a clobber.
 */
376static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
377{
378    __asm__ volatile(
        /* odd h: process a single row first, then fall into the loop */
379        "testl $1, %0                   \n\t"
380            " jz 1f                     \n\t"
381        "movq   (%1), %%mm0             \n\t"
382        "movq   8(%1), %%mm1            \n\t"
383        PAVGB" (%2), %%mm0              \n\t"
384        PAVGB" 8(%2), %%mm1             \n\t"
385        "add    %4, %1                  \n\t"
386        "add    $16, %2                 \n\t"
387        "movq   %%mm0, (%3)             \n\t"
388        "movq   %%mm1, 8(%3)            \n\t"
389        "add    %5, %3                  \n\t"
390        "decl   %0                      \n\t"
391        "1:                             \n\t"
        /* row 0 of this pass (src2 offsets 0 and 8) */
392        "movq   (%1), %%mm0             \n\t"
393        "movq   8(%1), %%mm1            \n\t"
394        "add    %4, %1                  \n\t"
395        PAVGB" (%2), %%mm0              \n\t"
396        PAVGB" 8(%2), %%mm1             \n\t"
397        "movq   %%mm0, (%3)             \n\t"
398        "movq   %%mm1, 8(%3)            \n\t"
399        "add    %5, %3                  \n\t"
        /* row 1 of this pass (src2 offsets 16 and 24) */
400        "movq   (%1), %%mm0             \n\t"
401        "movq   8(%1), %%mm1            \n\t"
402        "add    %4, %1                  \n\t"
403        PAVGB" 16(%2), %%mm0            \n\t"
404        PAVGB" 24(%2), %%mm1            \n\t"
405        "movq   %%mm0, (%3)             \n\t"
406        "movq   %%mm1, 8(%3)            \n\t"
407        "add    %5, %3                  \n\t"
        /* two packed 16-byte src2 rows consumed */
408        "add    $32, %2                 \n\t"
409        "subl   $2, %0                  \n\t"
410        "jnz    1b                      \n\t"
        /* h lives in memory when EBX is not available, otherwise in EBX */
411#if !HAVE_EBX_AVAILABLE  //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
412        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
413#else
414        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
415#endif
416        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
417        :"memory");
418//the following should be used, though better not with gcc ...
419/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
420        :"r"(src1Stride), "r"(dstStride)
421        :"memory");*/
422}
423
/**
 * 16-byte-wide two-source average that is additionally averaged into dst:
 * each row is PAVGB(src1 row, src2 row), then PAVGB with the 16 bytes
 * already at dst, and the result is stored back to dst. Each row is handled
 * as two 8-byte halves.
 *
 * src1 is walked with stride src1Stride; src2 is read as tightly packed
 * 16-byte rows (it advances 16 bytes per row); dst is walked with dstStride.
 *
 * PAVGB is a macro defined outside this block; presumably the packed-byte
 * average instruction of the target -- confirm against the including file.
 *
 * @param dst        destination pointer (read and written)
 * @param src1       first source, stride src1Stride
 * @param src2       second source, consecutive 16-byte rows
 * @param dstStride  destination stride in bytes
 * @param src1Stride stride of src1 in bytes
 * @param h          row count. If h is odd, one row is processed before the
 *                   main loop. The main loop is then entered unconditionally,
 *                   handles 2 rows per pass and exits only when the counter
 *                   reaches exactly 0, so the count remaining after the
 *                   optional odd row must be a positive multiple of 2.
 *
 * Uses mm0/mm1; only "memory" is listed as a clobber.
 */
424static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
425{
426    __asm__ volatile(
        /* odd h: process a single row first, then fall into the loop */
427        "testl $1, %0                   \n\t"
428            " jz 1f                     \n\t"
429        "movq   (%1), %%mm0             \n\t"
430        "movq   8(%1), %%mm1            \n\t"
431        PAVGB" (%2), %%mm0              \n\t"
432        PAVGB" 8(%2), %%mm1             \n\t"
433        "add    %4, %1                  \n\t"
434        "add    $16, %2                 \n\t"
        /* blend with the existing destination row */
435        PAVGB" (%3), %%mm0              \n\t"
436        PAVGB" 8(%3), %%mm1             \n\t"
437        "movq   %%mm0, (%3)             \n\t"
438        "movq   %%mm1, 8(%3)            \n\t"
439        "add    %5, %3                  \n\t"
440        "decl   %0                      \n\t"
441        "1:                             \n\t"
        /* row 0 of this pass (src2 offsets 0 and 8) */
442        "movq   (%1), %%mm0             \n\t"
443        "movq   8(%1), %%mm1            \n\t"
444        "add    %4, %1                  \n\t"
445        PAVGB" (%2), %%mm0              \n\t"
446        PAVGB" 8(%2), %%mm1             \n\t"
447        PAVGB" (%3), %%mm0              \n\t"
448        PAVGB" 8(%3), %%mm1             \n\t"
449        "movq   %%mm0, (%3)             \n\t"
450        "movq   %%mm1, 8(%3)            \n\t"
451        "add    %5, %3                  \n\t"
        /* row 1 of this pass (src2 offsets 16 and 24) */
452        "movq   (%1), %%mm0             \n\t"
453        "movq   8(%1), %%mm1            \n\t"
454        "add    %4, %1                  \n\t"
455        PAVGB" 16(%2), %%mm0            \n\t"
456        PAVGB" 24(%2), %%mm1            \n\t"
457        PAVGB" (%3), %%mm0              \n\t"
458        PAVGB" 8(%3), %%mm1             \n\t"
459        "movq   %%mm0, (%3)             \n\t"
460        "movq   %%mm1, 8(%3)            \n\t"
461        "add    %5, %3                  \n\t"
        /* two packed 16-byte src2 rows consumed */
462        "add    $32, %2                 \n\t"
463        "subl   $2, %0                  \n\t"
464        "jnz    1b                      \n\t"
        /* h lives in memory when EBX is not available, otherwise in EBX */
465#if !HAVE_EBX_AVAILABLE  //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
466        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
467#else
468        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
469#endif
470        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
471        :"memory");
472//the following should be used, though better not with gcc ...
473/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
474        :"r"(src1Stride), "r"(dstStride)
475        :"memory");*/
476}
477
/**
 * "No rounding" variant of the 16-byte-wide two-source average: every input
 * is bitwise inverted (XOR with all-ones in mm6), PAVGB is applied, and the
 * result is inverted again before being stored to dst. Each row is handled
 * as two 8-byte halves.
 *
 * Assuming PAVGB (macro defined outside this block) computes the
 * round-up byte average (a + b + 1) >> 1, the double inversion yields
 * (a + b) >> 1 -- confirm PAVGB semantics against the including file.
 *
 * src1 is walked with stride src1Stride; src2 is read as tightly packed
 * 16-byte rows (it advances 16 bytes per row); dst is walked with dstStride.
 *
 * @param dst        destination pointer
 * @param src1       first source, stride src1Stride
 * @param src2       second source, consecutive 16-byte rows
 * @param dstStride  destination stride in bytes
 * @param src1Stride stride of src1 in bytes
 * @param h          row count. If h is odd, one row is processed before the
 *                   main loop. The main loop is then entered unconditionally,
 *                   handles 2 rows per pass and exits only when the counter
 *                   reaches exactly 0, so the count remaining after the
 *                   optional odd row must be a positive multiple of 2.
 *
 * Uses mm0-mm3 and mm6; only "memory" is listed as a clobber.
 */
478static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
479{
480    __asm__ volatile(
        /* mm6 = all bits set, used as the inversion mask */
481        "pcmpeqb %%mm6, %%mm6           \n\t"
        /* odd h: process a single row first, then fall into the loop */
482        "testl $1, %0                   \n\t"
483            " jz 1f                     \n\t"
484        "movq   (%1), %%mm0             \n\t"
485        "movq   8(%1), %%mm1            \n\t"
486        "movq   (%2), %%mm2             \n\t"
487        "movq   8(%2), %%mm3            \n\t"
488        "pxor %%mm6, %%mm0              \n\t"
489        "pxor %%mm6, %%mm1              \n\t"
490        "pxor %%mm6, %%mm2              \n\t"
491        "pxor %%mm6, %%mm3              \n\t"
492        PAVGB" %%mm2, %%mm0             \n\t"
493        PAVGB" %%mm3, %%mm1             \n\t"
494        "pxor %%mm6, %%mm0              \n\t"
495        "pxor %%mm6, %%mm1              \n\t"
496        "add    %4, %1                  \n\t"
497        "add    $16, %2                 \n\t"
498        "movq   %%mm0, (%3)             \n\t"
499        "movq   %%mm1, 8(%3)            \n\t"
500        "add    %5, %3                  \n\t"
501        "decl   %0                      \n\t"
502        "1:                             \n\t"
        /* row 0 of this pass (src2 offsets 0 and 8) */
503        "movq   (%1), %%mm0             \n\t"
504        "movq   8(%1), %%mm1            \n\t"
505        "add    %4, %1                  \n\t"
506        "movq   (%2), %%mm2             \n\t"
507        "movq   8(%2), %%mm3            \n\t"
508        "pxor %%mm6, %%mm0              \n\t"
509        "pxor %%mm6, %%mm1              \n\t"
510        "pxor %%mm6, %%mm2              \n\t"
511        "pxor %%mm6, %%mm3              \n\t"
512        PAVGB" %%mm2, %%mm0             \n\t"
513        PAVGB" %%mm3, %%mm1             \n\t"
514        "pxor %%mm6, %%mm0              \n\t"
515        "pxor %%mm6, %%mm1              \n\t"
516        "movq   %%mm0, (%3)             \n\t"
517        "movq   %%mm1, 8(%3)            \n\t"
518        "add    %5, %3                  \n\t"
        /* row 1 of this pass (src2 offsets 16 and 24) */
519        "movq   (%1), %%mm0             \n\t"
520        "movq   8(%1), %%mm1            \n\t"
521        "add    %4, %1                  \n\t"
522        "movq   16(%2), %%mm2           \n\t"
523        "movq   24(%2), %%mm3           \n\t"
524        "pxor %%mm6, %%mm0              \n\t"
525        "pxor %%mm6, %%mm1              \n\t"
526        "pxor %%mm6, %%mm2              \n\t"
527        "pxor %%mm6, %%mm3              \n\t"
528        PAVGB" %%mm2, %%mm0             \n\t"
529        PAVGB" %%mm3, %%mm1             \n\t"
530        "pxor %%mm6, %%mm0              \n\t"
531        "pxor %%mm6, %%mm1              \n\t"
532        "movq   %%mm0, (%3)             \n\t"
533        "movq   %%mm1, 8(%3)            \n\t"
534        "add    %5, %3                  \n\t"
        /* two packed 16-byte src2 rows consumed */
535        "add    $32, %2                 \n\t"
536        "subl   $2, %0                  \n\t"
537        "jnz    1b                      \n\t"
        /* h lives in memory when EBX is not available, otherwise in EBX */
538#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
539        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
540#else
541        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
542#endif
543        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
544        :"memory");
545//the following should be used, though better not with gcc ...
546/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
547        :"r"(src1Stride), "r"(dstStride)
548        :"memory");*/
549}
550
551/* GL: this function does incorrect rounding if overflow */
/**
 * Approximate "no rounding" horizontal neighbour average, 8 bytes wide:
 * for each row the bytes at pixels have mm6 subtracted with unsigned
 * saturation (psubusb) and are then PAVGB-averaged with the bytes at
 * pixels + 1; the result is stored to block.
 *
 * mm6 is set up by MOVQ_BONE(mm6), a macro defined outside this block --
 * presumably it loads the value 1 into every byte of mm6; confirm against
 * its definition. PAVGB is likewise defined elsewhere; presumably the
 * packed-byte average instruction of the target.
 *
 * @param block     destination pointer, advanced by line_size per row
 * @param pixels    source pointer, advanced by line_size per row
 * @param line_size stride in bytes, used for both source and destination
 * @param h         row count; each loop pass handles 4 rows and the loop
 *                  only exits when h reaches exactly 0, so h is expected to
 *                  be a positive multiple of 4
 *
 * Uses mm0-mm3, mm6 and REG_a (holds 2 * line_size). Only REG_a and
 * "memory" are listed as clobbers.
 */
552static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
553{
554    MOVQ_BONE(mm6);
555    __asm__ volatile(
        /* REG_a = 2 * line_size: step used to advance two rows at a time */
556        "lea (%3, %3), %%"REG_a"        \n\t"
557        "1:                             \n\t"
        /* rows 0 and 1: mm0/mm2 = pixels, mm1/mm3 = pixels + 1 */
558        "movq (%1), %%mm0               \n\t"
559        "movq (%1, %3), %%mm2           \n\t"
560        "movq 1(%1), %%mm1              \n\t"
561        "movq 1(%1, %3), %%mm3          \n\t"
562        "add %%"REG_a", %1              \n\t"
        /* bias one operand down by mm6 before averaging */
563        "psubusb %%mm6, %%mm0           \n\t"
564        "psubusb %%mm6, %%mm2           \n\t"
565        PAVGB" %%mm1, %%mm0             \n\t"
566        PAVGB" %%mm3, %%mm2             \n\t"
567        "movq %%mm0, (%2)               \n\t"
568        "movq %%mm2, (%2, %3)           \n\t"
        /* rows 2 and 3 */
569        "movq (%1), %%mm0               \n\t"
570        "movq 1(%1), %%mm1              \n\t"
571        "movq (%1, %3), %%mm2           \n\t"
572        "movq 1(%1, %3), %%mm3          \n\t"
573        "add %%"REG_a", %2              \n\t"
574        "add %%"REG_a", %1              \n\t"
575        "psubusb %%mm6, %%mm0           \n\t"
576        "psubusb %%mm6, %%mm2           \n\t"
577        PAVGB" %%mm1, %%mm0             \n\t"
578        PAVGB" %%mm3, %%mm2             \n\t"
579        "movq %%mm0, (%2)               \n\t"
580        "movq %%mm2, (%2, %3)           \n\t"
581        "add %%"REG_a", %2              \n\t"
        /* four rows done */
582        "subl $4, %0                    \n\t"
583        "jnz 1b                         \n\t"
584        :"+g"(h), "+S"(pixels), "+D"(block)
585        :"r" ((x86_reg)line_size)
586        :"%"REG_a, "memory");
587}
588
/**
 * "No rounding" horizontal neighbour average, 8 bytes wide, using the
 * inversion method: the bytes at pixels and at pixels + 1 are both bitwise
 * inverted (XOR with all-ones in mm6), PAVGB-averaged, and the result is
 * inverted again before being stored to block.
 *
 * Assuming PAVGB (macro defined outside this block) computes the
 * round-up byte average (a + b + 1) >> 1, the double inversion yields
 * (a + b) >> 1 -- confirm PAVGB semantics against the including file.
 *
 * @param block     destination pointer, advanced by 4 * line_size per pass
 * @param pixels    source pointer, advanced by 4 * line_size per pass
 * @param line_size stride in bytes, used for both source and destination
 * @param h         row count; each loop pass handles 4 rows and the loop
 *                  repeats while the decremented counter is still > 0, so
 *                  rows are always processed in groups of 4
 *
 * Operand %4 is 3 * line_size, used to address the fourth row of a pass.
 * Uses mm0-mm3 and mm6; only "memory" is listed as a clobber.
 */
589static void DEF(put_no_rnd_pixels8_x2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
590{
591    __asm__ volatile (
        /* mm6 = all bits set, used as the inversion mask */
592        "pcmpeqb %%mm6, %%mm6           \n\t"
593        "1:                             \n\t"
        /* rows 0 and 1: mm0/mm2 = pixels, mm1/mm3 = pixels + 1 */
594        "movq  (%1),     %%mm0          \n\t"
595        "movq  (%1, %3), %%mm2          \n\t"
596        "movq 1(%1),     %%mm1          \n\t"
597        "movq 1(%1, %3), %%mm3          \n\t"
598        "pxor  %%mm6, %%mm0             \n\t"
599        "pxor  %%mm6, %%mm2             \n\t"
600        "pxor  %%mm6, %%mm1             \n\t"
601        "pxor  %%mm6, %%mm3             \n\t"
602        PAVGB" %%mm1, %%mm0             \n\t"
603        PAVGB" %%mm3, %%mm2             \n\t"
604        "pxor  %%mm6, %%mm0             \n\t"
605        "pxor  %%mm6, %%mm2             \n\t"
606        "movq  %%mm0, (%2)              \n\t"
607        "movq  %%mm2, (%2, %3)          \n\t"
        /* rows 2 and 3: addressed as base + 2*line_size and base + 3*line_size */
608        "movq  (%1, %3,2), %%mm0        \n\t"
609        "movq 1(%1, %3,2), %%mm1        \n\t"
610        "movq  (%1, %4),   %%mm2        \n\t"
611        "movq 1(%1, %4),   %%mm3        \n\t"
612        "pxor  %%mm6, %%mm0             \n\t"
613        "pxor  %%mm6, %%mm1             \n\t"
614        "pxor  %%mm6, %%mm2             \n\t"
615        "pxor  %%mm6, %%mm3             \n\t"
616        PAVGB" %%mm1, %%mm0             \n\t"
617        PAVGB" %%mm3, %%mm2             \n\t"
618        "pxor  %%mm6, %%mm0             \n\t"
619        "pxor  %%mm6, %%mm2             \n\t"
620        "movq  %%mm0, (%2, %3,2)        \n\t"
621        "movq  %%mm2, (%2, %4)          \n\t"
        /* advance both pointers by four rows */
622        "lea   (%1, %3,4), %1           \n\t"
623        "lea   (%2, %3,4), %2           \n\t"
624        "subl  $4, %0                   \n\t"
625        "jg 1b                          \n\t"
626        : "+g"(h), "+r"(pixels), "+r"(block)
627        : "r" ((x86_reg)line_size), "r"((x86_reg)3*line_size)
628        : "memory"
629    );
630}
631
/**
 * Vertical neighbour averaging, 8 bytes wide: output row i is PAVGB of
 * source row i and source row i + 1.
 *
 * The first source row is preloaded into mm0, and the last row loaded in
 * each pass is carried over in mm0 to the next pass, so each source row is
 * loaded once. As a consequence one more source row than output rows is
 * read (h + 1 source rows for h output rows).
 *
 * block is decremented by line_size up front so that the stores can use the
 * same (base + line_size) / (base + 2 * line_size) addressing as the loads.
 *
 * PAVGB is a macro defined outside this block; presumably the packed-byte
 * average instruction of the target -- confirm against the including file.
 *
 * @param block     destination pointer
 * @param pixels    source pointer
 * @param line_size stride in bytes, used for both source and destination
 * @param h         row count; each loop pass handles 4 rows and the loop
 *                  only exits when h reaches exactly 0, so h is expected to
 *                  be a positive multiple of 4
 *
 * Uses mm0-mm2 and REG_a (holds 2 * line_size). Only REG_a and "memory" are
 * listed as clobbers.
 */
632static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
633{
634    __asm__ volatile(
        /* REG_a = 2 * line_size */
635        "lea (%3, %3), %%"REG_a"        \n\t"
        /* mm0 = first source row */
636        "movq (%1), %%mm0               \n\t"
        /* bias block by -line_size; stores below add it back */
637        "sub %3, %2                     \n\t"
638        "1:                             \n\t"
        /* mm1 = next row, mm2 = the row after that */
639        "movq (%1, %3), %%mm1           \n\t"
640        "movq (%1, %%"REG_a"), %%mm2    \n\t"
641        "add %%"REG_a", %1              \n\t"
642        PAVGB" %%mm1, %%mm0             \n\t"
643        PAVGB" %%mm2, %%mm1             \n\t"
644        "movq %%mm0, (%2, %3)           \n\t"
645        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        /* second half of the pass: mm2 holds the previous row, mm0 the newest */
646        "movq (%1, %3), %%mm1           \n\t"
647        "movq (%1, %%"REG_a"), %%mm0    \n\t"
648        "add %%"REG_a", %2              \n\t"
649        "add %%"REG_a", %1              \n\t"
650        PAVGB" %%mm1, %%mm2             \n\t"
651        PAVGB" %%mm0, %%mm1             \n\t"
652        "movq %%mm2, (%2, %3)           \n\t"
653        "movq %%mm1, (%2, %%"REG_a")    \n\t"
654        "add %%"REG_a", %2              \n\t"
        /* four rows done; mm0 is carried into the next pass */
655        "subl $4, %0                    \n\t"
656        "jnz 1b                         \n\t"
657        :"+g"(h), "+S"(pixels), "+D" (block)
658        :"r" ((x86_reg)line_size)
659        :"%"REG_a, "memory");
660}
661
662/* GL: this function does incorrect rounding if overflow */
/**
 * Approximate "no rounding" vertical neighbour average, 8 bytes wide.
 * Output row i is PAVGB of source rows i and i + 1, where every second
 * source row (the one loaded into mm1) first has mm6 subtracted with
 * unsigned saturation (psubusb). That biased row takes part in both the
 * average with the row above it and the average with the row below it.
 *
 * mm6 is set up by MOVQ_BONE(mm6), a macro defined outside this block --
 * presumably it loads the value 1 into every byte of mm6; confirm against
 * its definition. PAVGB is likewise defined elsewhere; presumably the
 * packed-byte average instruction of the target.
 *
 * The first source row is preloaded into mm0 and the last row of each pass
 * is carried over in mm0, so h + 1 source rows are read for h output rows.
 * block is decremented by line_size up front so that the stores can use
 * (base + line_size) / (base + 2 * line_size) addressing.
 *
 * @param block     destination pointer
 * @param pixels    source pointer
 * @param line_size stride in bytes, used for both source and destination
 * @param h         row count; each loop pass handles 4 rows and the loop
 *                  only exits when h reaches exactly 0, so h is expected to
 *                  be a positive multiple of 4
 *
 * Uses mm0-mm2, mm6 and REG_a (holds 2 * line_size). Only REG_a and
 * "memory" are listed as clobbers.
 */
663static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
664{
665    MOVQ_BONE(mm6);
666    __asm__ volatile(
        /* REG_a = 2 * line_size */
667        "lea (%3, %3), %%"REG_a"        \n\t"
        /* mm0 = first source row */
668        "movq (%1), %%mm0               \n\t"
        /* bias block by -line_size; stores below add it back */
669        "sub %3, %2                     \n\t"
670        "1:                             \n\t"
671        "movq (%1, %3), %%mm1           \n\t"
672        "movq (%1, %%"REG_a"), %%mm2    \n\t"
673        "add %%"REG_a", %1              \n\t"
        /* bias the middle row down by mm6 before it is used in both averages */
674        "psubusb %%mm6, %%mm1           \n\t"
675        PAVGB" %%mm1, %%mm0             \n\t"
676        PAVGB" %%mm2, %%mm1             \n\t"
677        "movq %%mm0, (%2, %3)           \n\t"
678        "movq %%mm1, (%2, %%"REG_a")    \n\t"
        /* second half of the pass: mm2 holds the previous row, mm0 the newest */
679        "movq (%1, %3), %%mm1           \n\t"
680        "movq (%1, %%"REG_a"), %%mm0    \n\t"
681        "add %%"REG_a", %2              \n\t"
682        "add %%"REG_a", %1              \n\t"
683        "psubusb %%mm6, %%mm1           \n\t"
684        PAVGB" %%mm1, %%mm2             \n\t"
685        PAVGB" %%mm0, %%mm1             \n\t"
686        "movq %%mm2, (%2, %3)           \n\t"
687        "movq %%mm1, (%2, %%"REG_a")    \n\t"
688        "add %%"REG_a", %2              \n\t"
        /* four rows done; mm0 is carried into the next pass */
689        "subl $4, %0                    \n\t"
690        "jnz 1b                         \n\t"
691        :"+g"(h), "+S"(pixels), "+D" (block)
692        :"r" ((x86_reg)line_size)
693        :"%"REG_a, "memory");
694}
695
/*
 * Vertical half-pel "put" for an 8-byte-wide block, no-rounding flavour,
 * computed by complementing the inputs and the result of PAVGB instead of
 * subtracting a bias.  Assuming PAVGB (macro defined outside this chunk)
 * is a per-byte average that rounds up, ~PAVGB(~a, ~b) is the average of
 * a and b rounded down - TODO confirm against the PAVGB definition.
 *
 * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size,
 * %4 = 3*line_size.
 *
 * Four rows are written per loop iteration; the loop runs at least once
 * and repeats while h is still greater than 0 after subtracting 4.
 * mm6 is overwritten with all-ones inside the asm.
 */
696static void DEF(put_no_rnd_pixels8_y2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
697{
698    __asm__ volatile (
           /* mm0 = ~row0, mm6 = all-ones mask; pixels now points at row 1. */
699        "movq     (%1), %%mm0           \n\t"
700        "pcmpeqb %%mm6, %%mm6           \n\t"
701        "add        %3, %1              \n\t"
702        "pxor    %%mm6, %%mm0           \n\t"
703        "1:                             \n\t"
           /* mm1 = ~row1, mm2 = ~row2. */
704        "movq  (%1),     %%mm1          \n\t"
705        "movq  (%1, %3), %%mm2          \n\t"
706        "pxor  %%mm6, %%mm1             \n\t"
707        "pxor  %%mm6, %%mm2             \n\t"
           /* out0 = ~avg(~row0, ~row1), out1 = ~avg(~row1, ~row2). */
708        PAVGB" %%mm1, %%mm0             \n\t"
709        PAVGB" %%mm2, %%mm1             \n\t"
710        "pxor  %%mm6, %%mm0             \n\t"
711        "pxor  %%mm6, %%mm1             \n\t"
712        "movq  %%mm0, (%2)              \n\t"
713        "movq  %%mm1, (%2, %3)          \n\t"
           /* mm1 = ~row3, mm0 = ~row4 (mm0 stays complemented and is
            * carried into the next iteration as its row 0). */
714        "movq  (%1, %3,2), %%mm1        \n\t"
715        "movq  (%1, %4),   %%mm0        \n\t"
716        "pxor  %%mm6, %%mm1             \n\t"
717        "pxor  %%mm6, %%mm0             \n\t"
           /* out2 = ~avg(~row2, ~row3), out3 = ~avg(~row3, ~row4). */
718        PAVGB" %%mm1, %%mm2             \n\t"
719        PAVGB" %%mm0, %%mm1             \n\t"
720        "pxor  %%mm6, %%mm2             \n\t"
721        "pxor  %%mm6, %%mm1             \n\t"
722        "movq %%mm2, (%2, %3,2)         \n\t"
723        "movq %%mm1, (%2, %4)           \n\t"
           /* Advance source and destination by four rows. */
724        "lea   (%1, %3,4), %1           \n\t"
725        "lea   (%2, %3,4), %2           \n\t"
726        "subl $4, %0                    \n\t"
727        "jg 1b                          \n\t"
728        :"+g"(h), "+r"(pixels), "+r" (block)
729        :"r" ((x86_reg)line_size), "r"((x86_reg)3*line_size)
730        :"memory"
731    );
732}
733
/*
 * Full-pel "avg" for an 8-byte-wide block: every row of block is replaced
 * by PAVGB of that row and the matching row of pixels (PAVGB is a macro
 * defined outside this chunk; presumably a per-byte average that rounds
 * up - TODO confirm).
 *
 * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size;
 * REG_a holds 2*line_size.
 *
 * Four rows are processed per loop iteration and the loop is left only
 * when h reaches exactly 0, so h has to be a positive multiple of 4.
 */
734static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
735{
736    __asm__ volatile(
737        "lea (%3, %3), %%"REG_a"        \n\t"
738        "1:                             \n\t"
           /* Rows 0 and 1: load destination, average in source, store back. */
739        "movq (%2), %%mm0               \n\t"
740        "movq (%2, %3), %%mm1           \n\t"
741        PAVGB" (%1), %%mm0              \n\t"
742        PAVGB" (%1, %3), %%mm1          \n\t"
743        "movq %%mm0, (%2)               \n\t"
744        "movq %%mm1, (%2, %3)           \n\t"
745        "add %%"REG_a", %1              \n\t"
746        "add %%"REG_a", %2              \n\t"
           /* Rows 2 and 3: same operation after both pointers moved two rows. */
747        "movq (%2), %%mm0               \n\t"
748        "movq (%2, %3), %%mm1           \n\t"
749        PAVGB" (%1), %%mm0              \n\t"
750        PAVGB" (%1, %3), %%mm1          \n\t"
751        "add %%"REG_a", %1              \n\t"
752        "movq %%mm0, (%2)               \n\t"
753        "movq %%mm1, (%2, %3)           \n\t"
754        "add %%"REG_a", %2              \n\t"
           /* Four rows done; repeat until h is exactly 0. */
755        "subl $4, %0                    \n\t"
756        "jnz 1b                         \n\t"
757        :"+g"(h), "+S"(pixels), "+D"(block)
758        :"r" ((x86_reg)line_size)
759        :"%"REG_a, "memory");
760}
761
/*
 * Horizontal half-pel "avg" for an 8-byte-wide block: each source row is
 * averaged with itself shifted by one byte (pixels and pixels+1), and the
 * result is averaged into the existing contents of block.  All averaging
 * uses PAVGB (macro defined outside this chunk; presumably a per-byte
 * average that rounds up - TODO confirm).
 *
 * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size;
 * REG_a holds 2*line_size.
 *
 * Four rows are processed per loop iteration and the loop is left only
 * when h reaches exactly 0, so h has to be a positive multiple of 4.
 * Each row reads 9 source bytes (offsets 0..8).
 */
762static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
763{
764    __asm__ volatile(
765        "lea (%3, %3), %%"REG_a"        \n\t"
766        "1:                             \n\t"
           /* Rows 0 and 1: avg(src[x], src[x+1]), then average with dest. */
767        "movq (%1), %%mm0               \n\t"
768        "movq (%1, %3), %%mm2           \n\t"
769        PAVGB" 1(%1), %%mm0             \n\t"
770        PAVGB" 1(%1, %3), %%mm2         \n\t"
771        PAVGB" (%2), %%mm0              \n\t"
772        PAVGB" (%2, %3), %%mm2          \n\t"
773        "add %%"REG_a", %1              \n\t"
774        "movq %%mm0, (%2)               \n\t"
775        "movq %%mm2, (%2, %3)           \n\t"
           /* Rows 2 and 3: pixels was already advanced above; block is
            * advanced before its rows are averaged in. */
776        "movq (%1), %%mm0               \n\t"
777        "movq (%1, %3), %%mm2           \n\t"
778        PAVGB" 1(%1), %%mm0             \n\t"
779        PAVGB" 1(%1, %3), %%mm2         \n\t"
780        "add %%"REG_a", %2              \n\t"
781        "add %%"REG_a", %1              \n\t"
782        PAVGB" (%2), %%mm0              \n\t"
783        PAVGB" (%2, %3), %%mm2          \n\t"
784        "movq %%mm0, (%2)               \n\t"
785        "movq %%mm2, (%2, %3)           \n\t"
786        "add %%"REG_a", %2              \n\t"
           /* Four rows done; repeat until h is exactly 0. */
787        "subl $4, %0                    \n\t"
788        "jnz 1b                         \n\t"
789        :"+g"(h), "+S"(pixels), "+D"(block)
790        :"r" ((x86_reg)line_size)
791        :"%"REG_a, "memory");
792}
793
/*
 * Vertical half-pel "avg" for an 8-byte-wide block: each pair of
 * consecutive source rows is averaged, and that result is averaged into
 * the existing contents of block.  All averaging uses PAVGB (macro defined
 * outside this chunk; presumably a per-byte average that rounds up -
 * TODO confirm).
 *
 * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size;
 * REG_a holds 2*line_size.
 *
 * Four rows are processed per loop iteration and the loop is left only
 * when h reaches exactly 0, so h has to be a positive multiple of 4.
 * h+1 source rows are read.  mm0-mm4 are used; the clobber list names
 * only REG_a and memory.
 */
794static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
795{
796    __asm__ volatile(
           /* REG_a = 2*line_size; mm0 = source row 0. */
797        "lea (%3, %3), %%"REG_a"        \n\t"
798        "movq (%1), %%mm0               \n\t"
           /* Move block back by one row so that (%2,%3) and (%2,REG_a)
            * address destination rows 0 and 1. */
799        "sub %3, %2                     \n\t"
800        "1:                             \n\t"
           /* mm1 = row 1, mm2 = row 2; pixels advances by two rows. */
801        "movq (%1, %3), %%mm1           \n\t"
802        "movq (%1, %%"REG_a"), %%mm2    \n\t"
803        "add %%"REG_a", %1              \n\t"
           /* mm0 = avg(row0, row1), mm1 = avg(row1, row2). */
804        PAVGB" %%mm1, %%mm0             \n\t"
805        PAVGB" %%mm2, %%mm1             \n\t"
           /* Average with the current destination rows 0 and 1, store. */
806        "movq (%2, %3), %%mm3           \n\t"
807        "movq (%2, %%"REG_a"), %%mm4    \n\t"
808        PAVGB" %%mm3, %%mm0             \n\t"
809        PAVGB" %%mm4, %%mm1             \n\t"
810        "movq %%mm0, (%2, %3)           \n\t"
811        "movq %%mm1, (%2, %%"REG_a")    \n\t"
           /* mm1 = row 3, mm0 = row 4 (mm0 is carried into the next
            * iteration as its row 0). */
812        "movq (%1, %3), %%mm1           \n\t"
813        "movq (%1, %%"REG_a"), %%mm0    \n\t"
           /* mm2 = avg(row2, row3), mm1 = avg(row3, row4). */
814        PAVGB" %%mm1, %%mm2             \n\t"
815        PAVGB" %%mm0, %%mm1             \n\t"
816        "add %%"REG_a", %2              \n\t"
817        "add %%"REG_a", %1              \n\t"
           /* Average with the current destination rows 2 and 3, store. */
818        "movq (%2, %3), %%mm3           \n\t"
819        "movq (%2, %%"REG_a"), %%mm4    \n\t"
820        PAVGB" %%mm3, %%mm2             \n\t"
821        PAVGB" %%mm4, %%mm1             \n\t"
822        "movq %%mm2, (%2, %3)           \n\t"
823        "movq %%mm1, (%2, %%"REG_a")    \n\t"
824        "add %%"REG_a", %2              \n\t"
           /* Four rows done; repeat until h is exactly 0. */
825        "subl $4, %0                    \n\t"
826        "jnz 1b                         \n\t"
827        :"+g"(h), "+S"(pixels), "+D"(block)
828        :"r" ((x86_reg)line_size)
829        :"%"REG_a, "memory");
830}
831
832/* Note this is not correctly rounded, but this function is only
833 * used for B-frames so it does not matter. */
/*
 * Diagonal (horizontal + vertical) half-pel "avg" for an 8-byte-wide
 * block.  Every source row is first averaged with itself shifted by one
 * byte, consecutive horizontally averaged rows are then averaged with each
 * other, and that result is averaged into the existing contents of block.
 * The three stages are chained PAVGB operations (macro defined outside
 * this chunk; presumably a per-byte average that rounds up - TODO confirm).
 * mm6 is subtracted (with unsigned saturation) from one source row out of
 * every four before its horizontal average.
 *
 * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size;
 * REG_a holds 2*line_size.
 *
 * Four rows are processed per loop iteration and the loop is left only
 * when h reaches exactly 0, so h has to be a positive multiple of 4.
 * h+1 source rows of 9 bytes each are read.
 */
834static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
835{
       /* MOVQ_BONE is defined outside this chunk; presumably it loads 0x01
        * into every byte of mm6 - TODO confirm. */
836    MOVQ_BONE(mm6);
837    __asm__ volatile(
           /* REG_a = 2*line_size; mm0 = horizontal average of row 0. */
838        "lea (%3, %3), %%"REG_a"        \n\t"
839        "movq (%1), %%mm0               \n\t"
840        PAVGB" 1(%1), %%mm0             \n\t"
841         ".p2align 3                    \n\t"
842        "1:                             \n\t"
           /* mm2 = row 2 (biased by mm6), mm1 = row 1; then each becomes
            * the horizontal average of its row. */
843        "movq (%1, %%"REG_a"), %%mm2    \n\t"
844        "movq (%1, %3), %%mm1           \n\t"
845        "psubusb %%mm6, %%mm2           \n\t"
846        PAVGB" 1(%1, %3), %%mm1         \n\t"
847        PAVGB" 1(%1, %%"REG_a"), %%mm2  \n\t"
848        "add %%"REG_a", %1              \n\t"
           /* Vertical stage for rows 0/1 and 1/2, then average with the
            * destination rows 0 and 1 and store. */
849        PAVGB" %%mm1, %%mm0             \n\t"
850        PAVGB" %%mm2, %%mm1             \n\t"
851        PAVGB" (%2), %%mm0              \n\t"
852        PAVGB" (%2, %3), %%mm1          \n\t"
853        "movq %%mm0, (%2)               \n\t"
854        "movq %%mm1, (%2, %3)           \n\t"
           /* mm1 = horizontal average of row 3, mm0 = of row 4 (mm0 is
            * carried into the next iteration as its row 0). */
855        "movq (%1, %3), %%mm1           \n\t"
856        "movq (%1, %%"REG_a"), %%mm0    \n\t"
857        PAVGB" 1(%1, %3), %%mm1         \n\t"
858        PAVGB" 1(%1, %%"REG_a"), %%mm0  \n\t"
859        "add %%"REG_a", %2              \n\t"
860        "add %%"REG_a", %1              \n\t"
           /* Vertical stage for rows 2/3 and 3/4, then average with the
            * destination rows 2 and 3 and store. */
861        PAVGB" %%mm1, %%mm2             \n\t"
862        PAVGB" %%mm0, %%mm1             \n\t"
863        PAVGB" (%2), %%mm2              \n\t"
864        PAVGB" (%2, %3), %%mm1          \n\t"
865        "movq %%mm2, (%2)               \n\t"
866        "movq %%mm1, (%2, %3)           \n\t"
867        "add %%"REG_a", %2              \n\t"
           /* Four rows done; repeat until h is exactly 0. */
868        "subl $4, %0                    \n\t"
869        "jnz 1b                         \n\t"
870        :"+g"(h), "+S"(pixels), "+D"(block)
871        :"r" ((x86_reg)line_size)
872        :"%"REG_a,  "memory");
873}
874
/*
 * Full-pel "avg" for a 4-byte-wide block: four rows of block are loaded
 * with movd, averaged with the matching rows of pixels using PAVGB (macro
 * defined outside this chunk; presumably a per-byte average that rounds
 * up - TODO confirm) and the low 4 bytes are stored back to block.
 *
 * asm operands (inputs only): %0 = pixels, %1 = block, %2 = line_size,
 * %3 = 3*line_size.  The pointers are advanced in C, not inside the asm.
 *
 * The do/while body always runs at least once and handles four rows per
 * pass, continuing while h is still greater than 0 after subtracting 4.
 *
 * NOTE(review): PAVGB takes its source straight from memory here; that
 * operand is presumably 8 bytes wide, i.e. 4 bytes past the 4 used from
 * each pixels row would be read - confirm this over-read is acceptable.
 */
875static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
876{
877    do {
878        __asm__ volatile(
               /* Load 4 bytes from each of the four destination rows. */
879            "movd (%1), %%mm0               \n\t"
880            "movd (%1, %2), %%mm1           \n\t"
881            "movd (%1, %2, 2), %%mm2        \n\t"
882            "movd (%1, %3), %%mm3           \n\t"
               /* Average each with the matching source row. */
883            PAVGB" (%0), %%mm0              \n\t"
884            PAVGB" (%0, %2), %%mm1          \n\t"
885            PAVGB" (%0, %2, 2), %%mm2       \n\t"
886            PAVGB" (%0, %3), %%mm3          \n\t"
               /* Write the low 4 bytes of each result back. */
887            "movd %%mm0, (%1)               \n\t"
888            "movd %%mm1, (%1, %2)           \n\t"
889            "movd %%mm2, (%1, %2, 2)        \n\t"
890            "movd %%mm3, (%1, %3)           \n\t"
891            ::"S"(pixels), "D"(block),
892             "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size)
893            :"memory");
           /* Step both pointers down four rows for the next pass. */
894        block += 4*line_size;
895        pixels += 4*line_size;
896        h -= 4;
897    } while(h > 0);
898}
899
900//FIXME the following could be optimized too ...
901static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
902    DEF(put_no_rnd_pixels8_x2)(block  , pixels  , line_size, h);
903    DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
904}
905static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
906    DEF(put_pixels8_y2)(block  , pixels  , line_size, h);
907    DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
908}
909static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
910    DEF(put_no_rnd_pixels8_y2)(block  , pixels  , line_size, h);
911    DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
912}
913static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
914    DEF(avg_pixels8)(block  , pixels  , line_size, h);
915    DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
916}
917static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
918    DEF(avg_pixels8_x2)(block  , pixels  , line_size, h);
919    DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
920}
921static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
922    DEF(avg_pixels8_y2)(block  , pixels  , line_size, h);
923    DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
924}
925static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
926    DEF(avg_pixels8_xy2)(block  , pixels  , line_size, h);
927    DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
928}
929
/*
 * QPEL_2TAP_L3(OPNAME) defines two functions, OPNAME##2tap_qpel16_l3 and
 * OPNAME##2tap_qpel8_l3 (16 and 8 bytes per row respectively).
 *
 * For each of the h rows they compute
 *     PAVGB(PAVGB(src[off1 + i], src[off2 + i]), src[i])
 * then expand STORE_OP(destination, register) and finally store the
 * register to the destination row.  STORE_OP must be defined by the user
 * of this macro before it is expanded.  PAVGB is a macro defined outside
 * this chunk; presumably a per-byte average that rounds up - TODO confirm.
 *
 * asm operands: %0 = h, %1 = src, %2 = off1, %3 = off2, %4 = dst - src,
 * %5 = stride.  The destination is addressed as (%1,%4), i.e. src plus the
 * dst-src distance, so only src has to be advanced per row.  h is
 * decremented once per row and the loop is left when it reaches 0, so h
 * must be at least 1.
 */
930#define QPEL_2TAP_L3(OPNAME) \
931static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
932    __asm__ volatile(\
933        "1:                    \n\t"\
           /* mm0 = bytes 0..7, mm1 = bytes 8..15 of the current row */\
934        "movq   (%1,%2), %%mm0 \n\t"\
935        "movq  8(%1,%2), %%mm1 \n\t"\
936        PAVGB"  (%1,%3), %%mm0 \n\t"\
937        PAVGB" 8(%1,%3), %%mm1 \n\t"\
938        PAVGB"  (%1),    %%mm0 \n\t"\
939        PAVGB" 8(%1),    %%mm1 \n\t"\
           /* optional combine with the destination, then store to dst */\
940        STORE_OP( (%1,%4),%%mm0)\
941        STORE_OP(8(%1,%4),%%mm1)\
942        "movq  %%mm0,  (%1,%4) \n\t"\
943        "movq  %%mm1, 8(%1,%4) \n\t"\
           /* next row */\
944        "add   %5, %1          \n\t"\
945        "decl  %0              \n\t"\
946        "jnz   1b              \n\t"\
947        :"+g"(h), "+r"(src)\
948        :"r"((x86_reg)off1), "r"((x86_reg)off2),\
949         "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
950        :"memory"\
951    );\
952}\
953static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
954    __asm__ volatile(\
955        "1:                    \n\t"\
           /* same computation as the 16-byte variant, one 8-byte group per row */\
956        "movq   (%1,%2), %%mm0 \n\t"\
957        PAVGB"  (%1,%3), %%mm0 \n\t"\
958        PAVGB"  (%1),    %%mm0 \n\t"\
959        STORE_OP((%1,%4),%%mm0)\
960        "movq  %%mm0,  (%1,%4) \n\t"\
961        "add   %5, %1          \n\t"\
962        "decl  %0              \n\t"\
963        "jnz   1b              \n\t"\
964        :"+g"(h), "+r"(src)\
965        :"r"((x86_reg)off1), "r"((x86_reg)off2),\
966         "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
967        :"memory"\
968    );\
969}
970
/* avg_ variants: STORE_OP emits a PAVGB of the destination (a) into the
 * result register (b), so the stored value is averaged with what the
 * destination already holds. */
971#define STORE_OP(a,b) PAVGB" "#a","#b" \n\t"
972QPEL_2TAP_L3(avg_)
973#undef STORE_OP
/* put_ variants: STORE_OP expands to nothing, so the result is stored
 * without reading the destination. */
974#define STORE_OP(a,b)
975QPEL_2TAP_L3(put_)
976#undef STORE_OP
/* Both helper macros are undefined again once the functions exist. */
977#undef QPEL_2TAP_L3
978