1/*
2 * software RGB to RGB converter
3 * pluralize by software PAL8 to RGB converter
4 *              software YUV to YUV converter
5 *              software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
9 *
10 * This file is part of FFmpeg.
11 *
12 * FFmpeg is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
16 *
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 * Lesser General Public License for more details.
21 *
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27#include <stddef.h>
28#include <stdint.h>
29
30#include "libavutil/attributes.h"
31#include "libavutil/x86/asm.h"
32
33#undef PREFETCH
34#undef MOVNTQ
35#undef EMMS
36#undef SFENCE
37#undef PAVGB
38
39#if COMPILE_TEMPLATE_AMD3DNOW
40#define PREFETCH  "prefetch"
41#define PAVGB     "pavgusb"
42#elif COMPILE_TEMPLATE_MMXEXT
43#define PREFETCH "prefetchnta"
44#define PAVGB     "pavgb"
45#else
46#define PREFETCH  " # nop"
47#endif
48
49#if COMPILE_TEMPLATE_AMD3DNOW
50/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
51#define EMMS     "femms"
52#else
53#define EMMS     "emms"
54#endif
55
56#if COMPILE_TEMPLATE_MMXEXT
57#define MOVNTQ "movntq"
58#define SFENCE "sfence"
59#else
60#define MOVNTQ "movq"
61#define SFENCE " # nop"
62#endif
63
64#if !COMPILE_TEMPLATE_SSE2
65
66#if !COMPILE_TEMPLATE_AMD3DNOW
67
68static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
69{
70    uint8_t *dest = dst;
71    const uint8_t *s = src;
72    const uint8_t *end;
73    const uint8_t *mm_end;
74    end = s + src_size;
75    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
76    mm_end = end - 23;
77    __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
78    while (s < mm_end) {
79        __asm__ volatile(
80            PREFETCH"  32(%1)           \n\t"
81            "movd        (%1), %%mm0    \n\t"
82            "punpckldq  3(%1), %%mm0    \n\t"
83            "movd       6(%1), %%mm1    \n\t"
84            "punpckldq  9(%1), %%mm1    \n\t"
85            "movd      12(%1), %%mm2    \n\t"
86            "punpckldq 15(%1), %%mm2    \n\t"
87            "movd      18(%1), %%mm3    \n\t"
88            "punpckldq 21(%1), %%mm3    \n\t"
89            "por        %%mm7, %%mm0    \n\t"
90            "por        %%mm7, %%mm1    \n\t"
91            "por        %%mm7, %%mm2    \n\t"
92            "por        %%mm7, %%mm3    \n\t"
93            MOVNTQ"     %%mm0,   (%0)   \n\t"
94            MOVNTQ"     %%mm1,  8(%0)   \n\t"
95            MOVNTQ"     %%mm2, 16(%0)   \n\t"
96            MOVNTQ"     %%mm3, 24(%0)"
97            :: "r"(dest), "r"(s)
98            :"memory");
99        dest += 32;
100        s += 24;
101    }
102    __asm__ volatile(SFENCE:::"memory");
103    __asm__ volatile(EMMS:::"memory");
104    while (s < end) {
105        *dest++ = *s++;
106        *dest++ = *s++;
107        *dest++ = *s++;
108        *dest++ = 255;
109    }
110}
111
112#define STORE_BGR24_MMX \
113            "psrlq         $8, %%mm2    \n\t" \
114            "psrlq         $8, %%mm3    \n\t" \
115            "psrlq         $8, %%mm6    \n\t" \
116            "psrlq         $8, %%mm7    \n\t" \
117            "pand "MANGLE(mask24l)", %%mm0\n\t" \
118            "pand "MANGLE(mask24l)", %%mm1\n\t" \
119            "pand "MANGLE(mask24l)", %%mm4\n\t" \
120            "pand "MANGLE(mask24l)", %%mm5\n\t" \
121            "pand "MANGLE(mask24h)", %%mm2\n\t" \
122            "pand "MANGLE(mask24h)", %%mm3\n\t" \
123            "pand "MANGLE(mask24h)", %%mm6\n\t" \
124            "pand "MANGLE(mask24h)", %%mm7\n\t" \
125            "por        %%mm2, %%mm0    \n\t" \
126            "por        %%mm3, %%mm1    \n\t" \
127            "por        %%mm6, %%mm4    \n\t" \
128            "por        %%mm7, %%mm5    \n\t" \
129 \
130            "movq       %%mm1, %%mm2    \n\t" \
131            "movq       %%mm4, %%mm3    \n\t" \
132            "psllq        $48, %%mm2    \n\t" \
133            "psllq        $32, %%mm3    \n\t" \
134            "por        %%mm2, %%mm0    \n\t" \
135            "psrlq        $16, %%mm1    \n\t" \
136            "psrlq        $32, %%mm4    \n\t" \
137            "psllq        $16, %%mm5    \n\t" \
138            "por        %%mm3, %%mm1    \n\t" \
139            "por        %%mm5, %%mm4    \n\t" \
140 \
141            MOVNTQ"     %%mm0,   (%0)    \n\t" \
142            MOVNTQ"     %%mm1,  8(%0)    \n\t" \
143            MOVNTQ"     %%mm4, 16(%0)"
144
145
146static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
147{
148    uint8_t *dest = dst;
149    const uint8_t *s = src;
150    const uint8_t *end;
151    const uint8_t *mm_end;
152    end = s + src_size;
153    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
154    mm_end = end - 31;
155    while (s < mm_end) {
156        __asm__ volatile(
157            PREFETCH"  32(%1)           \n\t"
158            "movq        (%1), %%mm0    \n\t"
159            "movq       8(%1), %%mm1    \n\t"
160            "movq      16(%1), %%mm4    \n\t"
161            "movq      24(%1), %%mm5    \n\t"
162            "movq       %%mm0, %%mm2    \n\t"
163            "movq       %%mm1, %%mm3    \n\t"
164            "movq       %%mm4, %%mm6    \n\t"
165            "movq       %%mm5, %%mm7    \n\t"
166            STORE_BGR24_MMX
167            :: "r"(dest), "r"(s)
168              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
169            :"memory");
170        dest += 24;
171        s += 32;
172    }
173    __asm__ volatile(SFENCE:::"memory");
174    __asm__ volatile(EMMS:::"memory");
175    while (s < end) {
176        *dest++ = *s++;
177        *dest++ = *s++;
178        *dest++ = *s++;
179        s++;
180    }
181}
182
183/*
184 original by Strepto/Astral
185 ported to gcc & bugfixed: A'rpi
186 MMXEXT, 3DNOW optimization by Nick Kurshev
187 32-bit C version, and and&add trick by Michael Niedermayer
188*/
189static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
190{
191    register const uint8_t* s=src;
192    register uint8_t* d=dst;
193    register const uint8_t *end;
194    const uint8_t *mm_end;
195    end = s + src_size;
196    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
197    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
198    mm_end = end - 15;
199    while (s<mm_end) {
200        __asm__ volatile(
201            PREFETCH" 32(%1)        \n\t"
202            "movq      (%1), %%mm0  \n\t"
203            "movq     8(%1), %%mm2  \n\t"
204            "movq     %%mm0, %%mm1  \n\t"
205            "movq     %%mm2, %%mm3  \n\t"
206            "pand     %%mm4, %%mm0  \n\t"
207            "pand     %%mm4, %%mm2  \n\t"
208            "paddw    %%mm1, %%mm0  \n\t"
209            "paddw    %%mm3, %%mm2  \n\t"
210            MOVNTQ"   %%mm0,  (%0)  \n\t"
211            MOVNTQ"   %%mm2, 8(%0)"
212            :: "r"(d), "r"(s)
213        );
214        d+=16;
215        s+=16;
216    }
217    __asm__ volatile(SFENCE:::"memory");
218    __asm__ volatile(EMMS:::"memory");
219    mm_end = end - 3;
220    while (s < mm_end) {
221        register unsigned x= *((const uint32_t *)s);
222        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
223        d+=4;
224        s+=4;
225    }
226    if (s < end) {
227        register unsigned short x= *((const uint16_t *)s);
228        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
229    }
230}
231
232static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
233{
234    register const uint8_t* s=src;
235    register uint8_t* d=dst;
236    register const uint8_t *end;
237    const uint8_t *mm_end;
238    end = s + src_size;
239    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
240    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
241    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
242    mm_end = end - 15;
243    while (s<mm_end) {
244        __asm__ volatile(
245            PREFETCH" 32(%1)        \n\t"
246            "movq      (%1), %%mm0  \n\t"
247            "movq     8(%1), %%mm2  \n\t"
248            "movq     %%mm0, %%mm1  \n\t"
249            "movq     %%mm2, %%mm3  \n\t"
250            "psrlq       $1, %%mm0  \n\t"
251            "psrlq       $1, %%mm2  \n\t"
252            "pand     %%mm7, %%mm0  \n\t"
253            "pand     %%mm7, %%mm2  \n\t"
254            "pand     %%mm6, %%mm1  \n\t"
255            "pand     %%mm6, %%mm3  \n\t"
256            "por      %%mm1, %%mm0  \n\t"
257            "por      %%mm3, %%mm2  \n\t"
258            MOVNTQ"   %%mm0,  (%0)  \n\t"
259            MOVNTQ"   %%mm2, 8(%0)"
260            :: "r"(d), "r"(s)
261        );
262        d+=16;
263        s+=16;
264    }
265    __asm__ volatile(SFENCE:::"memory");
266    __asm__ volatile(EMMS:::"memory");
267    mm_end = end - 3;
268    while (s < mm_end) {
269        register uint32_t x= *((const uint32_t*)s);
270        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
271        s+=4;
272        d+=4;
273    }
274    if (s < end) {
275        register uint16_t x= *((const uint16_t*)s);
276        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
277    }
278}
279
280static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
281{
282    const uint8_t *s = src;
283    const uint8_t *end;
284    const uint8_t *mm_end;
285    uint16_t *d = (uint16_t *)dst;
286    end = s + src_size;
287    mm_end = end - 15;
288    __asm__ volatile(
289        "movq           %3, %%mm5   \n\t"
290        "movq           %4, %%mm6   \n\t"
291        "movq           %5, %%mm7   \n\t"
292        "jmp 2f                     \n\t"
293        ".p2align        4          \n\t"
294        "1:                         \n\t"
295        PREFETCH"   32(%1)          \n\t"
296        "movd         (%1), %%mm0   \n\t"
297        "movd        4(%1), %%mm3   \n\t"
298        "punpckldq   8(%1), %%mm0   \n\t"
299        "punpckldq  12(%1), %%mm3   \n\t"
300        "movq        %%mm0, %%mm1   \n\t"
301        "movq        %%mm3, %%mm4   \n\t"
302        "pand        %%mm6, %%mm0   \n\t"
303        "pand        %%mm6, %%mm3   \n\t"
304        "pmaddwd     %%mm7, %%mm0   \n\t"
305        "pmaddwd     %%mm7, %%mm3   \n\t"
306        "pand        %%mm5, %%mm1   \n\t"
307        "pand        %%mm5, %%mm4   \n\t"
308        "por         %%mm1, %%mm0   \n\t"
309        "por         %%mm4, %%mm3   \n\t"
310        "psrld          $5, %%mm0   \n\t"
311        "pslld         $11, %%mm3   \n\t"
312        "por         %%mm3, %%mm0   \n\t"
313        MOVNTQ"      %%mm0, (%0)    \n\t"
314        "add           $16,  %1     \n\t"
315        "add            $8,  %0     \n\t"
316        "2:                         \n\t"
317        "cmp            %2,  %1     \n\t"
318        " jb            1b          \n\t"
319        : "+r" (d), "+r"(s)
320        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
321    );
322    __asm__ volatile(SFENCE:::"memory");
323    __asm__ volatile(EMMS:::"memory");
324    while (s < end) {
325        register int rgb = *(const uint32_t*)s; s += 4;
326        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
327    }
328}
329
330static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
331{
332    const uint8_t *s = src;
333    const uint8_t *end;
334    const uint8_t *mm_end;
335    uint16_t *d = (uint16_t *)dst;
336    end = s + src_size;
337    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
338    __asm__ volatile(
339        "movq          %0, %%mm7    \n\t"
340        "movq          %1, %%mm6    \n\t"
341        ::"m"(red_16mask),"m"(green_16mask));
342    mm_end = end - 15;
343    while (s < mm_end) {
344        __asm__ volatile(
345            PREFETCH"  32(%1)           \n\t"
346            "movd        (%1), %%mm0    \n\t"
347            "movd       4(%1), %%mm3    \n\t"
348            "punpckldq  8(%1), %%mm0    \n\t"
349            "punpckldq 12(%1), %%mm3    \n\t"
350            "movq       %%mm0, %%mm1    \n\t"
351            "movq       %%mm0, %%mm2    \n\t"
352            "movq       %%mm3, %%mm4    \n\t"
353            "movq       %%mm3, %%mm5    \n\t"
354            "psllq         $8, %%mm0    \n\t"
355            "psllq         $8, %%mm3    \n\t"
356            "pand       %%mm7, %%mm0    \n\t"
357            "pand       %%mm7, %%mm3    \n\t"
358            "psrlq         $5, %%mm1    \n\t"
359            "psrlq         $5, %%mm4    \n\t"
360            "pand       %%mm6, %%mm1    \n\t"
361            "pand       %%mm6, %%mm4    \n\t"
362            "psrlq        $19, %%mm2    \n\t"
363            "psrlq        $19, %%mm5    \n\t"
364            "pand          %2, %%mm2    \n\t"
365            "pand          %2, %%mm5    \n\t"
366            "por        %%mm1, %%mm0    \n\t"
367            "por        %%mm4, %%mm3    \n\t"
368            "por        %%mm2, %%mm0    \n\t"
369            "por        %%mm5, %%mm3    \n\t"
370            "psllq        $16, %%mm3    \n\t"
371            "por        %%mm3, %%mm0    \n\t"
372            MOVNTQ"     %%mm0, (%0)     \n\t"
373            :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
374        d += 4;
375        s += 16;
376    }
377    __asm__ volatile(SFENCE:::"memory");
378    __asm__ volatile(EMMS:::"memory");
379    while (s < end) {
380        register int rgb = *(const uint32_t*)s; s += 4;
381        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
382    }
383}
384
385static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
386{
387    const uint8_t *s = src;
388    const uint8_t *end;
389    const uint8_t *mm_end;
390    uint16_t *d = (uint16_t *)dst;
391    end = s + src_size;
392    mm_end = end - 15;
393    __asm__ volatile(
394        "movq           %3, %%mm5   \n\t"
395        "movq           %4, %%mm6   \n\t"
396        "movq           %5, %%mm7   \n\t"
397        "jmp            2f          \n\t"
398        ".p2align        4          \n\t"
399        "1:                         \n\t"
400        PREFETCH"   32(%1)          \n\t"
401        "movd         (%1), %%mm0   \n\t"
402        "movd        4(%1), %%mm3   \n\t"
403        "punpckldq   8(%1), %%mm0   \n\t"
404        "punpckldq  12(%1), %%mm3   \n\t"
405        "movq        %%mm0, %%mm1   \n\t"
406        "movq        %%mm3, %%mm4   \n\t"
407        "pand        %%mm6, %%mm0   \n\t"
408        "pand        %%mm6, %%mm3   \n\t"
409        "pmaddwd     %%mm7, %%mm0   \n\t"
410        "pmaddwd     %%mm7, %%mm3   \n\t"
411        "pand        %%mm5, %%mm1   \n\t"
412        "pand        %%mm5, %%mm4   \n\t"
413        "por         %%mm1, %%mm0   \n\t"
414        "por         %%mm4, %%mm3   \n\t"
415        "psrld          $6, %%mm0   \n\t"
416        "pslld         $10, %%mm3   \n\t"
417        "por         %%mm3, %%mm0   \n\t"
418        MOVNTQ"      %%mm0, (%0)    \n\t"
419        "add           $16,  %1     \n\t"
420        "add            $8,  %0     \n\t"
421        "2:                         \n\t"
422        "cmp            %2,  %1     \n\t"
423        " jb            1b          \n\t"
424        : "+r" (d), "+r"(s)
425        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
426    );
427    __asm__ volatile(SFENCE:::"memory");
428    __asm__ volatile(EMMS:::"memory");
429    while (s < end) {
430        register int rgb = *(const uint32_t*)s; s += 4;
431        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
432    }
433}
434
435static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
436{
437    const uint8_t *s = src;
438    const uint8_t *end;
439    const uint8_t *mm_end;
440    uint16_t *d = (uint16_t *)dst;
441    end = s + src_size;
442    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
443    __asm__ volatile(
444        "movq          %0, %%mm7    \n\t"
445        "movq          %1, %%mm6    \n\t"
446        ::"m"(red_15mask),"m"(green_15mask));
447    mm_end = end - 15;
448    while (s < mm_end) {
449        __asm__ volatile(
450            PREFETCH"  32(%1)           \n\t"
451            "movd        (%1), %%mm0    \n\t"
452            "movd       4(%1), %%mm3    \n\t"
453            "punpckldq  8(%1), %%mm0    \n\t"
454            "punpckldq 12(%1), %%mm3    \n\t"
455            "movq       %%mm0, %%mm1    \n\t"
456            "movq       %%mm0, %%mm2    \n\t"
457            "movq       %%mm3, %%mm4    \n\t"
458            "movq       %%mm3, %%mm5    \n\t"
459            "psllq         $7, %%mm0    \n\t"
460            "psllq         $7, %%mm3    \n\t"
461            "pand       %%mm7, %%mm0    \n\t"
462            "pand       %%mm7, %%mm3    \n\t"
463            "psrlq         $6, %%mm1    \n\t"
464            "psrlq         $6, %%mm4    \n\t"
465            "pand       %%mm6, %%mm1    \n\t"
466            "pand       %%mm6, %%mm4    \n\t"
467            "psrlq        $19, %%mm2    \n\t"
468            "psrlq        $19, %%mm5    \n\t"
469            "pand          %2, %%mm2    \n\t"
470            "pand          %2, %%mm5    \n\t"
471            "por        %%mm1, %%mm0    \n\t"
472            "por        %%mm4, %%mm3    \n\t"
473            "por        %%mm2, %%mm0    \n\t"
474            "por        %%mm5, %%mm3    \n\t"
475            "psllq        $16, %%mm3    \n\t"
476            "por        %%mm3, %%mm0    \n\t"
477            MOVNTQ"     %%mm0, (%0)     \n\t"
478            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
479        d += 4;
480        s += 16;
481    }
482    __asm__ volatile(SFENCE:::"memory");
483    __asm__ volatile(EMMS:::"memory");
484    while (s < end) {
485        register int rgb = *(const uint32_t*)s; s += 4;
486        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
487    }
488}
489
490static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
491{
492    const uint8_t *s = src;
493    const uint8_t *end;
494    const uint8_t *mm_end;
495    uint16_t *d = (uint16_t *)dst;
496    end = s + src_size;
497    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
498    __asm__ volatile(
499        "movq         %0, %%mm7     \n\t"
500        "movq         %1, %%mm6     \n\t"
501        ::"m"(red_16mask),"m"(green_16mask));
502    mm_end = end - 11;
503    while (s < mm_end) {
504        __asm__ volatile(
505            PREFETCH"  32(%1)           \n\t"
506            "movd        (%1), %%mm0    \n\t"
507            "movd       3(%1), %%mm3    \n\t"
508            "punpckldq  6(%1), %%mm0    \n\t"
509            "punpckldq  9(%1), %%mm3    \n\t"
510            "movq       %%mm0, %%mm1    \n\t"
511            "movq       %%mm0, %%mm2    \n\t"
512            "movq       %%mm3, %%mm4    \n\t"
513            "movq       %%mm3, %%mm5    \n\t"
514            "psrlq         $3, %%mm0    \n\t"
515            "psrlq         $3, %%mm3    \n\t"
516            "pand          %2, %%mm0    \n\t"
517            "pand          %2, %%mm3    \n\t"
518            "psrlq         $5, %%mm1    \n\t"
519            "psrlq         $5, %%mm4    \n\t"
520            "pand       %%mm6, %%mm1    \n\t"
521            "pand       %%mm6, %%mm4    \n\t"
522            "psrlq         $8, %%mm2    \n\t"
523            "psrlq         $8, %%mm5    \n\t"
524            "pand       %%mm7, %%mm2    \n\t"
525            "pand       %%mm7, %%mm5    \n\t"
526            "por        %%mm1, %%mm0    \n\t"
527            "por        %%mm4, %%mm3    \n\t"
528            "por        %%mm2, %%mm0    \n\t"
529            "por        %%mm5, %%mm3    \n\t"
530            "psllq        $16, %%mm3    \n\t"
531            "por        %%mm3, %%mm0    \n\t"
532            MOVNTQ"     %%mm0, (%0)     \n\t"
533            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
534        d += 4;
535        s += 12;
536    }
537    __asm__ volatile(SFENCE:::"memory");
538    __asm__ volatile(EMMS:::"memory");
539    while (s < end) {
540        const int b = *s++;
541        const int g = *s++;
542        const int r = *s++;
543        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
544    }
545}
546
547static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
548{
549    const uint8_t *s = src;
550    const uint8_t *end;
551    const uint8_t *mm_end;
552    uint16_t *d = (uint16_t *)dst;
553    end = s + src_size;
554    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
555    __asm__ volatile(
556        "movq         %0, %%mm7     \n\t"
557        "movq         %1, %%mm6     \n\t"
558        ::"m"(red_16mask),"m"(green_16mask));
559    mm_end = end - 15;
560    while (s < mm_end) {
561        __asm__ volatile(
562            PREFETCH"  32(%1)           \n\t"
563            "movd        (%1), %%mm0    \n\t"
564            "movd       3(%1), %%mm3    \n\t"
565            "punpckldq  6(%1), %%mm0    \n\t"
566            "punpckldq  9(%1), %%mm3    \n\t"
567            "movq       %%mm0, %%mm1    \n\t"
568            "movq       %%mm0, %%mm2    \n\t"
569            "movq       %%mm3, %%mm4    \n\t"
570            "movq       %%mm3, %%mm5    \n\t"
571            "psllq         $8, %%mm0    \n\t"
572            "psllq         $8, %%mm3    \n\t"
573            "pand       %%mm7, %%mm0    \n\t"
574            "pand       %%mm7, %%mm3    \n\t"
575            "psrlq         $5, %%mm1    \n\t"
576            "psrlq         $5, %%mm4    \n\t"
577            "pand       %%mm6, %%mm1    \n\t"
578            "pand       %%mm6, %%mm4    \n\t"
579            "psrlq        $19, %%mm2    \n\t"
580            "psrlq        $19, %%mm5    \n\t"
581            "pand          %2, %%mm2    \n\t"
582            "pand          %2, %%mm5    \n\t"
583            "por        %%mm1, %%mm0    \n\t"
584            "por        %%mm4, %%mm3    \n\t"
585            "por        %%mm2, %%mm0    \n\t"
586            "por        %%mm5, %%mm3    \n\t"
587            "psllq        $16, %%mm3    \n\t"
588            "por        %%mm3, %%mm0    \n\t"
589            MOVNTQ"     %%mm0, (%0)     \n\t"
590            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
591        d += 4;
592        s += 12;
593    }
594    __asm__ volatile(SFENCE:::"memory");
595    __asm__ volatile(EMMS:::"memory");
596    while (s < end) {
597        const int r = *s++;
598        const int g = *s++;
599        const int b = *s++;
600        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
601    }
602}
603
604static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
605{
606    const uint8_t *s = src;
607    const uint8_t *end;
608    const uint8_t *mm_end;
609    uint16_t *d = (uint16_t *)dst;
610    end = s + src_size;
611    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
612    __asm__ volatile(
613        "movq          %0, %%mm7    \n\t"
614        "movq          %1, %%mm6    \n\t"
615        ::"m"(red_15mask),"m"(green_15mask));
616    mm_end = end - 11;
617    while (s < mm_end) {
618        __asm__ volatile(
619            PREFETCH"  32(%1)           \n\t"
620            "movd        (%1), %%mm0    \n\t"
621            "movd       3(%1), %%mm3    \n\t"
622            "punpckldq  6(%1), %%mm0    \n\t"
623            "punpckldq  9(%1), %%mm3    \n\t"
624            "movq       %%mm0, %%mm1    \n\t"
625            "movq       %%mm0, %%mm2    \n\t"
626            "movq       %%mm3, %%mm4    \n\t"
627            "movq       %%mm3, %%mm5    \n\t"
628            "psrlq         $3, %%mm0    \n\t"
629            "psrlq         $3, %%mm3    \n\t"
630            "pand          %2, %%mm0    \n\t"
631            "pand          %2, %%mm3    \n\t"
632            "psrlq         $6, %%mm1    \n\t"
633            "psrlq         $6, %%mm4    \n\t"
634            "pand       %%mm6, %%mm1    \n\t"
635            "pand       %%mm6, %%mm4    \n\t"
636            "psrlq         $9, %%mm2    \n\t"
637            "psrlq         $9, %%mm5    \n\t"
638            "pand       %%mm7, %%mm2    \n\t"
639            "pand       %%mm7, %%mm5    \n\t"
640            "por        %%mm1, %%mm0    \n\t"
641            "por        %%mm4, %%mm3    \n\t"
642            "por        %%mm2, %%mm0    \n\t"
643            "por        %%mm5, %%mm3    \n\t"
644            "psllq        $16, %%mm3    \n\t"
645            "por        %%mm3, %%mm0    \n\t"
646            MOVNTQ"     %%mm0, (%0)     \n\t"
647            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
648        d += 4;
649        s += 12;
650    }
651    __asm__ volatile(SFENCE:::"memory");
652    __asm__ volatile(EMMS:::"memory");
653    while (s < end) {
654        const int b = *s++;
655        const int g = *s++;
656        const int r = *s++;
657        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
658    }
659}
660
661static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
662{
663    const uint8_t *s = src;
664    const uint8_t *end;
665    const uint8_t *mm_end;
666    uint16_t *d = (uint16_t *)dst;
667    end = s + src_size;
668    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
669    __asm__ volatile(
670        "movq         %0, %%mm7     \n\t"
671        "movq         %1, %%mm6     \n\t"
672        ::"m"(red_15mask),"m"(green_15mask));
673    mm_end = end - 15;
674    while (s < mm_end) {
675        __asm__ volatile(
676            PREFETCH" 32(%1)            \n\t"
677            "movd       (%1), %%mm0     \n\t"
678            "movd      3(%1), %%mm3     \n\t"
679            "punpckldq 6(%1), %%mm0     \n\t"
680            "punpckldq 9(%1), %%mm3     \n\t"
681            "movq      %%mm0, %%mm1     \n\t"
682            "movq      %%mm0, %%mm2     \n\t"
683            "movq      %%mm3, %%mm4     \n\t"
684            "movq      %%mm3, %%mm5     \n\t"
685            "psllq        $7, %%mm0     \n\t"
686            "psllq        $7, %%mm3     \n\t"
687            "pand      %%mm7, %%mm0     \n\t"
688            "pand      %%mm7, %%mm3     \n\t"
689            "psrlq        $6, %%mm1     \n\t"
690            "psrlq        $6, %%mm4     \n\t"
691            "pand      %%mm6, %%mm1     \n\t"
692            "pand      %%mm6, %%mm4     \n\t"
693            "psrlq       $19, %%mm2     \n\t"
694            "psrlq       $19, %%mm5     \n\t"
695            "pand         %2, %%mm2     \n\t"
696            "pand         %2, %%mm5     \n\t"
697            "por       %%mm1, %%mm0     \n\t"
698            "por       %%mm4, %%mm3     \n\t"
699            "por       %%mm2, %%mm0     \n\t"
700            "por       %%mm5, %%mm3     \n\t"
701            "psllq       $16, %%mm3     \n\t"
702            "por       %%mm3, %%mm0     \n\t"
703            MOVNTQ"    %%mm0, (%0)      \n\t"
704            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
705        d += 4;
706        s += 12;
707    }
708    __asm__ volatile(SFENCE:::"memory");
709    __asm__ volatile(EMMS:::"memory");
710    while (s < end) {
711        const int r = *s++;
712        const int g = *s++;
713        const int b = *s++;
714        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
715    }
716}
717
718static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
719{
720    const uint16_t *end;
721    const uint16_t *mm_end;
722    uint8_t *d = dst;
723    const uint16_t *s = (const uint16_t*)src;
724    end = s + src_size/2;
725    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
726    mm_end = end - 7;
727    while (s < mm_end) {
728        __asm__ volatile(
729            PREFETCH"  32(%1)           \n\t"
730            "movq        (%1), %%mm0    \n\t"
731            "movq        (%1), %%mm1    \n\t"
732            "movq        (%1), %%mm2    \n\t"
733            "pand          %2, %%mm0    \n\t"
734            "pand          %3, %%mm1    \n\t"
735            "pand          %4, %%mm2    \n\t"
736            "psllq         $5, %%mm0    \n\t"
737            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
738            "pmulhw        "MANGLE(mul15_mid)", %%mm1    \n\t"
739            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
740            "movq       %%mm0, %%mm3    \n\t"
741            "movq       %%mm1, %%mm4    \n\t"
742            "movq       %%mm2, %%mm5    \n\t"
743            "punpcklwd     %5, %%mm0    \n\t"
744            "punpcklwd     %5, %%mm1    \n\t"
745            "punpcklwd     %5, %%mm2    \n\t"
746            "punpckhwd     %5, %%mm3    \n\t"
747            "punpckhwd     %5, %%mm4    \n\t"
748            "punpckhwd     %5, %%mm5    \n\t"
749            "psllq         $8, %%mm1    \n\t"
750            "psllq        $16, %%mm2    \n\t"
751            "por        %%mm1, %%mm0    \n\t"
752            "por        %%mm2, %%mm0    \n\t"
753            "psllq         $8, %%mm4    \n\t"
754            "psllq        $16, %%mm5    \n\t"
755            "por        %%mm4, %%mm3    \n\t"
756            "por        %%mm5, %%mm3    \n\t"
757
758            "movq       %%mm0, %%mm6    \n\t"
759            "movq       %%mm3, %%mm7    \n\t"
760
761            "movq       8(%1), %%mm0    \n\t"
762            "movq       8(%1), %%mm1    \n\t"
763            "movq       8(%1), %%mm2    \n\t"
764            "pand          %2, %%mm0    \n\t"
765            "pand          %3, %%mm1    \n\t"
766            "pand          %4, %%mm2    \n\t"
767            "psllq         $5, %%mm0    \n\t"
768            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
769            "pmulhw        "MANGLE(mul15_mid)", %%mm1    \n\t"
770            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
771            "movq       %%mm0, %%mm3    \n\t"
772            "movq       %%mm1, %%mm4    \n\t"
773            "movq       %%mm2, %%mm5    \n\t"
774            "punpcklwd     %5, %%mm0    \n\t"
775            "punpcklwd     %5, %%mm1    \n\t"
776            "punpcklwd     %5, %%mm2    \n\t"
777            "punpckhwd     %5, %%mm3    \n\t"
778            "punpckhwd     %5, %%mm4    \n\t"
779            "punpckhwd     %5, %%mm5    \n\t"
780            "psllq         $8, %%mm1    \n\t"
781            "psllq        $16, %%mm2    \n\t"
782            "por        %%mm1, %%mm0    \n\t"
783            "por        %%mm2, %%mm0    \n\t"
784            "psllq         $8, %%mm4    \n\t"
785            "psllq        $16, %%mm5    \n\t"
786            "por        %%mm4, %%mm3    \n\t"
787            "por        %%mm5, %%mm3    \n\t"
788
789            :"=m"(*d)
790            :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
791             NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi)
792            :"memory");
793        /* borrowed 32 to 24 */
794        __asm__ volatile(
795            "movq       %%mm0, %%mm4    \n\t"
796            "movq       %%mm3, %%mm5    \n\t"
797            "movq       %%mm6, %%mm0    \n\t"
798            "movq       %%mm7, %%mm1    \n\t"
799
800            "movq       %%mm4, %%mm6    \n\t"
801            "movq       %%mm5, %%mm7    \n\t"
802            "movq       %%mm0, %%mm2    \n\t"
803            "movq       %%mm1, %%mm3    \n\t"
804
805            STORE_BGR24_MMX
806
807            :: "r"(d), "m"(*s)
808              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
809            :"memory");
810        d += 24;
811        s += 8;
812    }
813    __asm__ volatile(SFENCE:::"memory");
814    __asm__ volatile(EMMS:::"memory");
815    while (s < end) {
816        register uint16_t bgr;
817        bgr = *s++;
818        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
819        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
820        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
821    }
822}
823
824static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
825{
826    const uint16_t *end;
827    const uint16_t *mm_end;
828    uint8_t *d = (uint8_t *)dst;
829    const uint16_t *s = (const uint16_t *)src;
830    end = s + src_size/2;
831    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
832    mm_end = end - 7;
833    while (s < mm_end) {
834        __asm__ volatile(
835            PREFETCH"  32(%1)           \n\t"
836            "movq        (%1), %%mm0    \n\t"
837            "movq        (%1), %%mm1    \n\t"
838            "movq        (%1), %%mm2    \n\t"
839            "pand          %2, %%mm0    \n\t"
840            "pand          %3, %%mm1    \n\t"
841            "pand          %4, %%mm2    \n\t"
842            "psllq         $5, %%mm0    \n\t"
843            "psrlq         $1, %%mm2    \n\t"
844            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
845            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
846            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
847            "movq       %%mm0, %%mm3    \n\t"
848            "movq       %%mm1, %%mm4    \n\t"
849            "movq       %%mm2, %%mm5    \n\t"
850            "punpcklwd     %5, %%mm0    \n\t"
851            "punpcklwd     %5, %%mm1    \n\t"
852            "punpcklwd     %5, %%mm2    \n\t"
853            "punpckhwd     %5, %%mm3    \n\t"
854            "punpckhwd     %5, %%mm4    \n\t"
855            "punpckhwd     %5, %%mm5    \n\t"
856            "psllq         $8, %%mm1    \n\t"
857            "psllq        $16, %%mm2    \n\t"
858            "por        %%mm1, %%mm0    \n\t"
859            "por        %%mm2, %%mm0    \n\t"
860            "psllq         $8, %%mm4    \n\t"
861            "psllq        $16, %%mm5    \n\t"
862            "por        %%mm4, %%mm3    \n\t"
863            "por        %%mm5, %%mm3    \n\t"
864
865            "movq       %%mm0, %%mm6    \n\t"
866            "movq       %%mm3, %%mm7    \n\t"
867
868            "movq       8(%1), %%mm0    \n\t"
869            "movq       8(%1), %%mm1    \n\t"
870            "movq       8(%1), %%mm2    \n\t"
871            "pand          %2, %%mm0    \n\t"
872            "pand          %3, %%mm1    \n\t"
873            "pand          %4, %%mm2    \n\t"
874            "psllq         $5, %%mm0    \n\t"
875            "psrlq         $1, %%mm2    \n\t"
876            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
877            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
878            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
879            "movq       %%mm0, %%mm3    \n\t"
880            "movq       %%mm1, %%mm4    \n\t"
881            "movq       %%mm2, %%mm5    \n\t"
882            "punpcklwd     %5, %%mm0    \n\t"
883            "punpcklwd     %5, %%mm1    \n\t"
884            "punpcklwd     %5, %%mm2    \n\t"
885            "punpckhwd     %5, %%mm3    \n\t"
886            "punpckhwd     %5, %%mm4    \n\t"
887            "punpckhwd     %5, %%mm5    \n\t"
888            "psllq         $8, %%mm1    \n\t"
889            "psllq        $16, %%mm2    \n\t"
890            "por        %%mm1, %%mm0    \n\t"
891            "por        %%mm2, %%mm0    \n\t"
892            "psllq         $8, %%mm4    \n\t"
893            "psllq        $16, %%mm5    \n\t"
894            "por        %%mm4, %%mm3    \n\t"
895            "por        %%mm5, %%mm3    \n\t"
896            :"=m"(*d)
897            :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
898             NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi)
899            :"memory");
900        /* borrowed 32 to 24 */
901        __asm__ volatile(
902            "movq       %%mm0, %%mm4    \n\t"
903            "movq       %%mm3, %%mm5    \n\t"
904            "movq       %%mm6, %%mm0    \n\t"
905            "movq       %%mm7, %%mm1    \n\t"
906
907            "movq       %%mm4, %%mm6    \n\t"
908            "movq       %%mm5, %%mm7    \n\t"
909            "movq       %%mm0, %%mm2    \n\t"
910            "movq       %%mm1, %%mm3    \n\t"
911
912            STORE_BGR24_MMX
913
914            :: "r"(d), "m"(*s)
915              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
916            :"memory");
917        d += 24;
918        s += 8;
919    }
920    __asm__ volatile(SFENCE:::"memory");
921    __asm__ volatile(EMMS:::"memory");
922    while (s < end) {
923        register uint16_t bgr;
924        bgr = *s++;
925        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
926        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
927        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
928    }
929}
930
931/*
932 * mm0 = 00 B3 00 B2 00 B1 00 B0
933 * mm1 = 00 G3 00 G2 00 G1 00 G0
934 * mm2 = 00 R3 00 R2 00 R1 00 R0
935 * mm6 = FF FF FF FF FF FF FF FF
936 * mm7 = 00 00 00 00 00 00 00 00
937 */
938#define PACK_RGB32 \
939    "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
940    "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
941    "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
942    "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
943    "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
944    "movq       %%mm0, %%mm3    \n\t"                               \
945    "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
946    "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
947    MOVNTQ"     %%mm0,  (%0)    \n\t"                               \
948    MOVNTQ"     %%mm3, 8(%0)    \n\t"                               \
949
950static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
951{
952    const uint16_t *end;
953    const uint16_t *mm_end;
954    uint8_t *d = dst;
955    const uint16_t *s = (const uint16_t *)src;
956    end = s + src_size/2;
957    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
958    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
959    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
960    mm_end = end - 3;
961    while (s < mm_end) {
962        __asm__ volatile(
963            PREFETCH"  32(%1)           \n\t"
964            "movq        (%1), %%mm0    \n\t"
965            "movq        (%1), %%mm1    \n\t"
966            "movq        (%1), %%mm2    \n\t"
967            "pand          %2, %%mm0    \n\t"
968            "pand          %3, %%mm1    \n\t"
969            "pand          %4, %%mm2    \n\t"
970            "psllq         $5, %%mm0    \n\t"
971            "pmulhw        %5, %%mm0    \n\t"
972            "pmulhw        %5, %%mm1    \n\t"
973            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
974            PACK_RGB32
975            ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
976              NAMED_CONSTRAINTS_ADD(mul15_hi)
977            :"memory");
978        d += 16;
979        s += 4;
980    }
981    __asm__ volatile(SFENCE:::"memory");
982    __asm__ volatile(EMMS:::"memory");
983    while (s < end) {
984        register uint16_t bgr;
985        bgr = *s++;
986        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
987        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
988        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
989        *d++ = 255;
990    }
991}
992
993static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
994{
995    const uint16_t *end;
996    const uint16_t *mm_end;
997    uint8_t *d = dst;
998    const uint16_t *s = (const uint16_t*)src;
999    end = s + src_size/2;
1000    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
1001    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
1002    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
1003    mm_end = end - 3;
1004    while (s < mm_end) {
1005        __asm__ volatile(
1006            PREFETCH"  32(%1)           \n\t"
1007            "movq        (%1), %%mm0    \n\t"
1008            "movq        (%1), %%mm1    \n\t"
1009            "movq        (%1), %%mm2    \n\t"
1010            "pand          %2, %%mm0    \n\t"
1011            "pand          %3, %%mm1    \n\t"
1012            "pand          %4, %%mm2    \n\t"
1013            "psllq         $5, %%mm0    \n\t"
1014            "psrlq         $1, %%mm2    \n\t"
1015            "pmulhw        %5, %%mm0    \n\t"
1016            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
1017            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
1018            PACK_RGB32
1019            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
1020              NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi)
1021            :"memory");
1022        d += 16;
1023        s += 4;
1024    }
1025    __asm__ volatile(SFENCE:::"memory");
1026    __asm__ volatile(EMMS:::"memory");
1027    while (s < end) {
1028        register uint16_t bgr;
1029        bgr = *s++;
1030        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
1031        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
1032        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
1033        *d++ = 255;
1034    }
1035}
1036
1037static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
1038{
1039    x86_reg idx = 15 - src_size;
1040    const uint8_t *s = src-idx;
1041    uint8_t *d = dst-idx;
1042    __asm__ volatile(
1043        "test          %0, %0           \n\t"
1044        "jns           2f               \n\t"
1045        PREFETCH"       (%1, %0)        \n\t"
1046        "movq          %3, %%mm7        \n\t"
1047        "pxor          %4, %%mm7        \n\t"
1048        "movq       %%mm7, %%mm6        \n\t"
1049        "pxor          %5, %%mm7        \n\t"
1050        ".p2align       4               \n\t"
1051        "1:                             \n\t"
1052        PREFETCH"     32(%1, %0)        \n\t"
1053        "movq           (%1, %0), %%mm0 \n\t"
1054        "movq          8(%1, %0), %%mm1 \n\t"
1055# if COMPILE_TEMPLATE_MMXEXT
1056        "pshufw      $177, %%mm0, %%mm3 \n\t"
1057        "pshufw      $177, %%mm1, %%mm5 \n\t"
1058        "pand       %%mm7, %%mm0        \n\t"
1059        "pand       %%mm6, %%mm3        \n\t"
1060        "pand       %%mm7, %%mm1        \n\t"
1061        "pand       %%mm6, %%mm5        \n\t"
1062        "por        %%mm3, %%mm0        \n\t"
1063        "por        %%mm5, %%mm1        \n\t"
1064# else
1065        "movq       %%mm0, %%mm2        \n\t"
1066        "movq       %%mm1, %%mm4        \n\t"
1067        "pand       %%mm7, %%mm0        \n\t"
1068        "pand       %%mm6, %%mm2        \n\t"
1069        "pand       %%mm7, %%mm1        \n\t"
1070        "pand       %%mm6, %%mm4        \n\t"
1071        "movq       %%mm2, %%mm3        \n\t"
1072        "movq       %%mm4, %%mm5        \n\t"
1073        "pslld        $16, %%mm2        \n\t"
1074        "psrld        $16, %%mm3        \n\t"
1075        "pslld        $16, %%mm4        \n\t"
1076        "psrld        $16, %%mm5        \n\t"
1077        "por        %%mm2, %%mm0        \n\t"
1078        "por        %%mm4, %%mm1        \n\t"
1079        "por        %%mm3, %%mm0        \n\t"
1080        "por        %%mm5, %%mm1        \n\t"
1081# endif
1082        MOVNTQ"     %%mm0,  (%2, %0)    \n\t"
1083        MOVNTQ"     %%mm1, 8(%2, %0)    \n\t"
1084        "add          $16, %0           \n\t"
1085        "js            1b               \n\t"
1086        SFENCE"                         \n\t"
1087        EMMS"                           \n\t"
1088        "2:                             \n\t"
1089        : "+&r"(idx)
1090        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1091        : "memory");
1092    for (; idx<15; idx+=4) {
1093        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1094        v &= 0xff00ff;
1095        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1096    }
1097}
1098
1099static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
1100{
1101    unsigned i;
1102    x86_reg mmx_size= 23 - src_size;
1103    __asm__ volatile (
1104        "test             %%"REG_a", %%"REG_a"          \n\t"
1105        "jns                     2f                     \n\t"
1106        "movq     "MANGLE(mask24r)", %%mm5              \n\t"
1107        "movq     "MANGLE(mask24g)", %%mm6              \n\t"
1108        "movq     "MANGLE(mask24b)", %%mm7              \n\t"
1109        ".p2align                 4                     \n\t"
1110        "1:                                             \n\t"
1111        PREFETCH" 32(%1, %%"REG_a")                     \n\t"
1112        "movq       (%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
1113        "movq       (%1, %%"REG_a"), %%mm1              \n\t" // BGR BGR BG
1114        "movq      2(%1, %%"REG_a"), %%mm2              \n\t" // R BGR BGR B
1115        "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
1116        "pand                 %%mm5, %%mm0              \n\t"
1117        "pand                 %%mm6, %%mm1              \n\t"
1118        "pand                 %%mm7, %%mm2              \n\t"
1119        "por                  %%mm0, %%mm1              \n\t"
1120        "por                  %%mm2, %%mm1              \n\t"
1121        "movq      6(%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
1122        MOVNTQ"               %%mm1,   (%2, %%"REG_a")  \n\t" // RGB RGB RG
1123        "movq      8(%1, %%"REG_a"), %%mm1              \n\t" // R BGR BGR B
1124        "movq     10(%1, %%"REG_a"), %%mm2              \n\t" // GR BGR BGR
1125        "pand                 %%mm7, %%mm0              \n\t"
1126        "pand                 %%mm5, %%mm1              \n\t"
1127        "pand                 %%mm6, %%mm2              \n\t"
1128        "por                  %%mm0, %%mm1              \n\t"
1129        "por                  %%mm2, %%mm1              \n\t"
1130        "movq     14(%1, %%"REG_a"), %%mm0              \n\t" // R BGR BGR B
1131        MOVNTQ"               %%mm1,  8(%2, %%"REG_a")  \n\t" // B RGB RGB R
1132        "movq     16(%1, %%"REG_a"), %%mm1              \n\t" // GR BGR BGR
1133        "movq     18(%1, %%"REG_a"), %%mm2              \n\t" // BGR BGR BG
1134        "pand                 %%mm6, %%mm0              \n\t"
1135        "pand                 %%mm7, %%mm1              \n\t"
1136        "pand                 %%mm5, %%mm2              \n\t"
1137        "por                  %%mm0, %%mm1              \n\t"
1138        "por                  %%mm2, %%mm1              \n\t"
1139        MOVNTQ"               %%mm1, 16(%2, %%"REG_a")  \n\t"
1140        "add                    $24, %%"REG_a"          \n\t"
1141        " js                     1b                     \n\t"
1142        "2:                                             \n\t"
1143        : "+a" (mmx_size)
1144        : "r" (src-mmx_size), "r"(dst-mmx_size)
1145          NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b)
1146    );
1147
1148    __asm__ volatile(SFENCE:::"memory");
1149    __asm__ volatile(EMMS:::"memory");
1150
1151    if (mmx_size==23) return; //finished, was multiple of 8
1152
1153    src+= src_size;
1154    dst+= src_size;
1155    src_size= 23-mmx_size;
1156    src-= src_size;
1157    dst-= src_size;
1158    for (i=0; i<src_size; i+=3) {
1159        register uint8_t x;
1160        x          = src[i + 2];
1161        dst[i + 1] = src[i + 1];
1162        dst[i + 2] = src[i + 0];
1163        dst[i + 0] = x;
1164    }
1165}
1166
1167static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1168                                           int width, int height,
1169                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1170{
1171    int y;
1172    const x86_reg chromWidth= width>>1;
1173    for (y=0; y<height; y++) {
1174        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1175        __asm__ volatile(
1176            "xor                 %%"REG_a", %%"REG_a"   \n\t"
1177            ".p2align                    4              \n\t"
1178            "1:                                         \n\t"
1179            PREFETCH"    32(%1, %%"REG_a", 2)           \n\t"
1180            PREFETCH"    32(%2, %%"REG_a")              \n\t"
1181            PREFETCH"    32(%3, %%"REG_a")              \n\t"
1182            "movq          (%2, %%"REG_a"), %%mm0       \n\t" // U(0)
1183            "movq                    %%mm0, %%mm2       \n\t" // U(0)
1184            "movq          (%3, %%"REG_a"), %%mm1       \n\t" // V(0)
1185            "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
1186            "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)
1187
1188            "movq        (%1, %%"REG_a",2), %%mm3       \n\t" // Y(0)
1189            "movq       8(%1, %%"REG_a",2), %%mm5       \n\t" // Y(8)
1190            "movq                    %%mm3, %%mm4       \n\t" // Y(0)
1191            "movq                    %%mm5, %%mm6       \n\t" // Y(8)
1192            "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
1193            "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
1194            "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
1195            "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)
1196
1197            MOVNTQ"                  %%mm3,   (%0, %%"REG_a", 4)    \n\t"
1198            MOVNTQ"                  %%mm4,  8(%0, %%"REG_a", 4)    \n\t"
1199            MOVNTQ"                  %%mm5, 16(%0, %%"REG_a", 4)    \n\t"
1200            MOVNTQ"                  %%mm6, 24(%0, %%"REG_a", 4)    \n\t"
1201
1202            "add                        $8, %%"REG_a"   \n\t"
1203            "cmp                        %4, %%"REG_a"   \n\t"
1204            " jb                        1b              \n\t"
1205            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1206            : "%"REG_a
1207        );
1208        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1209            usrc += chromStride;
1210            vsrc += chromStride;
1211        }
1212        ysrc += lumStride;
1213        dst  += dstStride;
1214    }
1215    __asm__(EMMS"       \n\t"
1216            SFENCE"     \n\t"
1217            :::"memory");
1218}
1219
1220/**
1221 * Height should be a multiple of 2 and width should be a multiple of 16.
1222 * (If this is a problem for anyone then tell me, and I will fix it.)
1223 */
1224static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1225                                      int width, int height,
1226                                      int lumStride, int chromStride, int dstStride)
1227{
1228    //FIXME interpolate chroma
1229    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1230}
1231
1232static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1233                                           int width, int height,
1234                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1235{
1236    int y;
1237    const x86_reg chromWidth= width>>1;
1238    for (y=0; y<height; y++) {
1239        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1240        __asm__ volatile(
1241            "xor                %%"REG_a", %%"REG_a"    \n\t"
1242            ".p2align                   4               \n\t"
1243            "1:                                         \n\t"
1244            PREFETCH"   32(%1, %%"REG_a", 2)            \n\t"
1245            PREFETCH"   32(%2, %%"REG_a")               \n\t"
1246            PREFETCH"   32(%3, %%"REG_a")               \n\t"
1247            "movq         (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
1248            "movq                   %%mm0, %%mm2        \n\t" // U(0)
1249            "movq         (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
1250            "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
1251            "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)
1252
1253            "movq       (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
1254            "movq      8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
1255            "movq                   %%mm0, %%mm4        \n\t" // Y(0)
1256            "movq                   %%mm2, %%mm6        \n\t" // Y(8)
1257            "punpcklbw              %%mm3, %%mm0        \n\t" // YUYV YUYV(0)
1258            "punpckhbw              %%mm3, %%mm4        \n\t" // YUYV YUYV(4)
1259            "punpcklbw              %%mm5, %%mm2        \n\t" // YUYV YUYV(8)
1260            "punpckhbw              %%mm5, %%mm6        \n\t" // YUYV YUYV(12)
1261
1262            MOVNTQ"                 %%mm0,   (%0, %%"REG_a", 4)     \n\t"
1263            MOVNTQ"                 %%mm4,  8(%0, %%"REG_a", 4)     \n\t"
1264            MOVNTQ"                 %%mm2, 16(%0, %%"REG_a", 4)     \n\t"
1265            MOVNTQ"                 %%mm6, 24(%0, %%"REG_a", 4)     \n\t"
1266
1267            "add                       $8, %%"REG_a"    \n\t"
1268            "cmp                       %4, %%"REG_a"    \n\t"
1269            " jb                       1b               \n\t"
1270            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1271            : "%"REG_a
1272        );
1273        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1274            usrc += chromStride;
1275            vsrc += chromStride;
1276        }
1277        ysrc += lumStride;
1278        dst += dstStride;
1279    }
1280    __asm__(EMMS"       \n\t"
1281            SFENCE"     \n\t"
1282            :::"memory");
1283}
1284
1285/**
1286 * Height should be a multiple of 2 and width should be a multiple of 16
1287 * (If this is a problem for anyone then tell me, and I will fix it.)
1288 */
1289static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1290                                      int width, int height,
1291                                      int lumStride, int chromStride, int dstStride)
1292{
1293    //FIXME interpolate chroma
1294    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1295}
1296
1297/**
1298 * Width should be a multiple of 16.
1299 */
1300static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1301                                         int width, int height,
1302                                         int lumStride, int chromStride, int dstStride)
1303{
1304    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1305}
1306
1307/**
1308 * Width should be a multiple of 16.
1309 */
1310static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1311                                         int width, int height,
1312                                         int lumStride, int chromStride, int dstStride)
1313{
1314    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1315}
1316
1317/**
1318 * Height should be a multiple of 2 and width should be a multiple of 16.
1319 * (If this is a problem for anyone then tell me, and I will fix it.)
1320 */
1321static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1322                                      int width, int height,
1323                                      int lumStride, int chromStride, int srcStride)
1324{
1325    int y;
1326    const x86_reg chromWidth= width>>1;
1327    for (y=0; y<height; y+=2) {
1328        __asm__ volatile(
1329            "xor                 %%"REG_a", %%"REG_a"   \n\t"
1330            "pcmpeqw                 %%mm7, %%mm7       \n\t"
1331            "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
1332            ".p2align                    4              \n\t"
1333            "1:                \n\t"
1334            PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
1335            "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
1336            "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
1337            "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
1338            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
1339            "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
1340            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
1341            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
1342            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
1343            "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
1344            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)
1345
1346            MOVNTQ"                  %%mm2, (%1, %%"REG_a", 2)  \n\t"
1347
1348            "movq     16(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
1349            "movq     24(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
1350            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
1351            "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
1352            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
1353            "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
1354            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
1355            "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
1356            "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
1357            "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)
1358
1359            MOVNTQ"                  %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1360
1361            "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
1362            "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
1363            "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
1364            "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
1365            "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
1366            "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
1367            "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
1368            "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)
1369
1370            MOVNTQ"                  %%mm0, (%3, %%"REG_a")     \n\t"
1371            MOVNTQ"                  %%mm2, (%2, %%"REG_a")     \n\t"
1372
1373            "add                        $8, %%"REG_a"   \n\t"
1374            "cmp                        %4, %%"REG_a"   \n\t"
1375            " jb                        1b              \n\t"
1376            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1377            : "memory", "%"REG_a
1378        );
1379
1380        ydst += lumStride;
1381        src  += srcStride;
1382
1383        __asm__ volatile(
1384            "xor                 %%"REG_a", %%"REG_a"   \n\t"
1385            ".p2align                    4              \n\t"
1386            "1:                                         \n\t"
1387            PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
1388            "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
1389            "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
1390            "movq     16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
1391            "movq     24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
1392            "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
1393            "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
1394            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
1395            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
1396            "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
1397            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)
1398
1399            MOVNTQ"                  %%mm0,  (%1, %%"REG_a", 2) \n\t"
1400            MOVNTQ"                  %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1401
1402            "add                        $8, %%"REG_a"   \n\t"
1403            "cmp                        %4, %%"REG_a"   \n\t"
1404            " jb                        1b              \n\t"
1405
1406            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1407            : "memory", "%"REG_a
1408        );
1409        udst += chromStride;
1410        vdst += chromStride;
1411        ydst += lumStride;
1412        src  += srcStride;
1413    }
1414    __asm__ volatile(EMMS"       \n\t"
1415                     SFENCE"     \n\t"
1416                     :::"memory");
1417}
1418#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1419
1420#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1421static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1422{
1423    int x,y;
1424
1425    dst[0]= src[0];
1426
1427    // first line
1428    for (x=0; x<srcWidth-1; x++) {
1429        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1430        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1431    }
1432    dst[2*srcWidth-1]= src[srcWidth-1];
1433
1434    dst+= dstStride;
1435
1436    for (y=1; y<srcHeight; y++) {
1437        const x86_reg mmxSize= srcWidth&~15;
1438        __asm__ volatile(
1439            "mov           %4, %%"REG_a"            \n\t"
1440            "movq        "MANGLE(mmx_ff)", %%mm0    \n\t"
1441            "movq         (%0, %%"REG_a"), %%mm4    \n\t"
1442            "movq                   %%mm4, %%mm2    \n\t"
1443            "psllq                     $8, %%mm4    \n\t"
1444            "pand                   %%mm0, %%mm2    \n\t"
1445            "por                    %%mm2, %%mm4    \n\t"
1446            "movq         (%1, %%"REG_a"), %%mm5    \n\t"
1447            "movq                   %%mm5, %%mm3    \n\t"
1448            "psllq                     $8, %%mm5    \n\t"
1449            "pand                   %%mm0, %%mm3    \n\t"
1450            "por                    %%mm3, %%mm5    \n\t"
1451            "1:                                     \n\t"
1452            "movq         (%0, %%"REG_a"), %%mm0    \n\t"
1453            "movq         (%1, %%"REG_a"), %%mm1    \n\t"
1454            "movq        1(%0, %%"REG_a"), %%mm2    \n\t"
1455            "movq        1(%1, %%"REG_a"), %%mm3    \n\t"
1456            PAVGB"                  %%mm0, %%mm5    \n\t"
1457            PAVGB"                  %%mm0, %%mm3    \n\t"
1458            PAVGB"                  %%mm0, %%mm5    \n\t"
1459            PAVGB"                  %%mm0, %%mm3    \n\t"
1460            PAVGB"                  %%mm1, %%mm4    \n\t"
1461            PAVGB"                  %%mm1, %%mm2    \n\t"
1462            PAVGB"                  %%mm1, %%mm4    \n\t"
1463            PAVGB"                  %%mm1, %%mm2    \n\t"
1464            "movq                   %%mm5, %%mm7    \n\t"
1465            "movq                   %%mm4, %%mm6    \n\t"
1466            "punpcklbw              %%mm3, %%mm5    \n\t"
1467            "punpckhbw              %%mm3, %%mm7    \n\t"
1468            "punpcklbw              %%mm2, %%mm4    \n\t"
1469            "punpckhbw              %%mm2, %%mm6    \n\t"
1470            MOVNTQ"                 %%mm5,  (%2, %%"REG_a", 2)  \n\t"
1471            MOVNTQ"                 %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
1472            MOVNTQ"                 %%mm4,  (%3, %%"REG_a", 2)  \n\t"
1473            MOVNTQ"                 %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
1474            "add                       $8, %%"REG_a"            \n\t"
1475            "movq       -1(%0, %%"REG_a"), %%mm4    \n\t"
1476            "movq       -1(%1, %%"REG_a"), %%mm5    \n\t"
1477            " js                       1b                       \n\t"
1478            :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1479               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1480               "g" (-mmxSize)
1481               NAMED_CONSTRAINTS_ADD(mmx_ff)
1482            : "%"REG_a
1483        );
1484
1485        for (x=mmxSize-1; x<srcWidth-1; x++) {
1486            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1487            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1488            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1489            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1490        }
1491        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1492        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1493
1494        dst+=dstStride*2;
1495        src+=srcStride;
1496    }
1497
1498    // last line
1499    dst[0]= src[0];
1500
1501    for (x=0; x<srcWidth-1; x++) {
1502        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1503        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1504    }
1505    dst[2*srcWidth-1]= src[srcWidth-1];
1506
1507    __asm__ volatile(EMMS"       \n\t"
1508                     SFENCE"     \n\t"
1509                     :::"memory");
1510}
1511#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
1512
1513#if !COMPILE_TEMPLATE_AMD3DNOW
1514/**
1515 * Height should be a multiple of 2 and width should be a multiple of 16.
1516 * (If this is a problem for anyone then tell me, and I will fix it.)
1517 * Chrominance data is only taken from every second line, others are ignored.
1518 * FIXME: Write HQ version.
1519 */
1520static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1521                                      int width, int height,
1522                                      int lumStride, int chromStride, int srcStride)
1523{
1524    int y;
1525    const x86_reg chromWidth= width>>1;
1526    for (y=0; y<height; y+=2) {
1527        __asm__ volatile(
1528            "xor                 %%"REG_a", %%"REG_a"   \n\t"
1529            "pcmpeqw             %%mm7, %%mm7   \n\t"
1530            "psrlw                  $8, %%mm7   \n\t" // FF,00,FF,00...
1531            ".p2align                4          \n\t"
1532            "1:                                 \n\t"
1533            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
1534            "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
1535            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
1536            "movq                %%mm0, %%mm2   \n\t" // UYVY UYVY(0)
1537            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(4)
1538            "pand                %%mm7, %%mm0   \n\t" // U0V0 U0V0(0)
1539            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(4)
1540            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(0)
1541            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(4)
1542            "packuswb            %%mm1, %%mm0   \n\t" // UVUV UVUV(0)
1543            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(0)
1544
1545            MOVNTQ"              %%mm2,  (%1, %%"REG_a", 2) \n\t"
1546
1547            "movq     16(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(8)
1548            "movq     24(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(12)
1549            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(8)
1550            "movq                %%mm2, %%mm4   \n\t" // UYVY UYVY(12)
1551            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(8)
1552            "pand                %%mm7, %%mm2   \n\t" // U0V0 U0V0(12)
1553            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(8)
1554            "psrlw                  $8, %%mm4   \n\t" // Y0Y0 Y0Y0(12)
1555            "packuswb            %%mm2, %%mm1   \n\t" // UVUV UVUV(8)
1556            "packuswb            %%mm4, %%mm3   \n\t" // YYYY YYYY(8)
1557
1558            MOVNTQ"              %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1559
1560            "movq                %%mm0, %%mm2   \n\t" // UVUV UVUV(0)
1561            "movq                %%mm1, %%mm3   \n\t" // UVUV UVUV(8)
1562            "psrlw                  $8, %%mm0   \n\t" // V0V0 V0V0(0)
1563            "psrlw                  $8, %%mm1   \n\t" // V0V0 V0V0(8)
1564            "pand                %%mm7, %%mm2   \n\t" // U0U0 U0U0(0)
1565            "pand                %%mm7, %%mm3   \n\t" // U0U0 U0U0(8)
1566            "packuswb            %%mm1, %%mm0   \n\t" // VVVV VVVV(0)
1567            "packuswb            %%mm3, %%mm2   \n\t" // UUUU UUUU(0)
1568
1569            MOVNTQ"              %%mm0, (%3, %%"REG_a") \n\t"
1570            MOVNTQ"              %%mm2, (%2, %%"REG_a") \n\t"
1571
1572            "add                    $8, %%"REG_a"   \n\t"
1573            "cmp                    %4, %%"REG_a"   \n\t"
1574            " jb                    1b          \n\t"
1575            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1576            : "memory", "%"REG_a
1577        );
1578
1579        ydst += lumStride;
1580        src  += srcStride;
1581
1582        __asm__ volatile(
1583            "xor                 %%"REG_a", %%"REG_a"   \n\t"
1584            ".p2align                    4              \n\t"
1585            "1:                                 \n\t"
1586            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
1587            "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // YUYV YUYV(0)
1588            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // YUYV YUYV(4)
1589            "movq     16(%0, %%"REG_a", 4), %%mm2   \n\t" // YUYV YUYV(8)
1590            "movq     24(%0, %%"REG_a", 4), %%mm3   \n\t" // YUYV YUYV(12)
1591            "psrlw                  $8, %%mm0   \n\t" // Y0Y0 Y0Y0(0)
1592            "psrlw                  $8, %%mm1   \n\t" // Y0Y0 Y0Y0(4)
1593            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(8)
1594            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(12)
1595            "packuswb            %%mm1, %%mm0   \n\t" // YYYY YYYY(0)
1596            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(8)
1597
1598            MOVNTQ"              %%mm0,  (%1, %%"REG_a", 2) \n\t"
1599            MOVNTQ"              %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1600
1601            "add                    $8, %%"REG_a"   \n\t"
1602            "cmp                    %4, %%"REG_a"   \n\t"
1603            " jb                    1b          \n\t"
1604
1605            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1606            : "memory", "%"REG_a
1607        );
1608        udst += chromStride;
1609        vdst += chromStride;
1610        ydst += lumStride;
1611        src  += srcStride;
1612    }
1613    __asm__ volatile(EMMS"       \n\t"
1614                     SFENCE"     \n\t"
1615                     :::"memory");
1616}
1617#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1618
1619/**
1620 * Height should be a multiple of 2 and width should be a multiple of 2.
1621 * (If this is a problem for anyone then tell me, and I will fix it.)
1622 * Chrominance data is only taken from every second line,
1623 * others are ignored in the C version.
1624 * FIXME: Write HQ version.
1625 */
1626#if HAVE_7REGS
1627static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1628                                       int width, int height,
1629                                       int lumStride, int chromStride, int srcStride,
1630                                       int32_t *rgb2yuv)
1631{
1632#define BGR2Y_IDX "16*4+16*32"
1633#define BGR2U_IDX "16*4+16*33"
1634#define BGR2V_IDX "16*4+16*34"
1635    int y;
1636    const x86_reg chromWidth= width>>1;
1637    for (y=0; y<height-2; y+=2) {
1638        int i;
1639        for (i=0; i<2; i++) {
1640            __asm__ volatile(
1641                "mov                        %2, %%"REG_a"   \n\t"
1642                "movq          "BGR2Y_IDX"(%3), %%mm6       \n\t"
1643                "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
1644                "pxor                    %%mm7, %%mm7       \n\t"
1645                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
1646                ".p2align                    4              \n\t"
1647                "1:                                         \n\t"
1648                PREFETCH"    64(%0, %%"REG_d")              \n\t"
1649                "movd          (%0, %%"REG_d"), %%mm0       \n\t"
1650                "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
1651                "punpcklbw               %%mm7, %%mm0       \n\t"
1652                "punpcklbw               %%mm7, %%mm1       \n\t"
1653                "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
1654                "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
1655                "punpcklbw               %%mm7, %%mm2       \n\t"
1656                "punpcklbw               %%mm7, %%mm3       \n\t"
1657                "pmaddwd                 %%mm6, %%mm0       \n\t"
1658                "pmaddwd                 %%mm6, %%mm1       \n\t"
1659                "pmaddwd                 %%mm6, %%mm2       \n\t"
1660                "pmaddwd                 %%mm6, %%mm3       \n\t"
1661                "psrad                      $8, %%mm0       \n\t"
1662                "psrad                      $8, %%mm1       \n\t"
1663                "psrad                      $8, %%mm2       \n\t"
1664                "psrad                      $8, %%mm3       \n\t"
1665                "packssdw                %%mm1, %%mm0       \n\t"
1666                "packssdw                %%mm3, %%mm2       \n\t"
1667                "pmaddwd                 %%mm5, %%mm0       \n\t"
1668                "pmaddwd                 %%mm5, %%mm2       \n\t"
1669                "packssdw                %%mm2, %%mm0       \n\t"
1670                "psraw                      $7, %%mm0       \n\t"
1671
1672                "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
1673                "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
1674                "punpcklbw               %%mm7, %%mm4       \n\t"
1675                "punpcklbw               %%mm7, %%mm1       \n\t"
1676                "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
1677                "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
1678                "punpcklbw               %%mm7, %%mm2       \n\t"
1679                "punpcklbw               %%mm7, %%mm3       \n\t"
1680                "pmaddwd                 %%mm6, %%mm4       \n\t"
1681                "pmaddwd                 %%mm6, %%mm1       \n\t"
1682                "pmaddwd                 %%mm6, %%mm2       \n\t"
1683                "pmaddwd                 %%mm6, %%mm3       \n\t"
1684                "psrad                      $8, %%mm4       \n\t"
1685                "psrad                      $8, %%mm1       \n\t"
1686                "psrad                      $8, %%mm2       \n\t"
1687                "psrad                      $8, %%mm3       \n\t"
1688                "packssdw                %%mm1, %%mm4       \n\t"
1689                "packssdw                %%mm3, %%mm2       \n\t"
1690                "pmaddwd                 %%mm5, %%mm4       \n\t"
1691                "pmaddwd                 %%mm5, %%mm2       \n\t"
1692                "add                       $24, %%"REG_d"   \n\t"
1693                "packssdw                %%mm2, %%mm4       \n\t"
1694                "psraw                      $7, %%mm4       \n\t"
1695
1696                "packuswb                %%mm4, %%mm0       \n\t"
1697                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
1698
1699                MOVNTQ"                  %%mm0, (%1, %%"REG_a") \n\t"
1700                "add                        $8,      %%"REG_a"  \n\t"
1701                " js                        1b                  \n\t"
1702                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
1703                  NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset)
1704                : "%"REG_a, "%"REG_d
1705            );
1706            ydst += lumStride;
1707            src  += srcStride;
1708        }
1709        src -= srcStride*2;
1710        __asm__ volatile(
1711            "mov                        %4, %%"REG_a"   \n\t"
1712            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
1713            "movq          "BGR2U_IDX"(%5), %%mm6       \n\t"
1714            "pxor                    %%mm7, %%mm7       \n\t"
1715            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
1716            "add                 %%"REG_d", %%"REG_d"   \n\t"
1717            ".p2align                    4              \n\t"
1718            "1:                                         \n\t"
1719            PREFETCH"    64(%0, %%"REG_d")              \n\t"
1720            PREFETCH"    64(%1, %%"REG_d")              \n\t"
1721#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1722            "movq          (%0, %%"REG_d"), %%mm0       \n\t"
1723            "movq          (%1, %%"REG_d"), %%mm1       \n\t"
1724            "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
1725            "movq         6(%1, %%"REG_d"), %%mm3       \n\t"
1726            PAVGB"                   %%mm1, %%mm0       \n\t"
1727            PAVGB"                   %%mm3, %%mm2       \n\t"
1728            "movq                    %%mm0, %%mm1       \n\t"
1729            "movq                    %%mm2, %%mm3       \n\t"
1730            "psrlq                     $24, %%mm0       \n\t"
1731            "psrlq                     $24, %%mm2       \n\t"
1732            PAVGB"                   %%mm1, %%mm0       \n\t"
1733            PAVGB"                   %%mm3, %%mm2       \n\t"
1734            "punpcklbw               %%mm7, %%mm0       \n\t"
1735            "punpcklbw               %%mm7, %%mm2       \n\t"
1736#else
1737            "movd          (%0, %%"REG_d"), %%mm0       \n\t"
1738            "movd          (%1, %%"REG_d"), %%mm1       \n\t"
1739            "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
1740            "movd         3(%1, %%"REG_d"), %%mm3       \n\t"
1741            "punpcklbw               %%mm7, %%mm0       \n\t"
1742            "punpcklbw               %%mm7, %%mm1       \n\t"
1743            "punpcklbw               %%mm7, %%mm2       \n\t"
1744            "punpcklbw               %%mm7, %%mm3       \n\t"
1745            "paddw                   %%mm1, %%mm0       \n\t"
1746            "paddw                   %%mm3, %%mm2       \n\t"
1747            "paddw                   %%mm2, %%mm0       \n\t"
1748            "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
1749            "movd         6(%1, %%"REG_d"), %%mm1       \n\t"
1750            "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
1751            "movd         9(%1, %%"REG_d"), %%mm3       \n\t"
1752            "punpcklbw               %%mm7, %%mm4       \n\t"
1753            "punpcklbw               %%mm7, %%mm1       \n\t"
1754            "punpcklbw               %%mm7, %%mm2       \n\t"
1755            "punpcklbw               %%mm7, %%mm3       \n\t"
1756            "paddw                   %%mm1, %%mm4       \n\t"
1757            "paddw                   %%mm3, %%mm2       \n\t"
1758            "paddw                   %%mm4, %%mm2       \n\t"
1759            "psrlw                      $2, %%mm0       \n\t"
1760            "psrlw                      $2, %%mm2       \n\t"
1761#endif
1762            "movq          "BGR2V_IDX"(%5), %%mm1       \n\t"
1763            "movq          "BGR2V_IDX"(%5), %%mm3       \n\t"
1764
1765            "pmaddwd                 %%mm0, %%mm1       \n\t"
1766            "pmaddwd                 %%mm2, %%mm3       \n\t"
1767            "pmaddwd                 %%mm6, %%mm0       \n\t"
1768            "pmaddwd                 %%mm6, %%mm2       \n\t"
1769            "psrad                      $8, %%mm0       \n\t"
1770            "psrad                      $8, %%mm1       \n\t"
1771            "psrad                      $8, %%mm2       \n\t"
1772            "psrad                      $8, %%mm3       \n\t"
1773            "packssdw                %%mm2, %%mm0       \n\t"
1774            "packssdw                %%mm3, %%mm1       \n\t"
1775            "pmaddwd                 %%mm5, %%mm0       \n\t"
1776            "pmaddwd                 %%mm5, %%mm1       \n\t"
1777            "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
1778            "psraw                      $7, %%mm0       \n\t"
1779
1780#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1781            "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
1782            "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
1783            "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
1784            "movq        18(%1, %%"REG_d"), %%mm3       \n\t"
1785            PAVGB"                   %%mm1, %%mm4       \n\t"
1786            PAVGB"                   %%mm3, %%mm2       \n\t"
1787            "movq                    %%mm4, %%mm1       \n\t"
1788            "movq                    %%mm2, %%mm3       \n\t"
1789            "psrlq                     $24, %%mm4       \n\t"
1790            "psrlq                     $24, %%mm2       \n\t"
1791            PAVGB"                   %%mm1, %%mm4       \n\t"
1792            PAVGB"                   %%mm3, %%mm2       \n\t"
1793            "punpcklbw               %%mm7, %%mm4       \n\t"
1794            "punpcklbw               %%mm7, %%mm2       \n\t"
1795#else
1796            "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
1797            "movd        12(%1, %%"REG_d"), %%mm1       \n\t"
1798            "movd        15(%0, %%"REG_d"), %%mm2       \n\t"
1799            "movd        15(%1, %%"REG_d"), %%mm3       \n\t"
1800            "punpcklbw               %%mm7, %%mm4       \n\t"
1801            "punpcklbw               %%mm7, %%mm1       \n\t"
1802            "punpcklbw               %%mm7, %%mm2       \n\t"
1803            "punpcklbw               %%mm7, %%mm3       \n\t"
1804            "paddw                   %%mm1, %%mm4       \n\t"
1805            "paddw                   %%mm3, %%mm2       \n\t"
1806            "paddw                   %%mm2, %%mm4       \n\t"
1807            "movd        18(%0, %%"REG_d"), %%mm5       \n\t"
1808            "movd        18(%1, %%"REG_d"), %%mm1       \n\t"
1809            "movd        21(%0, %%"REG_d"), %%mm2       \n\t"
1810            "movd        21(%1, %%"REG_d"), %%mm3       \n\t"
1811            "punpcklbw               %%mm7, %%mm5       \n\t"
1812            "punpcklbw               %%mm7, %%mm1       \n\t"
1813            "punpcklbw               %%mm7, %%mm2       \n\t"
1814            "punpcklbw               %%mm7, %%mm3       \n\t"
1815            "paddw                   %%mm1, %%mm5       \n\t"
1816            "paddw                   %%mm3, %%mm2       \n\t"
1817            "paddw                   %%mm5, %%mm2       \n\t"
1818            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
1819            "psrlw                      $2, %%mm4       \n\t"
1820            "psrlw                      $2, %%mm2       \n\t"
1821#endif
1822            "movq          "BGR2V_IDX"(%5), %%mm1       \n\t"
1823            "movq          "BGR2V_IDX"(%5), %%mm3       \n\t"
1824
1825            "pmaddwd                 %%mm4, %%mm1       \n\t"
1826            "pmaddwd                 %%mm2, %%mm3       \n\t"
1827            "pmaddwd                 %%mm6, %%mm4       \n\t"
1828            "pmaddwd                 %%mm6, %%mm2       \n\t"
1829            "psrad                      $8, %%mm4       \n\t"
1830            "psrad                      $8, %%mm1       \n\t"
1831            "psrad                      $8, %%mm2       \n\t"
1832            "psrad                      $8, %%mm3       \n\t"
1833            "packssdw                %%mm2, %%mm4       \n\t"
1834            "packssdw                %%mm3, %%mm1       \n\t"
1835            "pmaddwd                 %%mm5, %%mm4       \n\t"
1836            "pmaddwd                 %%mm5, %%mm1       \n\t"
1837            "add                       $24, %%"REG_d"   \n\t"
1838            "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
1839            "psraw                      $7, %%mm4       \n\t"
1840
1841            "movq                    %%mm0, %%mm1           \n\t"
1842            "punpckldq               %%mm4, %%mm0           \n\t"
1843            "punpckhdq               %%mm4, %%mm1           \n\t"
1844            "packsswb                %%mm1, %%mm0           \n\t"
1845            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0         \n\t"
1846            "movd                    %%mm0, (%2, %%"REG_a") \n\t"
1847            "punpckhdq               %%mm0, %%mm0           \n\t"
1848            "movd                    %%mm0, (%3, %%"REG_a") \n\t"
1849            "add                        $4, %%"REG_a"       \n\t"
1850            " js                        1b                  \n\t"
1851            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
1852              NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset)
1853            : "%"REG_a, "%"REG_d
1854        );
1855
1856        udst += chromStride;
1857        vdst += chromStride;
1858        src  += srcStride*2;
1859    }
1860
1861    __asm__ volatile(EMMS"       \n\t"
1862                     SFENCE"     \n\t"
1863                     :::"memory");
1864
1865     ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
1866}
1867#endif /* HAVE_7REGS */
1868#endif /* !COMPILE_TEMPLATE_SSE2 */
1869
1870#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
1871static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
1872                                    int width, int height, int src1Stride,
1873                                    int src2Stride, int dstStride)
1874{
1875    int h;
1876
1877    for (h=0; h < height; h++) {
1878        int w;
1879
1880#if COMPILE_TEMPLATE_SSE2
1881        __asm__(
1882            "xor              %%"REG_a", %%"REG_a"  \n\t"
1883            "1:                                     \n\t"
1884            PREFETCH" 64(%1, %%"REG_a")             \n\t"
1885            PREFETCH" 64(%2, %%"REG_a")             \n\t"
1886            "movdqa     (%1, %%"REG_a"), %%xmm0     \n\t"
1887            "movdqa     (%1, %%"REG_a"), %%xmm1     \n\t"
1888            "movdqa     (%2, %%"REG_a"), %%xmm2     \n\t"
1889            "punpcklbw           %%xmm2, %%xmm0     \n\t"
1890            "punpckhbw           %%xmm2, %%xmm1     \n\t"
1891            "movntdq             %%xmm0,   (%0, %%"REG_a", 2)   \n\t"
1892            "movntdq             %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
1893            "add                    $16, %%"REG_a"  \n\t"
1894            "cmp                     %3, %%"REG_a"  \n\t"
1895            " jb                     1b             \n\t"
1896            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1897            : "memory", "%"REG_a""
1898        );
1899#else
1900        __asm__(
1901            "xor %%"REG_a", %%"REG_a"               \n\t"
1902            "1:                                     \n\t"
1903            PREFETCH" 64(%1, %%"REG_a")             \n\t"
1904            PREFETCH" 64(%2, %%"REG_a")             \n\t"
1905            "movq       (%1, %%"REG_a"), %%mm0      \n\t"
1906            "movq      8(%1, %%"REG_a"), %%mm2      \n\t"
1907            "movq                 %%mm0, %%mm1      \n\t"
1908            "movq                 %%mm2, %%mm3      \n\t"
1909            "movq       (%2, %%"REG_a"), %%mm4      \n\t"
1910            "movq      8(%2, %%"REG_a"), %%mm5      \n\t"
1911            "punpcklbw            %%mm4, %%mm0      \n\t"
1912            "punpckhbw            %%mm4, %%mm1      \n\t"
1913            "punpcklbw            %%mm5, %%mm2      \n\t"
1914            "punpckhbw            %%mm5, %%mm3      \n\t"
1915            MOVNTQ"               %%mm0,   (%0, %%"REG_a", 2)   \n\t"
1916            MOVNTQ"               %%mm1,  8(%0, %%"REG_a", 2)   \n\t"
1917            MOVNTQ"               %%mm2, 16(%0, %%"REG_a", 2)   \n\t"
1918            MOVNTQ"               %%mm3, 24(%0, %%"REG_a", 2)   \n\t"
1919            "add                    $16, %%"REG_a"  \n\t"
1920            "cmp                     %3, %%"REG_a"  \n\t"
1921            " jb                     1b             \n\t"
1922            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1923            : "memory", "%"REG_a
1924        );
1925#endif
1926        for (w= (width&(~15)); w < width; w++) {
1927            dest[2*w+0] = src1[w];
1928            dest[2*w+1] = src2[w];
1929        }
1930        dest += dstStride;
1931        src1 += src1Stride;
1932        src2 += src2Stride;
1933    }
1934    __asm__(
1935            EMMS"       \n\t"
1936            SFENCE"     \n\t"
1937            ::: "memory"
1938            );
1939}
1940#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
1941
1942#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
1943#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
1944void RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1945                         const uint8_t *unused,
1946                         const uint8_t *src1,
1947                         const uint8_t *src2,
1948                         int w,
1949                         uint32_t *unused2);
1950static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
1951                                      int width, int height, int srcStride,
1952                                      int dst1Stride, int dst2Stride)
1953{
1954    int h;
1955
1956    for (h = 0; h < height; h++) {
1957        RENAME(ff_nv12ToUV)(dst1, dst2, NULL, src, NULL, width, NULL);
1958        src  += srcStride;
1959        dst1 += dst1Stride;
1960        dst2 += dst2Stride;
1961    }
1962    __asm__(
1963            EMMS"       \n\t"
1964            SFENCE"     \n\t"
1965            ::: "memory"
1966            );
1967}
1968#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1969#endif /* !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL */
1970
1971#if !COMPILE_TEMPLATE_SSE2
1972#if !COMPILE_TEMPLATE_AMD3DNOW
1973static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
1974                                       uint8_t *dst1, uint8_t *dst2,
1975                                       int width, int height,
1976                                       int srcStride1, int srcStride2,
1977                                       int dstStride1, int dstStride2)
1978{
1979    x86_reg x, y;
1980    int w,h;
1981    w=width/2; h=height/2;
1982    __asm__ volatile(
1983        PREFETCH" %0    \n\t"
1984        PREFETCH" %1    \n\t"
1985        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
1986    for (y=0;y<h;y++) {
1987        const uint8_t* s1=src1+srcStride1*(y>>1);
1988        uint8_t* d=dst1+dstStride1*y;
1989        x=0;
1990        for (;x<w-31;x+=32) {
1991            __asm__ volatile(
1992                PREFETCH"   32(%1,%2)        \n\t"
1993                "movq         (%1,%2), %%mm0 \n\t"
1994                "movq        8(%1,%2), %%mm2 \n\t"
1995                "movq       16(%1,%2), %%mm4 \n\t"
1996                "movq       24(%1,%2), %%mm6 \n\t"
1997                "movq      %%mm0, %%mm1 \n\t"
1998                "movq      %%mm2, %%mm3 \n\t"
1999                "movq      %%mm4, %%mm5 \n\t"
2000                "movq      %%mm6, %%mm7 \n\t"
2001                "punpcklbw %%mm0, %%mm0 \n\t"
2002                "punpckhbw %%mm1, %%mm1 \n\t"
2003                "punpcklbw %%mm2, %%mm2 \n\t"
2004                "punpckhbw %%mm3, %%mm3 \n\t"
2005                "punpcklbw %%mm4, %%mm4 \n\t"
2006                "punpckhbw %%mm5, %%mm5 \n\t"
2007                "punpcklbw %%mm6, %%mm6 \n\t"
2008                "punpckhbw %%mm7, %%mm7 \n\t"
2009                MOVNTQ"    %%mm0,   (%0,%2,2)  \n\t"
2010                MOVNTQ"    %%mm1,  8(%0,%2,2)  \n\t"
2011                MOVNTQ"    %%mm2, 16(%0,%2,2)  \n\t"
2012                MOVNTQ"    %%mm3, 24(%0,%2,2)  \n\t"
2013                MOVNTQ"    %%mm4, 32(%0,%2,2)  \n\t"
2014                MOVNTQ"    %%mm5, 40(%0,%2,2)  \n\t"
2015                MOVNTQ"    %%mm6, 48(%0,%2,2)  \n\t"
2016                MOVNTQ"    %%mm7, 56(%0,%2,2)"
2017                :: "r"(d), "r"(s1), "r"(x)
2018                :"memory");
2019        }
2020        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2021    }
2022    for (y=0;y<h;y++) {
2023        const uint8_t* s2=src2+srcStride2*(y>>1);
2024        uint8_t* d=dst2+dstStride2*y;
2025        x=0;
2026        for (;x<w-31;x+=32) {
2027            __asm__ volatile(
2028                PREFETCH"   32(%1,%2)        \n\t"
2029                "movq         (%1,%2), %%mm0 \n\t"
2030                "movq        8(%1,%2), %%mm2 \n\t"
2031                "movq       16(%1,%2), %%mm4 \n\t"
2032                "movq       24(%1,%2), %%mm6 \n\t"
2033                "movq      %%mm0, %%mm1 \n\t"
2034                "movq      %%mm2, %%mm3 \n\t"
2035                "movq      %%mm4, %%mm5 \n\t"
2036                "movq      %%mm6, %%mm7 \n\t"
2037                "punpcklbw %%mm0, %%mm0 \n\t"
2038                "punpckhbw %%mm1, %%mm1 \n\t"
2039                "punpcklbw %%mm2, %%mm2 \n\t"
2040                "punpckhbw %%mm3, %%mm3 \n\t"
2041                "punpcklbw %%mm4, %%mm4 \n\t"
2042                "punpckhbw %%mm5, %%mm5 \n\t"
2043                "punpcklbw %%mm6, %%mm6 \n\t"
2044                "punpckhbw %%mm7, %%mm7 \n\t"
2045                MOVNTQ"    %%mm0,   (%0,%2,2)  \n\t"
2046                MOVNTQ"    %%mm1,  8(%0,%2,2)  \n\t"
2047                MOVNTQ"    %%mm2, 16(%0,%2,2)  \n\t"
2048                MOVNTQ"    %%mm3, 24(%0,%2,2)  \n\t"
2049                MOVNTQ"    %%mm4, 32(%0,%2,2)  \n\t"
2050                MOVNTQ"    %%mm5, 40(%0,%2,2)  \n\t"
2051                MOVNTQ"    %%mm6, 48(%0,%2,2)  \n\t"
2052                MOVNTQ"    %%mm7, 56(%0,%2,2)"
2053                :: "r"(d), "r"(s2), "r"(x)
2054                :"memory");
2055        }
2056        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2057    }
2058    __asm__(
2059            EMMS"       \n\t"
2060            SFENCE"     \n\t"
2061            ::: "memory"
2062        );
2063}
2064
2065static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2066                                        uint8_t *dst,
2067                                        int width, int height,
2068                                        int srcStride1, int srcStride2,
2069                                        int srcStride3, int dstStride)
2070{
2071    x86_reg x;
2072    int y,w,h;
2073    w=width/2; h=height;
2074    for (y=0;y<h;y++) {
2075        const uint8_t* yp=src1+srcStride1*y;
2076        const uint8_t* up=src2+srcStride2*(y>>2);
2077        const uint8_t* vp=src3+srcStride3*(y>>2);
2078        uint8_t* d=dst+dstStride*y;
2079        x=0;
2080        for (;x<w-7;x+=8) {
2081            __asm__ volatile(
2082                PREFETCH"   32(%1, %0)          \n\t"
2083                PREFETCH"   32(%2, %0)          \n\t"
2084                PREFETCH"   32(%3, %0)          \n\t"
2085                "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2086                "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
2087                "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
2088                "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2089                "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
2090                "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
2091                "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2092                "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2093                "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2094                "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2095
2096                "movq            %%mm1, %%mm6   \n\t"
2097                "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2098                "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2099                "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2100                MOVNTQ"          %%mm0,  (%4, %0, 8)    \n\t"
2101                MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"
2102
2103                "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2104                "movq     8(%1, %0, 4), %%mm0   \n\t"
2105                "movq            %%mm0, %%mm3   \n\t"
2106                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2107                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2108                MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
2109                MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"
2110
2111                "movq            %%mm4, %%mm6   \n\t"
2112                "movq    16(%1, %0, 4), %%mm0   \n\t"
2113                "movq            %%mm0, %%mm3   \n\t"
2114                "punpcklbw       %%mm5, %%mm4   \n\t"
2115                "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2116                "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2117                MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
2118                MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"
2119
2120                "punpckhbw       %%mm5, %%mm6   \n\t"
2121                "movq    24(%1, %0, 4), %%mm0   \n\t"
2122                "movq            %%mm0, %%mm3   \n\t"
2123                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2124                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2125                MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
2126                MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"
2127
2128                : "+r" (x)
2129                : "r"(yp), "r" (up), "r"(vp), "r"(d)
2130                :"memory");
2131        }
2132        for (; x<w; x++) {
2133            const int x2 = x<<2;
2134            d[8*x+0] = yp[x2];
2135            d[8*x+1] = up[x];
2136            d[8*x+2] = yp[x2+1];
2137            d[8*x+3] = vp[x];
2138            d[8*x+4] = yp[x2+2];
2139            d[8*x+5] = up[x];
2140            d[8*x+6] = yp[x2+3];
2141            d[8*x+7] = vp[x];
2142        }
2143    }
2144    __asm__(
2145            EMMS"       \n\t"
2146            SFENCE"     \n\t"
2147            ::: "memory"
2148        );
2149}
2150#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2151
2152static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2153{
2154    dst +=   count;
2155    src += 2*count;
2156    count= - count;
2157
2158    if(count <= -16) {
2159        count += 15;
2160        __asm__ volatile(
2161            "pcmpeqw       %%mm7, %%mm7        \n\t"
2162            "psrlw            $8, %%mm7        \n\t"
2163            "1:                                \n\t"
2164            "movq -30(%1, %0, 2), %%mm0        \n\t"
2165            "movq -22(%1, %0, 2), %%mm1        \n\t"
2166            "movq -14(%1, %0, 2), %%mm2        \n\t"
2167            "movq  -6(%1, %0, 2), %%mm3        \n\t"
2168            "pand          %%mm7, %%mm0        \n\t"
2169            "pand          %%mm7, %%mm1        \n\t"
2170            "pand          %%mm7, %%mm2        \n\t"
2171            "pand          %%mm7, %%mm3        \n\t"
2172            "packuswb      %%mm1, %%mm0        \n\t"
2173            "packuswb      %%mm3, %%mm2        \n\t"
2174            MOVNTQ"        %%mm0,-15(%2, %0)   \n\t"
2175            MOVNTQ"        %%mm2,- 7(%2, %0)   \n\t"
2176            "add             $16, %0           \n\t"
2177            " js 1b                            \n\t"
2178            : "+r"(count)
2179            : "r"(src), "r"(dst)
2180        );
2181        count -= 15;
2182    }
2183    while(count<0) {
2184        dst[count]= src[2*count];
2185        count++;
2186    }
2187}
2188
2189static void RENAME(extract_odd)(const uint8_t *src, uint8_t *dst, x86_reg count)
2190{
2191    src ++;
2192    dst +=   count;
2193    src += 2*count;
2194    count= - count;
2195
2196    if(count < -16) {
2197        count += 16;
2198        __asm__ volatile(
2199            "pcmpeqw       %%mm7, %%mm7        \n\t"
2200            "psrlw            $8, %%mm7        \n\t"
2201            "1:                                \n\t"
2202            "movq -32(%1, %0, 2), %%mm0        \n\t"
2203            "movq -24(%1, %0, 2), %%mm1        \n\t"
2204            "movq -16(%1, %0, 2), %%mm2        \n\t"
2205            "movq  -8(%1, %0, 2), %%mm3        \n\t"
2206            "pand          %%mm7, %%mm0        \n\t"
2207            "pand          %%mm7, %%mm1        \n\t"
2208            "pand          %%mm7, %%mm2        \n\t"
2209            "pand          %%mm7, %%mm3        \n\t"
2210            "packuswb      %%mm1, %%mm0        \n\t"
2211            "packuswb      %%mm3, %%mm2        \n\t"
2212            MOVNTQ"        %%mm0,-16(%2, %0)   \n\t"
2213            MOVNTQ"        %%mm2,- 8(%2, %0)   \n\t"
2214            "add             $16, %0           \n\t"
2215            " js 1b                            \n\t"
2216            : "+r"(count)
2217            : "r"(src), "r"(dst)
2218        );
2219        count -= 16;
2220    }
2221    while(count<0) {
2222        dst[count]= src[2*count];
2223        count++;
2224    }
2225}
2226
2227#if !COMPILE_TEMPLATE_AMD3DNOW
2228static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2229{
2230    dst0+=   count;
2231    dst1+=   count;
2232    src += 4*count;
2233    count= - count;
2234    if(count <= -8) {
2235        count += 7;
2236        __asm__ volatile(
2237            "pcmpeqw       %%mm7, %%mm7        \n\t"
2238            "psrlw            $8, %%mm7        \n\t"
2239            "1:                                \n\t"
2240            "movq -28(%1, %0, 4), %%mm0        \n\t"
2241            "movq -20(%1, %0, 4), %%mm1        \n\t"
2242            "movq -12(%1, %0, 4), %%mm2        \n\t"
2243            "movq  -4(%1, %0, 4), %%mm3        \n\t"
2244            "pand          %%mm7, %%mm0        \n\t"
2245            "pand          %%mm7, %%mm1        \n\t"
2246            "pand          %%mm7, %%mm2        \n\t"
2247            "pand          %%mm7, %%mm3        \n\t"
2248            "packuswb      %%mm1, %%mm0        \n\t"
2249            "packuswb      %%mm3, %%mm2        \n\t"
2250            "movq          %%mm0, %%mm1        \n\t"
2251            "movq          %%mm2, %%mm3        \n\t"
2252            "psrlw            $8, %%mm0        \n\t"
2253            "psrlw            $8, %%mm2        \n\t"
2254            "pand          %%mm7, %%mm1        \n\t"
2255            "pand          %%mm7, %%mm3        \n\t"
2256            "packuswb      %%mm2, %%mm0        \n\t"
2257            "packuswb      %%mm3, %%mm1        \n\t"
2258            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
2259            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
2260            "add              $8, %0           \n\t"
2261            " js 1b                            \n\t"
2262            : "+r"(count)
2263            : "r"(src), "r"(dst0), "r"(dst1)
2264        );
2265        count -= 7;
2266    }
2267    while(count<0) {
2268        dst0[count]= src[4*count+0];
2269        dst1[count]= src[4*count+2];
2270        count++;
2271    }
2272}
2273#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2274
2275static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2276{
2277    dst0 +=   count;
2278    dst1 +=   count;
2279    src0 += 4*count;
2280    src1 += 4*count;
2281    count= - count;
2282#ifdef PAVGB
2283    if(count <= -8) {
2284        count += 7;
2285        __asm__ volatile(
2286            "pcmpeqw        %%mm7, %%mm7        \n\t"
2287            "psrlw             $8, %%mm7        \n\t"
2288            "1:                                \n\t"
2289            "movq  -28(%1, %0, 4), %%mm0        \n\t"
2290            "movq  -20(%1, %0, 4), %%mm1        \n\t"
2291            "movq  -12(%1, %0, 4), %%mm2        \n\t"
2292            "movq   -4(%1, %0, 4), %%mm3        \n\t"
2293            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
2294            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
2295            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
2296            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
2297            "pand           %%mm7, %%mm0        \n\t"
2298            "pand           %%mm7, %%mm1        \n\t"
2299            "pand           %%mm7, %%mm2        \n\t"
2300            "pand           %%mm7, %%mm3        \n\t"
2301            "packuswb       %%mm1, %%mm0        \n\t"
2302            "packuswb       %%mm3, %%mm2        \n\t"
2303            "movq           %%mm0, %%mm1        \n\t"
2304            "movq           %%mm2, %%mm3        \n\t"
2305            "psrlw             $8, %%mm0        \n\t"
2306            "psrlw             $8, %%mm2        \n\t"
2307            "pand           %%mm7, %%mm1        \n\t"
2308            "pand           %%mm7, %%mm3        \n\t"
2309            "packuswb       %%mm2, %%mm0        \n\t"
2310            "packuswb       %%mm3, %%mm1        \n\t"
2311            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
2312            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
2313            "add               $8, %0           \n\t"
2314            " js 1b                            \n\t"
2315            : "+r"(count)
2316            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2317        );
2318        count -= 7;
2319    }
2320#endif
2321    while(count<0) {
2322        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2323        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2324        count++;
2325    }
2326}
2327
2328#if !COMPILE_TEMPLATE_AMD3DNOW
2329static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2330{
2331    dst0+=   count;
2332    dst1+=   count;
2333    src += 4*count;
2334    count= - count;
2335    if(count <= -8) {
2336        count += 7;
2337        __asm__ volatile(
2338            "pcmpeqw       %%mm7, %%mm7        \n\t"
2339            "psrlw            $8, %%mm7        \n\t"
2340            "1:                                \n\t"
2341            "movq -28(%1, %0, 4), %%mm0        \n\t"
2342            "movq -20(%1, %0, 4), %%mm1        \n\t"
2343            "movq -12(%1, %0, 4), %%mm2        \n\t"
2344            "movq  -4(%1, %0, 4), %%mm3        \n\t"
2345            "psrlw            $8, %%mm0        \n\t"
2346            "psrlw            $8, %%mm1        \n\t"
2347            "psrlw            $8, %%mm2        \n\t"
2348            "psrlw            $8, %%mm3        \n\t"
2349            "packuswb      %%mm1, %%mm0        \n\t"
2350            "packuswb      %%mm3, %%mm2        \n\t"
2351            "movq          %%mm0, %%mm1        \n\t"
2352            "movq          %%mm2, %%mm3        \n\t"
2353            "psrlw            $8, %%mm0        \n\t"
2354            "psrlw            $8, %%mm2        \n\t"
2355            "pand          %%mm7, %%mm1        \n\t"
2356            "pand          %%mm7, %%mm3        \n\t"
2357            "packuswb      %%mm2, %%mm0        \n\t"
2358            "packuswb      %%mm3, %%mm1        \n\t"
2359            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
2360            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
2361            "add              $8, %0           \n\t"
2362            " js 1b                            \n\t"
2363            : "+r"(count)
2364            : "r"(src), "r"(dst0), "r"(dst1)
2365        );
2366        count -= 7;
2367    }
2368    src++;
2369    while(count<0) {
2370        dst0[count]= src[4*count+0];
2371        dst1[count]= src[4*count+2];
2372        count++;
2373    }
2374}
2375#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2376
2377static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2378{
2379    dst0 +=   count;
2380    dst1 +=   count;
2381    src0 += 4*count;
2382    src1 += 4*count;
2383    count= - count;
2384#ifdef PAVGB
2385    if(count <= -8) {
2386        count += 7;
2387        __asm__ volatile(
2388            "pcmpeqw        %%mm7, %%mm7        \n\t"
2389            "psrlw             $8, %%mm7        \n\t"
2390            "1:                                \n\t"
2391            "movq  -28(%1, %0, 4), %%mm0        \n\t"
2392            "movq  -20(%1, %0, 4), %%mm1        \n\t"
2393            "movq  -12(%1, %0, 4), %%mm2        \n\t"
2394            "movq   -4(%1, %0, 4), %%mm3        \n\t"
2395            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
2396            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
2397            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
2398            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
2399            "psrlw             $8, %%mm0        \n\t"
2400            "psrlw             $8, %%mm1        \n\t"
2401            "psrlw             $8, %%mm2        \n\t"
2402            "psrlw             $8, %%mm3        \n\t"
2403            "packuswb       %%mm1, %%mm0        \n\t"
2404            "packuswb       %%mm3, %%mm2        \n\t"
2405            "movq           %%mm0, %%mm1        \n\t"
2406            "movq           %%mm2, %%mm3        \n\t"
2407            "psrlw             $8, %%mm0        \n\t"
2408            "psrlw             $8, %%mm2        \n\t"
2409            "pand           %%mm7, %%mm1        \n\t"
2410            "pand           %%mm7, %%mm3        \n\t"
2411            "packuswb       %%mm2, %%mm0        \n\t"
2412            "packuswb       %%mm3, %%mm1        \n\t"
2413            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
2414            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
2415            "add               $8, %0           \n\t"
2416            " js 1b                            \n\t"
2417            : "+r"(count)
2418            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2419        );
2420        count -= 7;
2421    }
2422#endif
2423    src0++;
2424    src1++;
2425    while(count<0) {
2426        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2427        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2428        count++;
2429    }
2430}
2431
2432static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2433                                 int width, int height,
2434                                 int lumStride, int chromStride, int srcStride)
2435{
2436    int y;
2437    const int chromWidth = FF_CEIL_RSHIFT(width, 1);
2438
2439    for (y=0; y<height; y++) {
2440        RENAME(extract_even)(src, ydst, width);
2441        if(y&1) {
2442            RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2443            udst+= chromStride;
2444            vdst+= chromStride;
2445        }
2446
2447        src += srcStride;
2448        ydst+= lumStride;
2449    }
2450    __asm__(
2451            EMMS"       \n\t"
2452            SFENCE"     \n\t"
2453            ::: "memory"
2454        );
2455}
2456
2457#if !COMPILE_TEMPLATE_AMD3DNOW
2458static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2459                                 int width, int height,
2460                                 int lumStride, int chromStride, int srcStride)
2461{
2462    int y;
2463    const int chromWidth = FF_CEIL_RSHIFT(width, 1);
2464
2465    for (y=0; y<height; y++) {
2466        RENAME(extract_even)(src, ydst, width);
2467        RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2468
2469        src += srcStride;
2470        ydst+= lumStride;
2471        udst+= chromStride;
2472        vdst+= chromStride;
2473    }
2474    __asm__(
2475            EMMS"       \n\t"
2476            SFENCE"     \n\t"
2477            ::: "memory"
2478        );
2479}
2480#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2481
2482static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2483                                 int width, int height,
2484                                 int lumStride, int chromStride, int srcStride)
2485{
2486    int y;
2487    const int chromWidth = FF_CEIL_RSHIFT(width, 1);
2488
2489    for (y=0; y<height; y++) {
2490        RENAME(extract_odd)(src, ydst, width);
2491        if(y&1) {
2492            RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2493            udst+= chromStride;
2494            vdst+= chromStride;
2495        }
2496
2497        src += srcStride;
2498        ydst+= lumStride;
2499    }
2500    __asm__(
2501            EMMS"       \n\t"
2502            SFENCE"     \n\t"
2503            ::: "memory"
2504        );
2505}
2506
2507#if !COMPILE_TEMPLATE_AMD3DNOW
2508static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2509                                 int width, int height,
2510                                 int lumStride, int chromStride, int srcStride)
2511{
2512    int y;
2513    const int chromWidth = FF_CEIL_RSHIFT(width, 1);
2514
2515    for (y=0; y<height; y++) {
2516        RENAME(extract_odd)(src, ydst, width);
2517        RENAME(extract_even2)(src, udst, vdst, chromWidth);
2518
2519        src += srcStride;
2520        ydst+= lumStride;
2521        udst+= chromStride;
2522        vdst+= chromStride;
2523    }
2524    __asm__(
2525            EMMS"       \n\t"
2526            SFENCE"     \n\t"
2527            ::: "memory"
2528        );
2529}
2530#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2531#endif /* !COMPILE_TEMPLATE_SSE2 */
2532
2533static av_cold void RENAME(rgb2rgb_init)(void)
2534{
2535#if !COMPILE_TEMPLATE_SSE2
2536#if !COMPILE_TEMPLATE_AMD3DNOW
2537    rgb15to16          = RENAME(rgb15to16);
2538    rgb15tobgr24       = RENAME(rgb15tobgr24);
2539    rgb15to32          = RENAME(rgb15to32);
2540    rgb16tobgr24       = RENAME(rgb16tobgr24);
2541    rgb16to32          = RENAME(rgb16to32);
2542    rgb16to15          = RENAME(rgb16to15);
2543    rgb24tobgr16       = RENAME(rgb24tobgr16);
2544    rgb24tobgr15       = RENAME(rgb24tobgr15);
2545    rgb24tobgr32       = RENAME(rgb24tobgr32);
2546    rgb32to16          = RENAME(rgb32to16);
2547    rgb32to15          = RENAME(rgb32to15);
2548    rgb32tobgr24       = RENAME(rgb32tobgr24);
2549    rgb24to15          = RENAME(rgb24to15);
2550    rgb24to16          = RENAME(rgb24to16);
2551    rgb24tobgr24       = RENAME(rgb24tobgr24);
2552    shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
2553    rgb32tobgr16       = RENAME(rgb32tobgr16);
2554    rgb32tobgr15       = RENAME(rgb32tobgr15);
2555    yv12toyuy2         = RENAME(yv12toyuy2);
2556    yv12touyvy         = RENAME(yv12touyvy);
2557    yuv422ptoyuy2      = RENAME(yuv422ptoyuy2);
2558    yuv422ptouyvy      = RENAME(yuv422ptouyvy);
2559    yuy2toyv12         = RENAME(yuy2toyv12);
2560    vu9_to_vu12        = RENAME(vu9_to_vu12);
2561    yvu9_to_yuy2       = RENAME(yvu9_to_yuy2);
2562    uyvytoyuv422       = RENAME(uyvytoyuv422);
2563    yuyvtoyuv422       = RENAME(yuyvtoyuv422);
2564#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2565
2566#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
2567    planar2x           = RENAME(planar2x);
2568#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
2569#if HAVE_7REGS
2570    ff_rgb24toyv12     = RENAME(rgb24toyv12);
2571#endif /* HAVE_7REGS */
2572
2573    yuyvtoyuv420       = RENAME(yuyvtoyuv420);
2574    uyvytoyuv420       = RENAME(uyvytoyuv420);
2575#endif /* !COMPILE_TEMPLATE_SSE2 */
2576
2577#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
2578    interleaveBytes    = RENAME(interleaveBytes);
2579#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
2580#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
2581#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
2582    deinterleaveBytes  = RENAME(deinterleaveBytes);
2583#endif
2584#endif
2585}
2586