1/*
2 * software RGB to RGB converter
3 * pluralize by software PAL8 to RGB converter
4 *              software YUV to YUV converter
5 *              software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
9 *
10 * This file is part of FFmpeg.
11 *
12 * FFmpeg is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 *
26 * The C code (not assembly, MMX, ...) of this file can be used
27 * under the LGPL license.
28 */
29
30#include <stddef.h>
31
/* Per-compilation-variant instruction selection.  This template is
   compiled several times with different HAVE_* settings; undef first so
   each inclusion starts clean. */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

/* SIMD register width in bytes: 16 (XMM) with SSE2, else 8 (MMX). */
#if HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Prefetch and packed byte-average instructions per CPU capability.
   The plain fallback turns prefetches into asm no-ops and leaves PAVGB
   undefined (it is only used on 3DNow!/MMX2 paths). */
#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Non-temporal store and the matching store fence exist only on MMX2;
   otherwise fall back to a normal movq and a no-op fence. */
#if HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
73
/*
 * Expand packed 24-bit pixels to 32-bit pixels with an opaque alpha
 * byte (255).  src_size is the source size in bytes; dst receives
 * src_size/3 * 4 bytes.
 * MMX path: converts 24 source bytes (8 pixels) to 32 output bytes per
 * iteration, OR-ing in the alpha channel from the external constant
 * mask32a.  The scalar loop finishes the remaining pixels.
 */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    #if HAVE_MMX
        const uint8_t *mm_end;
    #endif
    end = s + src_size;
    #if HAVE_MMX
        __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
        mm_end = end - 23; /* stop while a full 24-byte source chunk remains */
        __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
        while (s < mm_end)
        {
            /* Gather four pairs of 3-byte pixels into mm0..mm3 (one dword
               per pixel), set the alpha bytes via mm7, store 32 bytes. */
            __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "punpckldq    3%1, %%mm0    \n\t"
            "movd         6%1, %%mm1    \n\t"
            "punpckldq    9%1, %%mm1    \n\t"
            "movd        12%1, %%mm2    \n\t"
            "punpckldq   15%1, %%mm2    \n\t"
            "movd        18%1, %%mm3    \n\t"
            "punpckldq   21%1, %%mm3    \n\t"
            "por        %%mm7, %%mm0    \n\t"
            "por        %%mm7, %%mm1    \n\t"
            "por        %%mm7, %%mm2    \n\t"
            "por        %%mm7, %%mm3    \n\t"
            MOVNTQ"     %%mm0,   %0     \n\t"
            MOVNTQ"     %%mm1,  8%0     \n\t"
            MOVNTQ"     %%mm2, 16%0     \n\t"
            MOVNTQ"     %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
            dest += 32;
            s += 24;
        }
        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");
    #endif
    /* Scalar tail: per-pixel copy with the alpha byte placed according
       to host endianness. */
    while (s < end)
    {
    #ifdef WORDS_BIGENDIAN
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
        *dest++ = 255;
        *dest++ = s[2];
        *dest++ = s[1];
        *dest++ = s[0];
        s+=3;
    #else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
    #endif
    }
}
133
/*
 * Pack 32-bit pixels down to 24-bit pixels by dropping the alpha byte
 * of every 4-byte group (byte placement per the WORDS_BIGENDIAN
 * branches below).  dst receives src_size/4 * 3 bytes.
 * MMX path: converts 32 source bytes (8 pixels) to 24 output bytes per
 * iteration using the external mask24l/h/hh/hhh/hhhh merge constants.
 */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 31; /* stop while a full 32-byte source chunk remains */
    while (s < mm_end)
    {
        /* Squeeze the alpha byte out of each dword, then shift and mask
           the four quadwords so the 24-bit pixels pack back-to-back
           into three output quadwords. */
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq         8%1, %%mm1    \n\t"
        "movq        16%1, %%mm4    \n\t"
        "movq        24%1, %%mm5    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm1, %%mm3    \n\t"
        "movq       %%mm4, %%mm6    \n\t"
        "movq       %%mm5, %%mm7    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        "psrlq         $8, %%mm3    \n\t"
        "psrlq         $8, %%mm6    \n\t"
        "psrlq         $8, %%mm7    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm1    \n\t"
        "pand          %2, %%mm4    \n\t"
        "pand          %2, %%mm5    \n\t"
        "pand          %3, %%mm2    \n\t"
        "pand          %3, %%mm3    \n\t"
        "pand          %3, %%mm6    \n\t"
        "pand          %3, %%mm7    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "por        %%mm6, %%mm4    \n\t"
        "por        %%mm7, %%mm5    \n\t"

        "movq       %%mm1, %%mm2    \n\t"
        "movq       %%mm4, %%mm3    \n\t"
        "psllq        $48, %%mm2    \n\t"
        "psllq        $32, %%mm3    \n\t"
        "pand          %4, %%mm2    \n\t"
        "pand          %5, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psrlq        $16, %%mm1    \n\t"
        "psrlq        $32, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "pand          %6, %%mm5    \n\t"
        "por        %%mm5, %%mm4    \n\t"

        MOVNTQ"     %%mm0,   %0     \n\t"
        MOVNTQ"     %%mm1,  8%0     \n\t"
        MOVNTQ"     %%mm4, 16%0"
        :"=m"(*dest)
        :"m"(*s),"m"(mask24l),
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: copy three channel bytes, skip the alpha byte. */
    while (s < end)
    {
#ifdef WORDS_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
        dest[2] = *s++;
        dest[1] = *s++;
        dest[0] = *s++;
        dest += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
#endif
    }
}
219
220/*
221 original by Strepto/Astral
222 ported to gcc & bugfixed: A'rpi
223 MMX2, 3DNOW optimization by Nick Kurshev
224 32-bit C version, and and&add trick by Michael Niedermayer
225*/
/*
 * Convert 15-bit (x1:5:5:5) pixels to 16-bit (5:6:5) pixels.
 * The "and&add" trick: x + (x & 0x7FE0) doubles bits 5..14, i.e. shifts
 * the two upper 5-bit fields left by one, leaving the widened middle
 * field with a zero LSB while the low 5-bit field stays in place.
 * MMX path handles 16 bytes (8 pixels) per iteration with the external
 * constant mask15s; the C loops below do the same 4 bytes / 2 bytes at
 * a time for the remainder.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end)
    {
        __asm__ volatile(
        PREFETCH"  32%1         \n\t"
        "movq        %1, %%mm0  \n\t"
        "movq       8%1, %%mm2  \n\t"
        "movq     %%mm0, %%mm1  \n\t"
        "movq     %%mm2, %%mm3  \n\t"
        "pand     %%mm4, %%mm0  \n\t"
        "pand     %%mm4, %%mm2  \n\t"
        "paddw    %%mm1, %%mm0  \n\t"
        "paddw    %%mm3, %%mm2  \n\t"
        MOVNTQ"   %%mm0,  %0    \n\t"
        MOVNTQ"   %%mm2, 8%0"
        :"=m"(*d)
        :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* 32-bit C loop: two pixels at a time (mm_end is reused here). */
    mm_end = end - 3;
    while (s < mm_end)
    {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* At most one 16-bit pixel can remain. */
    if (s < end)
    {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
274
/*
 * Convert 16-bit (5:6:5) pixels to 15-bit (x1:5:5:5) pixels: the two
 * upper fields are shifted right by one (dropping the middle field's
 * LSB) while the low 5-bit field is kept in place.
 * MMX path handles 16 bytes (8 pixels) per iteration with the external
 * constants mask15rg (upper fields after shift) and mask15b (low
 * field); the C loops below finish the remainder.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end)
    {
        __asm__ volatile(
        PREFETCH"  32%1         \n\t"
        "movq        %1, %%mm0  \n\t"
        "movq       8%1, %%mm2  \n\t"
        "movq     %%mm0, %%mm1  \n\t"
        "movq     %%mm2, %%mm3  \n\t"
        "psrlq       $1, %%mm0  \n\t"
        "psrlq       $1, %%mm2  \n\t"
        "pand     %%mm7, %%mm0  \n\t"
        "pand     %%mm7, %%mm2  \n\t"
        "pand     %%mm6, %%mm1  \n\t"
        "pand     %%mm6, %%mm3  \n\t"
        "por      %%mm1, %%mm0  \n\t"
        "por      %%mm3, %%mm2  \n\t"
        MOVNTQ"   %%mm0,  %0    \n\t"
        MOVNTQ"   %%mm2, 8%0"
        :"=m"(*d)
        :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* 32-bit C loop: two pixels at a time (mm_end is reused here). */
    mm_end = end - 3;
    while (s < mm_end)
    {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* At most one 16-bit pixel can remain. */
    if (s < end)
    {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
        s+=2;
        d+=2;
    }
}
330
/*
 * Convert 32-bit pixels to 16-bit 5:6:5 pixels (channel placement as in
 * the scalar expression at the bottom: bits 0-7 -> low 5 bits, bits
 * 10-15 -> middle 6 bits, bits 19-23 -> top 5 bits).
 * Two MMX variants exist: the active one (#if 1) folds the scale/shift
 * of two channels into a single pmaddwd using the external constants
 * mask3216g, mask3216br and mul3216; the disabled one is a plain
 * shift-and-mask version.  Both process 16 source bytes (4 pixels) per
 * iteration; the scalar loop finishes the remainder.
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    /* Self-contained asm loop: d and s advance in registers ("+r"). */
    __asm__ volatile(
    "movq           %3, %%mm5   \n\t"
    "movq           %4, %%mm6   \n\t"
    "movq           %5, %%mm7   \n\t"
    "jmp 2f                     \n\t"
    ASMALIGN(4)
    "1:                         \n\t"
    PREFETCH"   32(%1)          \n\t"
    "movd         (%1), %%mm0   \n\t"
    "movd        4(%1), %%mm3   \n\t"
    "punpckldq   8(%1), %%mm0   \n\t"
    "punpckldq  12(%1), %%mm3   \n\t"
    "movq        %%mm0, %%mm1   \n\t"
    "movq        %%mm3, %%mm4   \n\t"
    "pand        %%mm6, %%mm0   \n\t"
    "pand        %%mm6, %%mm3   \n\t"
    "pmaddwd     %%mm7, %%mm0   \n\t"
    "pmaddwd     %%mm7, %%mm3   \n\t"
    "pand        %%mm5, %%mm1   \n\t"
    "pand        %%mm5, %%mm4   \n\t"
    "por         %%mm1, %%mm0   \n\t"
    "por         %%mm4, %%mm3   \n\t"
    "psrld          $5, %%mm0   \n\t"
    "pslld         $11, %%mm3   \n\t"
    "por         %%mm3, %%mm0   \n\t"
    MOVNTQ"      %%mm0, (%0)    \n\t"
    "add           $16,  %1     \n\t"
    "add            $8,  %0     \n\t"
    "2:                         \n\t"
    "cmp            %2,  %1     \n\t"
    " jb            1b          \n\t"
    : "+r" (d), "+r"(s)
    : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq    %0, %%mm7    \n\t"
        "movq    %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end)
    {
        /* Shift each channel into place and mask, then interleave the
           two pixel pairs into one output quadword. */
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movd          %1, %%mm0    \n\t"
        "movd         4%1, %%mm3    \n\t"
        "punpckldq    8%1, %%mm0    \n\t"
        "punpckldq   12%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "psrlq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm3    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm3    \n\t"
        "psrlq         $5, %%mm1    \n\t"
        "psrlq         $5, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        "psrlq         $8, %%mm5    \n\t"
        "pand       %%mm7, %%mm2    \n\t"
        "pand       %%mm7, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: one pixel per iteration. */
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
428
/*
 * Convert 32-bit pixels to 16-bit 5:6:5 pixels with the two outer
 * channels swapped relative to rgb32to16 (see the scalar expression:
 * bits 0-7 -> top 5 bits, bits 19-23 -> low 5 bits).
 * MMX path processes 16 source bytes (4 pixels) per iteration using the
 * external red_16mask/green_16mask/blue_16mask constants; the scalar
 * loop finishes the remainder.
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        /* Shift/mask each channel into its 5:6:5 slot, then merge the
           two pixel pairs into one output quadword. */
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movd          %1, %%mm0    \n\t"
        "movd         4%1, %%mm3    \n\t"
        "punpckldq    8%1, %%mm0    \n\t"
        "punpckldq   12%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "psllq         $8, %%mm0    \n\t"
        "psllq         $8, %%mm3    \n\t"
        "pand       %%mm7, %%mm0    \n\t"
        "pand       %%mm7, %%mm3    \n\t"
        "psrlq         $5, %%mm1    \n\t"
        "psrlq         $5, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq        $19, %%mm2    \n\t"
        "psrlq        $19, %%mm5    \n\t"
        "pand          %2, %%mm2    \n\t"
        "pand          %2, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: one pixel per iteration. */
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}
489
/*
 * Convert 32-bit pixels to 15-bit x1:5:5:5 pixels (channel placement as
 * in the scalar expression at the bottom).
 * Mirrors rgb32to16: the active MMX variant (#if 1) uses pmaddwd with
 * the external constants mask3215g, mask3216br and mul3215 (note the
 * different shifts: psrld $6 / pslld $10 for 5-bit green); the disabled
 * variant is a plain shift-and-mask version using red_15mask/
 * green_15mask/blue_15mask.  Both process 16 source bytes (4 pixels)
 * per iteration; the scalar loop finishes the remainder.
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    /* Self-contained asm loop: d and s advance in registers ("+r"). */
    __asm__ volatile(
    "movq           %3, %%mm5   \n\t"
    "movq           %4, %%mm6   \n\t"
    "movq           %5, %%mm7   \n\t"
    "jmp            2f          \n\t"
    ASMALIGN(4)
    "1:                         \n\t"
    PREFETCH"   32(%1)          \n\t"
    "movd         (%1), %%mm0   \n\t"
    "movd        4(%1), %%mm3   \n\t"
    "punpckldq   8(%1), %%mm0   \n\t"
    "punpckldq  12(%1), %%mm3   \n\t"
    "movq        %%mm0, %%mm1   \n\t"
    "movq        %%mm3, %%mm4   \n\t"
    "pand        %%mm6, %%mm0   \n\t"
    "pand        %%mm6, %%mm3   \n\t"
    "pmaddwd     %%mm7, %%mm0   \n\t"
    "pmaddwd     %%mm7, %%mm3   \n\t"
    "pand        %%mm5, %%mm1   \n\t"
    "pand        %%mm5, %%mm4   \n\t"
    "por         %%mm1, %%mm0   \n\t"
    "por         %%mm4, %%mm3   \n\t"
    "psrld          $6, %%mm0   \n\t"
    "pslld         $10, %%mm3   \n\t"
    "por         %%mm3, %%mm0   \n\t"
    MOVNTQ"      %%mm0, (%0)    \n\t"
    "add           $16,  %1     \n\t"
    "add            $8,  %0     \n\t"
    "2:                         \n\t"
    "cmp            %2,  %1     \n\t"
    " jb            1b          \n\t"
    : "+r" (d), "+r"(s)
    : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end)
    {
        /* Shift/mask each channel into its x1:5:5:5 slot, then merge
           the two pixel pairs into one output quadword. */
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movd          %1, %%mm0    \n\t"
        "movd         4%1, %%mm3    \n\t"
        "punpckldq    8%1, %%mm0    \n\t"
        "punpckldq   12%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "psrlq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm3    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm3    \n\t"
        "psrlq         $6, %%mm1    \n\t"
        "psrlq         $6, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq         $9, %%mm2    \n\t"
        "psrlq         $9, %%mm5    \n\t"
        "pand       %%mm7, %%mm2    \n\t"
        "pand       %%mm7, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: one pixel per iteration. */
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}
587
/*
 * Convert 32-bit pixels to 15-bit x1:5:5:5 pixels with the two outer
 * channels swapped relative to rgb32to15 (see the scalar expression:
 * bits 0-7 -> top 5 bits, bits 19-23 -> low 5 bits).
 * MMX path processes 16 source bytes (4 pixels) per iteration using the
 * external red_15mask/green_15mask/blue_15mask constants; the scalar
 * loop finishes the remainder.
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        /* Shift/mask each channel into its x1:5:5:5 slot, then merge
           the two pixel pairs into one output quadword. */
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movd          %1, %%mm0    \n\t"
        "movd         4%1, %%mm3    \n\t"
        "punpckldq    8%1, %%mm0    \n\t"
        "punpckldq   12%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "psllq         $7, %%mm0    \n\t"
        "psllq         $7, %%mm3    \n\t"
        "pand       %%mm7, %%mm0    \n\t"
        "pand       %%mm7, %%mm3    \n\t"
        "psrlq         $6, %%mm1    \n\t"
        "psrlq         $6, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq        $19, %%mm2    \n\t"
        "psrlq        $19, %%mm5    \n\t"
        "pand          %2, %%mm2    \n\t"
        "pand          %2, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: one pixel per iteration. */
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}
648
/*
 * Convert packed 24-bit pixels to 16-bit 5:6:5 pixels; per the scalar
 * tail, the first source byte lands in the low 5 bits and the third in
 * the top 5 bits.
 * MMX path processes 12 source bytes (4 pixels) per iteration (loads
 * overlap into the following pixel, hence the end - 11 guard) using the
 * external red_16mask/green_16mask/blue_16mask constants.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end)
    {
        /* Gather four 3-byte pixels (one per dword), shift/mask each
           channel into its 5:6:5 slot and pack into one quadword. */
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movd          %1, %%mm0    \n\t"
        "movd         3%1, %%mm3    \n\t"
        "punpckldq    6%1, %%mm0    \n\t"
        "punpckldq    9%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "psrlq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm3    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm3    \n\t"
        "psrlq         $5, %%mm1    \n\t"
        "psrlq         $5, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        "psrlq         $8, %%mm5    \n\t"
        "pand       %%mm7, %%mm2    \n\t"
        "pand       %%mm7, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: one 3-byte pixel per iteration. */
    while (s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
711
/*
 * Convert packed 24-bit pixels to 16-bit 5:6:5 pixels with the outer
 * channels swapped relative to rgb24tobgr16; per the scalar tail, the
 * first source byte lands in the top 5 bits and the third in the low 5
 * bits.
 * MMX path processes 12 source bytes (4 pixels) per iteration using the
 * external red_16mask/green_16mask/blue_16mask constants (note the more
 * conservative end - 15 guard here).
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        /* Gather four 3-byte pixels (one per dword), shift/mask each
           channel into its 5:6:5 slot and pack into one quadword. */
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movd          %1, %%mm0    \n\t"
        "movd         3%1, %%mm3    \n\t"
        "punpckldq    6%1, %%mm0    \n\t"
        "punpckldq    9%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "psllq         $8, %%mm0    \n\t"
        "psllq         $8, %%mm3    \n\t"
        "pand       %%mm7, %%mm0    \n\t"
        "pand       %%mm7, %%mm3    \n\t"
        "psrlq         $5, %%mm1    \n\t"
        "psrlq         $5, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq        $19, %%mm2    \n\t"
        "psrlq        $19, %%mm5    \n\t"
        "pand          %2, %%mm2    \n\t"
        "pand          %2, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: one 3-byte pixel per iteration. */
    while (s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
774
/*
 * Convert packed 24-bit pixels to 15-bit x1:5:5:5 pixels; per the
 * scalar tail, the first source byte lands in the low 5 bits and the
 * third in the top 5 bits.
 * MMX path processes 12 source bytes (4 pixels) per iteration (loads
 * overlap into the following pixel, hence the end - 11 guard) using the
 * external red_15mask/green_15mask/blue_15mask constants.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end)
    {
        /* Gather four 3-byte pixels (one per dword), shift/mask each
           channel into its x1:5:5:5 slot and pack into one quadword. */
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movd          %1, %%mm0    \n\t"
        "movd         3%1, %%mm3    \n\t"
        "punpckldq    6%1, %%mm0    \n\t"
        "punpckldq    9%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "psrlq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm3    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm3    \n\t"
        "psrlq         $6, %%mm1    \n\t"
        "psrlq         $6, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq         $9, %%mm2    \n\t"
        "psrlq         $9, %%mm5    \n\t"
        "pand       %%mm7, %%mm2    \n\t"
        "pand       %%mm7, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: one 3-byte pixel per iteration. */
    while (s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
837
/*
 * Convert packed 24-bit pixels to 15-bit x1:5:5:5 pixels with the outer
 * channels swapped relative to rgb24tobgr15; per the scalar tail, the
 * first source byte lands in the top 5 bits and the third in the low 5
 * bits.
 * MMX path processes 12 source bytes (4 pixels) per iteration using the
 * external red_15mask/green_15mask/blue_15mask constants (note the more
 * conservative end - 15 guard here).
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        /* Gather four 3-byte pixels (one per dword), shift/mask each
           channel into its x1:5:5:5 slot and pack into one quadword. */
        __asm__ volatile(
        PREFETCH"   32%1            \n\t"
        "movd         %1, %%mm0     \n\t"
        "movd        3%1, %%mm3     \n\t"
        "punpckldq   6%1, %%mm0     \n\t"
        "punpckldq   9%1, %%mm3     \n\t"
        "movq      %%mm0, %%mm1     \n\t"
        "movq      %%mm0, %%mm2     \n\t"
        "movq      %%mm3, %%mm4     \n\t"
        "movq      %%mm3, %%mm5     \n\t"
        "psllq        $7, %%mm0     \n\t"
        "psllq        $7, %%mm3     \n\t"
        "pand      %%mm7, %%mm0     \n\t"
        "pand      %%mm7, %%mm3     \n\t"
        "psrlq        $6, %%mm1     \n\t"
        "psrlq        $6, %%mm4     \n\t"
        "pand      %%mm6, %%mm1     \n\t"
        "pand      %%mm6, %%mm4     \n\t"
        "psrlq       $19, %%mm2     \n\t"
        "psrlq       $19, %%mm5     \n\t"
        "pand         %2, %%mm2     \n\t"
        "pand         %2, %%mm5     \n\t"
        "por       %%mm1, %%mm0     \n\t"
        "por       %%mm4, %%mm3     \n\t"
        "por       %%mm2, %%mm0     \n\t"
        "por       %%mm5, %%mm3     \n\t"
        "psllq       $16, %%mm3     \n\t"
        "por       %%mm3, %%mm0     \n\t"
        MOVNTQ"    %%mm0, %0        \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: one 3-byte pixel per iteration. */
    while (s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
900
901/*
902  I use less accurate approximation here by simply left-shifting the input
903  value and filling the low order bits with zeroes. This method improves PNG
904  compression but this scheme cannot reproduce white exactly, since it does
905  not generate an all-ones maximum value; the net effect is to darken the
906  image slightly.
907
908  The better method should be "left bit replication":
909
910   4 3 2 1 0
911   ---------
912   1 1 0 1 1
913
914   7 6 5 4 3  2 1 0
915   ----------------
916   1 1 0 1 1  1 1 0
917   |=======|  |===|
918       |      leftmost bits repeated to fill open bits
919       |
920   original bits
921*/
/*
 * Convert packed 15-bit pixels (three 5-bit fields per 16-bit word:
 * bits 0-4, 5-9, 10-14) to 24 bits per pixel, lowest field emitted first.
 * Each 5-bit field is scaled to 8 bits by a left shift of 3 with the low
 * bits zero-filled (see the note above about left bit replication).
 * src_size is the source size in bytes.
 */
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 7;
    /* MMX main loop: 8 input pixels (16 bytes) -> 24 output bytes per
       iteration.  The first asm block expands each pixel into a 32-bit
       lane (low field, mid field, high field, 0); the second packs those
       lanes down to 24bpp.  Results of block 1 are carried to block 2 in
       mm0/mm3/mm6/mm7 — the two asm statements are not independent. */
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq          %1, %%mm1    \n\t"
        "movq          %1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $2, %%mm1    \n\t"
        "psrlq         $7, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd     %5, %%mm0    \n\t"
        "punpcklwd     %5, %%mm1    \n\t"
        "punpcklwd     %5, %%mm2    \n\t"
        "punpckhwd     %5, %%mm3    \n\t"
        "punpckhwd     %5, %%mm4    \n\t"
        "punpckhwd     %5, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"

        "movq       %%mm0, %%mm6    \n\t"
        "movq       %%mm3, %%mm7    \n\t"

        "movq         8%1, %%mm0    \n\t"
        "movq         8%1, %%mm1    \n\t"
        "movq         8%1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $2, %%mm1    \n\t"
        "psrlq         $7, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd     %5, %%mm0    \n\t"
        "punpcklwd     %5, %%mm1    \n\t"
        "punpcklwd     %5, %%mm2    \n\t"
        "punpckhwd     %5, %%mm3    \n\t"
        "punpckhwd     %5, %%mm4    \n\t"
        "punpckhwd     %5, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"

        :"=m"(*d)
        :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
        :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
        "movq       %%mm0, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "movq       %%mm6, %%mm0    \n\t"
        "movq       %%mm7, %%mm1    \n\t"

        "movq       %%mm4, %%mm6    \n\t"
        "movq       %%mm5, %%mm7    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm1, %%mm3    \n\t"

        "psrlq         $8, %%mm2    \n\t"
        "psrlq         $8, %%mm3    \n\t"
        "psrlq         $8, %%mm6    \n\t"
        "psrlq         $8, %%mm7    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm1    \n\t"
        "pand          %2, %%mm4    \n\t"
        "pand          %2, %%mm5    \n\t"
        "pand          %3, %%mm2    \n\t"
        "pand          %3, %%mm3    \n\t"
        "pand          %3, %%mm6    \n\t"
        "pand          %3, %%mm7    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "por        %%mm6, %%mm4    \n\t"
        "por        %%mm7, %%mm5    \n\t"

        "movq       %%mm1, %%mm2    \n\t"
        "movq       %%mm4, %%mm3    \n\t"
        "psllq        $48, %%mm2    \n\t"
        "psllq        $32, %%mm3    \n\t"
        "pand          %4, %%mm2    \n\t"
        "pand          %5, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psrlq        $16, %%mm1    \n\t"
        "psrlq        $32, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "pand          %6, %%mm5    \n\t"
        "por        %%mm5, %%mm4    \n\t"

        MOVNTQ"     %%mm0,   %0     \n\t"
        MOVNTQ"     %%mm1,  8%0     \n\t"
        MOVNTQ"     %%mm4, 16%0"

        :"=m"(*d)
        :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: remaining pixels (all of them when MMX is unavailable).
       Byte order matches the MMX path: low field, mid field, high field. */
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
1063
/*
 * Convert packed 16-bit pixels (5:6:5 fields per 16-bit word: bits 0-4,
 * 5-10, 11-15) to 24 bits per pixel, lowest field emitted first.  Fields
 * are scaled to 8 bits by left-shifting with low bits zero-filled.
 * src_size is the source size in bytes.
 */
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 7;
    /* MMX main loop: 8 input pixels (16 bytes) -> 24 output bytes per
       iteration.  Same structure as rgb15tobgr24 except for the 5:6:5
       masks and shift counts.  The first asm block expands pixels to
       32-bit lanes; the second packs them to 24bpp.  mm0/mm3/mm6/mm7
       carry state between the two asm statements. */
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq          %1, %%mm1    \n\t"
        "movq          %1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm1    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd     %5, %%mm0    \n\t"
        "punpcklwd     %5, %%mm1    \n\t"
        "punpcklwd     %5, %%mm2    \n\t"
        "punpckhwd     %5, %%mm3    \n\t"
        "punpckhwd     %5, %%mm4    \n\t"
        "punpckhwd     %5, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"

        "movq       %%mm0, %%mm6    \n\t"
        "movq       %%mm3, %%mm7    \n\t"

        "movq         8%1, %%mm0    \n\t"
        "movq         8%1, %%mm1    \n\t"
        "movq         8%1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm1    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd     %5, %%mm0    \n\t"
        "punpcklwd     %5, %%mm1    \n\t"
        "punpcklwd     %5, %%mm2    \n\t"
        "punpckhwd     %5, %%mm3    \n\t"
        "punpckhwd     %5, %%mm4    \n\t"
        "punpckhwd     %5, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        :"=m"(*d)
        :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
        :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
        "movq       %%mm0, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "movq       %%mm6, %%mm0    \n\t"
        "movq       %%mm7, %%mm1    \n\t"

        "movq       %%mm4, %%mm6    \n\t"
        "movq       %%mm5, %%mm7    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm1, %%mm3    \n\t"

        "psrlq         $8, %%mm2    \n\t"
        "psrlq         $8, %%mm3    \n\t"
        "psrlq         $8, %%mm6    \n\t"
        "psrlq         $8, %%mm7    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm1    \n\t"
        "pand          %2, %%mm4    \n\t"
        "pand          %2, %%mm5    \n\t"
        "pand          %3, %%mm2    \n\t"
        "pand          %3, %%mm3    \n\t"
        "pand          %3, %%mm6    \n\t"
        "pand          %3, %%mm7    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "por        %%mm6, %%mm4    \n\t"
        "por        %%mm7, %%mm5    \n\t"

        "movq       %%mm1, %%mm2    \n\t"
        "movq       %%mm4, %%mm3    \n\t"
        "psllq        $48, %%mm2    \n\t"
        "psllq        $32, %%mm3    \n\t"
        "pand          %4, %%mm2    \n\t"
        "pand          %5, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psrlq        $16, %%mm1    \n\t"
        "psrlq        $32, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "pand          %6, %%mm5    \n\t"
        "por        %%mm5, %%mm4    \n\t"

        MOVNTQ"     %%mm0,   %0     \n\t"
        MOVNTQ"     %%mm1,  8%0     \n\t"
        MOVNTQ"     %%mm4, 16%0"

        :"=m"(*d)
        :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: remaining pixels (all of them when MMX is unavailable). */
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
1204
/*
 * Convert packed 15-bit pixels (5:5:5 in 16-bit words) to 32 bits per
 * pixel: low field, mid field, high field, then a filler byte.
 * src_size is the source size in bytes.
 *
 * NOTE(review): the MMX path leaves the 4th byte of each pixel as 0
 * (mm7 is zeroed and unpacked in), while the scalar tail below writes
 * 255 — the two paths disagree on the filler/alpha byte.  Confirm which
 * is intended before relying on that byte.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* mm7 = 0, used as the zero half of every punpck below. */
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    mm_end = end - 3;
    /* MMX main loop: 4 input pixels (8 bytes) -> 16 output bytes. */
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq          %1, %%mm1    \n\t"
        "movq          %1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $2, %%mm1    \n\t"
        "psrlq         $7, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd  %%mm7, %%mm0    \n\t"
        "punpcklwd  %%mm7, %%mm1    \n\t"
        "punpcklwd  %%mm7, %%mm2    \n\t"
        "punpckhwd  %%mm7, %%mm3    \n\t"
        "punpckhwd  %%mm7, %%mm4    \n\t"
        "punpckhwd  %%mm7, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        MOVNTQ"     %%mm0,  %0      \n\t"
        MOVNTQ"     %%mm3, 8%0      \n\t"
        :"=m"(*d)
        :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
        :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: remaining pixels (all of them when MMX is unavailable). */
    while (s < end)
    {
#if 0 //slightly slower on Athlon
        int bgr= *s++;
        *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
        register uint16_t bgr;
        bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
#endif

#endif
    }
}
1282
/*
 * Convert packed 16-bit pixels (5:6:5 in 16-bit words) to 32 bits per
 * pixel: low field, mid field, high field, then a filler byte.
 * src_size is the source size in bytes.
 *
 * NOTE(review): as in rgb15to32, the MMX path leaves the 4th byte of
 * each pixel as 0 while the scalar tail writes 255 — confirm which is
 * intended before relying on that byte.
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* mm7 = 0, used as the zero half of every punpck below. */
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    mm_end = end - 3;
    /* MMX main loop: 4 input pixels (8 bytes) -> 16 output bytes. */
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq          %1, %%mm1    \n\t"
        "movq          %1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm1    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd  %%mm7, %%mm0    \n\t"
        "punpcklwd  %%mm7, %%mm1    \n\t"
        "punpcklwd  %%mm7, %%mm2    \n\t"
        "punpckhwd  %%mm7, %%mm3    \n\t"
        "punpckhwd  %%mm7, %%mm4    \n\t"
        "punpckhwd  %%mm7, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        MOVNTQ"     %%mm3, 8%0      \n\t"
        :"=m"(*d)
        :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
        :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: remaining pixels (all of them when MMX is unavailable). */
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
#endif
    }
}
1354
/*
 * Swap bytes 0 and 2 of every 32-bit pixel (R<->B); bytes 1 and 3 pass
 * through unchanged.  src_size is in bytes.
 *
 * The index is biased negative (idx = 15 - src_size) so the MMX loop can
 * count upward in 16-byte steps and stop on sign ("js"); the scalar loop
 * then walks idx up to 15 in 4-byte steps to cover the tail.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    long idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
#if HAVE_MMX
    __asm__ volatile(
    "test          %0, %0           \n\t"
    "jns           2f               \n\t"
    PREFETCH"       (%1, %0)        \n\t"
    "movq          %3, %%mm7        \n\t"
    "pxor          %4, %%mm7        \n\t"
    "movq       %%mm7, %%mm6        \n\t"
    "pxor          %5, %%mm7        \n\t"
    ASMALIGN(4)
    "1:                             \n\t"
    PREFETCH"     32(%1, %0)        \n\t"
    "movq           (%1, %0), %%mm0 \n\t"
    "movq          8(%1, %0), %%mm1 \n\t"
# if HAVE_MMX2
    "pshufw      $177, %%mm0, %%mm3 \n\t"
    "pshufw      $177, %%mm1, %%mm5 \n\t"
    "pand       %%mm7, %%mm0        \n\t"
    "pand       %%mm6, %%mm3        \n\t"
    "pand       %%mm7, %%mm1        \n\t"
    "pand       %%mm6, %%mm5        \n\t"
    "por        %%mm3, %%mm0        \n\t"
    "por        %%mm5, %%mm1        \n\t"
# else
    "movq       %%mm0, %%mm2        \n\t"
    "movq       %%mm1, %%mm4        \n\t"
    "pand       %%mm7, %%mm0        \n\t"
    "pand       %%mm6, %%mm2        \n\t"
    "pand       %%mm7, %%mm1        \n\t"
    "pand       %%mm6, %%mm4        \n\t"
    "movq       %%mm2, %%mm3        \n\t"
    "movq       %%mm4, %%mm5        \n\t"
    "pslld        $16, %%mm2        \n\t"
    "psrld        $16, %%mm3        \n\t"
    "pslld        $16, %%mm4        \n\t"
    "psrld        $16, %%mm5        \n\t"
    "por        %%mm2, %%mm0        \n\t"
    "por        %%mm4, %%mm1        \n\t"
    "por        %%mm3, %%mm0        \n\t"
    "por        %%mm5, %%mm1        \n\t"
# endif
    MOVNTQ"     %%mm0,  (%2, %0)    \n\t"
    MOVNTQ"     %%mm1, 8(%2, %0)    \n\t"
    "add          $16, %0           \n\t"
    "js            1b               \n\t"
    SFENCE"                         \n\t"
    EMMS"                           \n\t"
    "2:                             \n\t"
    : "+&r"(idx)
    : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
    : "memory");
#endif
    /* Scalar tail: keep the middle/top bytes (g), swap the two masked
       bytes via 16-bit shifts. */
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}
1418
/*
 * Swap the first and third byte of every 24-bit pixel (R<->B); the middle
 * byte passes through.  src_size is in bytes.
 *
 * The MMX path uses the same negative-bias counter trick as rgb32tobgr32
 * (mmx_size = 23 - src_size, counting up by 24 until non-negative); the
 * scalar loop below handles whatever the asm loop did not cover.
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    unsigned i;
#if HAVE_MMX
    long mmx_size= 23 - src_size;
    __asm__ volatile (
    "test             %%"REG_a", %%"REG_a"          \n\t"
    "jns                     2f                     \n\t"
    "movq     "MANGLE(mask24r)", %%mm5              \n\t"
    "movq     "MANGLE(mask24g)", %%mm6              \n\t"
    "movq     "MANGLE(mask24b)", %%mm7              \n\t"
    ASMALIGN(4)
    "1:                                             \n\t"
    PREFETCH" 32(%1, %%"REG_a")                     \n\t"
    "movq       (%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
    "movq       (%1, %%"REG_a"), %%mm1              \n\t" // BGR BGR BG
    "movq      2(%1, %%"REG_a"), %%mm2              \n\t" // R BGR BGR B
    "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
    "pand                 %%mm5, %%mm0              \n\t"
    "pand                 %%mm6, %%mm1              \n\t"
    "pand                 %%mm7, %%mm2              \n\t"
    "por                  %%mm0, %%mm1              \n\t"
    "por                  %%mm2, %%mm1              \n\t"
    "movq      6(%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
    MOVNTQ"               %%mm1,   (%2, %%"REG_a")  \n\t" // RGB RGB RG
    "movq      8(%1, %%"REG_a"), %%mm1              \n\t" // R BGR BGR B
    "movq     10(%1, %%"REG_a"), %%mm2              \n\t" // GR BGR BGR
    "pand                 %%mm7, %%mm0              \n\t"
    "pand                 %%mm5, %%mm1              \n\t"
    "pand                 %%mm6, %%mm2              \n\t"
    "por                  %%mm0, %%mm1              \n\t"
    "por                  %%mm2, %%mm1              \n\t"
    "movq     14(%1, %%"REG_a"), %%mm0              \n\t" // R BGR BGR B
    MOVNTQ"               %%mm1,  8(%2, %%"REG_a")  \n\t" // B RGB RGB R
    "movq     16(%1, %%"REG_a"), %%mm1              \n\t" // GR BGR BGR
    "movq     18(%1, %%"REG_a"), %%mm2              \n\t" // BGR BGR BG
    "pand                 %%mm6, %%mm0              \n\t"
    "pand                 %%mm7, %%mm1              \n\t"
    "pand                 %%mm5, %%mm2              \n\t"
    "por                  %%mm0, %%mm1              \n\t"
    "por                  %%mm2, %%mm1              \n\t"
    MOVNTQ"               %%mm1, 16(%2, %%"REG_a")  \n\t"
    "add                    $24, %%"REG_a"          \n\t"
    " js                     1b                     \n\t"
    "2:                                             \n\t"
    : "+a" (mmx_size)
    : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finished, was multiple of 8

    /* Rewind src/dst to the start of the unprocessed tail. */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    /* Scalar path: byte-swap each 3-byte pixel in place order. */
    for (i=0; i<src_size; i+=3)
    {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
1488
/*
 * Interleave planar Y, U, V into packed YUY2 (byte order Y U Y V per
 * pixel pair, per the little-endian scalar path below), one output line
 * per luma line.  vertLumPerChroma is the number of luma lines sharing
 * one chroma line (2 for YV12, 1 for 4:2:2 planar); the advance test
 * uses (y & (vertLumPerChroma-1)), so it must be a power of two.
 * width is in luma pixels; each line carries width/2 U and V samples.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const long chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#if HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        /* MMX: process 8 chroma samples (16 luma pixels, 32 output bytes)
           per iteration. */
        __asm__ volatile(
        "xor                 %%"REG_a", %%"REG_a"   \n\t"
        ASMALIGN(4)
        "1:                                         \n\t"
        PREFETCH"    32(%1, %%"REG_a", 2)           \n\t"
        PREFETCH"    32(%2, %%"REG_a")              \n\t"
        PREFETCH"    32(%3, %%"REG_a")              \n\t"
        "movq          (%2, %%"REG_a"), %%mm0       \n\t" // U(0)
        "movq                    %%mm0, %%mm2       \n\t" // U(0)
        "movq          (%3, %%"REG_a"), %%mm1       \n\t" // V(0)
        "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
        "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)

        "movq        (%1, %%"REG_a",2), %%mm3       \n\t" // Y(0)
        "movq       8(%1, %%"REG_a",2), %%mm5       \n\t" // Y(8)
        "movq                    %%mm3, %%mm4       \n\t" // Y(0)
        "movq                    %%mm5, %%mm6       \n\t" // Y(8)
        "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
        "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
        "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
        "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)

        MOVNTQ"                  %%mm3,   (%0, %%"REG_a", 4)    \n\t"
        MOVNTQ"                  %%mm4,  8(%0, %%"REG_a", 4)    \n\t"
        MOVNTQ"                  %%mm5, 16(%0, %%"REG_a", 4)    \n\t"
        MOVNTQ"                  %%mm6, 24(%0, %%"REG_a", 4)    \n\t"

        "add                        $8, %%"REG_a"   \n\t"
        "cmp                        %4, %%"REG_a"   \n\t"
        " jb                        1b              \n\t"
        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
        : "%"REG_a
        );
#else

#if ARCH_ALPHA && HAVE_MVI
/* Pack one pixel pair for two lines at once using Alpha MVI unpack ops. */
#define pl2yuy2(n)                  \
    y1 = yc[n];                     \
    y2 = yc2[n];                    \
    u = uc[n];                      \
    v = vc[n];                      \
    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1));  \
    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2));  \
    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u));    \
    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v));    \
    yuv1 = (u << 8) + (v << 24);                \
    yuv2 = yuv1 + y2;               \
    yuv1 += y1;                     \
    qdst[n]  = yuv1;                \
    qdst2[n] = yuv2;

        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for (i = 0; i < chromWidth; i += 8){
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            __asm__("ldq $31,64(%0)" :: "r"(yc));
            __asm__("ldq $31,64(%0)" :: "r"(yc2));
            __asm__("ldq $31,64(%0)" :: "r"(uc));
            __asm__("ldq $31,64(%0)" :: "r"(vc));

            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);

            yc    += 4;
            yc2   += 4;
            uc    += 4;
            vc    += 4;
            qdst  += 4;
            qdst2 += 4;
        }
        /* This path emitted two lines per iteration, so advance the luma
           and destination an extra step here (the loop footer advances
           them once more). */
        y++;
        ysrc += lumStride;
        dst += dstStride;

#elif HAVE_FAST_64BIT
        /* Assemble two pixel pairs into one 64-bit store. */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        /* Portable path: one 32-bit store (Y U Y V bytes) per pixel pair. */
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                (yc[1] << 8) + (vc[0] << 0);
#else
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* Advance chroma only every vertLumPerChroma-th luma line. */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
#if HAVE_MMX
__asm__(    EMMS"       \n\t"
        SFENCE"     \n\t"
        :::"memory");
#endif
}
1628
1629/**
1630 * Height should be a multiple of 2 and width should be a multiple of 16.
1631 * (If this is a problem for anyone then tell me, and I will fix it.)
1632 */
1633static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1634                                      long width, long height,
1635                                      long lumStride, long chromStride, long dstStride)
1636{
1637    //FIXME interpolate chroma
1638    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1639}
1640
/*
 * Interleave planar Y, U, V into packed UYVY (byte order U Y V Y per
 * pixel pair, per the little-endian scalar path below), one output line
 * per luma line.  Same parameter contract as yuvPlanartoyuy2:
 * vertLumPerChroma must be a power of two, width is in luma pixels.
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const long chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#if HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        /* MMX: process 8 chroma samples (16 luma pixels, 32 output bytes)
           per iteration; luma is unpacked into the high bytes here,
           giving the UYVY ordering. */
        __asm__ volatile(
        "xor                %%"REG_a", %%"REG_a"    \n\t"
        ASMALIGN(4)
        "1:                                         \n\t"
        PREFETCH"   32(%1, %%"REG_a", 2)            \n\t"
        PREFETCH"   32(%2, %%"REG_a")               \n\t"
        PREFETCH"   32(%3, %%"REG_a")               \n\t"
        "movq         (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
        "movq                   %%mm0, %%mm2        \n\t" // U(0)
        "movq         (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
        "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
        "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)

        "movq       (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
        "movq      8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
        "movq                   %%mm0, %%mm4        \n\t" // Y(0)
        "movq                   %%mm2, %%mm6        \n\t" // Y(8)
        "punpcklbw              %%mm3, %%mm0        \n\t" // YUYV YUYV(0)
        "punpckhbw              %%mm3, %%mm4        \n\t" // YUYV YUYV(4)
        "punpcklbw              %%mm5, %%mm2        \n\t" // YUYV YUYV(8)
        "punpckhbw              %%mm5, %%mm6        \n\t" // YUYV YUYV(12)

        MOVNTQ"                 %%mm0,   (%0, %%"REG_a", 4)     \n\t"
        MOVNTQ"                 %%mm4,  8(%0, %%"REG_a", 4)     \n\t"
        MOVNTQ"                 %%mm2, 16(%0, %%"REG_a", 4)     \n\t"
        MOVNTQ"                 %%mm6, 24(%0, %%"REG_a", 4)     \n\t"

        "add                       $8, %%"REG_a"    \n\t"
        "cmp                       %4, %%"REG_a"    \n\t"
        " jb                       1b               \n\t"
        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
        : "%"REG_a
        );
#else
//FIXME adapt the Alpha ASM code from yv12->yuy2

#if HAVE_FAST_64BIT
        /* Assemble two pixel pairs into one 64-bit store. */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
            l = uc[1] + (yc[2] << 8) +
                (vc[1] << 16) + (yc[3] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        /* Portable path: one 32-bit store (U Y V Y bytes) per pixel pair. */
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                (vc[0] << 8) + (yc[1] << 0);
#else
            *idst++ = uc[0] + (yc[0] << 8) +
               (vc[0] << 16) + (yc[1] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* Advance chroma only every vertLumPerChroma-th luma line. */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#if HAVE_MMX
__asm__(    EMMS"       \n\t"
        SFENCE"     \n\t"
        :::"memory");
#endif
}
1734
1735/**
1736 * Height should be a multiple of 2 and width should be a multiple of 16
1737 * (If this is a problem for anyone then tell me, and I will fix it.)
1738 */
1739static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1740                                      long width, long height,
1741                                      long lumStride, long chromStride, long dstStride)
1742{
1743    //FIXME interpolate chroma
1744    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1745}
1746
1747/**
1748 * Width should be a multiple of 16.
1749 */
1750static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1751                                         long width, long height,
1752                                         long lumStride, long chromStride, long dstStride)
1753{
1754    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1755}
1756
1757/**
1758 * Width should be a multiple of 16.
1759 */
1760static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1761                                         long width, long height,
1762                                         long lumStride, long chromStride, long dstStride)
1763{
1764    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1765}
1766
1767/**
1768 * Height should be a multiple of 2 and width should be a multiple of 16.
1769 * (If this is a problem for anyone then tell me, and I will fix it.)
1770 */
1771static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1772                                      long width, long height,
1773                                      long lumStride, long chromStride, long srcStride)
1774{
1775    long y;
1776    const long chromWidth= width>>1;
1777    for (y=0; y<height; y+=2)
1778    {
1779#if HAVE_MMX
1780        __asm__ volatile(
1781        "xor                 %%"REG_a", %%"REG_a"   \n\t"
1782        "pcmpeqw                 %%mm7, %%mm7       \n\t"
1783        "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
1784        ASMALIGN(4)
1785        "1:                \n\t"
1786        PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
1787        "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
1788        "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
1789        "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
1790        "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
1791        "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
1792        "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
1793        "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
1794        "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
1795        "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
1796        "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)
1797
1798        MOVNTQ"                  %%mm2, (%1, %%"REG_a", 2)  \n\t"
1799
1800        "movq     16(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
1801        "movq     24(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
1802        "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
1803        "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
1804        "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
1805        "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
1806        "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
1807        "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
1808        "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
1809        "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)
1810
1811        MOVNTQ"                  %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1812
1813        "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
1814        "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
1815        "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
1816        "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
1817        "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
1818        "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
1819        "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
1820        "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)
1821
1822        MOVNTQ"                  %%mm0, (%3, %%"REG_a")     \n\t"
1823        MOVNTQ"                  %%mm2, (%2, %%"REG_a")     \n\t"
1824
1825        "add                        $8, %%"REG_a"   \n\t"
1826        "cmp                        %4, %%"REG_a"   \n\t"
1827        " jb                        1b              \n\t"
1828        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1829        : "memory", "%"REG_a
1830        );
1831
1832        ydst += lumStride;
1833        src  += srcStride;
1834
1835        __asm__ volatile(
1836        "xor                 %%"REG_a", %%"REG_a"   \n\t"
1837        ASMALIGN(4)
1838        "1:                                         \n\t"
1839        PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
1840        "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
1841        "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
1842        "movq     16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
1843        "movq     24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
1844        "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
1845        "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
1846        "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
1847        "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
1848        "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
1849        "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)
1850
1851        MOVNTQ"                  %%mm0,  (%1, %%"REG_a", 2) \n\t"
1852        MOVNTQ"                  %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1853
1854        "add                        $8, %%"REG_a"   \n\t"
1855        "cmp                        %4, %%"REG_a"   \n\t"
1856        " jb                        1b              \n\t"
1857
1858        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1859        : "memory", "%"REG_a
1860        );
1861#else
1862        long i;
1863        for (i=0; i<chromWidth; i++)
1864        {
1865            ydst[2*i+0]     = src[4*i+0];
1866            udst[i]     = src[4*i+1];
1867            ydst[2*i+1]     = src[4*i+2];
1868            vdst[i]     = src[4*i+3];
1869        }
1870        ydst += lumStride;
1871        src  += srcStride;
1872
1873        for (i=0; i<chromWidth; i++)
1874        {
1875            ydst[2*i+0]     = src[4*i+0];
1876            ydst[2*i+1]     = src[4*i+2];
1877        }
1878#endif
1879        udst += chromStride;
1880        vdst += chromStride;
1881        ydst += lumStride;
1882        src  += srcStride;
1883    }
1884#if HAVE_MMX
1885__asm__ volatile(   EMMS"       \n\t"
1886                SFENCE"     \n\t"
1887                :::"memory");
1888#endif
1889}
1890
/**
 * Copy the luma plane of a YVU9 (4:1:0) image into a YV12 buffer.
 *
 * WARNING: incomplete conversion — only the Y plane is copied; the U and V
 * destination planes are left untouched (usrc/vsrc/udst/vdst and chromStride
 * are currently unused). Callers must not rely on the chroma output.
 * NOTE(review): memcpy assumes the luma plane is contiguous, i.e.
 * lumStride == width — TODO confirm against callers; a strided source would
 * need a per-line copy.
 */
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
                                      uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height, long lumStride, long chromStride)
{
    /* Y Plane */
    memcpy(ydst, ysrc, width*height);

    /* XXX: implement upscaling for U,V */
}
1900
/**
 * Upscale one plane by 2x in both dimensions with bilinear-style
 * 3:1 / 1:3 weighting between neighboring samples.
 * NOTE(review): the MMX2/3DNow path processes srcWidth&~15 bytes per line and
 * reads at offset -1 from the loop start, so it appears to require
 * srcWidth >= 16 — TODO confirm the callers' minimum width.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
    long x,y;

    dst[0]= src[0];

    // first line
    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

        dst+= dstStride;

    for (y=1; y<srcHeight; y++){
#if HAVE_MMX2 || HAVE_AMD3DNOW
        // Number of source bytes handled by the asm loop (multiple of 16).
        const long mmxSize= srcWidth&~15;
        /* Loop runs REG_a from -mmxSize up to 0; pointers are pre-advanced by
         * mmxSize so offsets are negative. The doubled PAVGB per operand
         * approximates the (3*a + b + 2) >> 2 weighting of the C loop below. */
        __asm__ volatile(
        "mov           %4, %%"REG_a"            \n\t"
        "1:                                     \n\t"
        "movq         (%0, %%"REG_a"), %%mm0    \n\t"
        "movq         (%1, %%"REG_a"), %%mm1    \n\t"
        "movq        1(%0, %%"REG_a"), %%mm2    \n\t"
        "movq        1(%1, %%"REG_a"), %%mm3    \n\t"
        "movq       -1(%0, %%"REG_a"), %%mm4    \n\t"
        "movq       -1(%1, %%"REG_a"), %%mm5    \n\t"
        PAVGB"                  %%mm0, %%mm5    \n\t"
        PAVGB"                  %%mm0, %%mm3    \n\t"
        PAVGB"                  %%mm0, %%mm5    \n\t"
        PAVGB"                  %%mm0, %%mm3    \n\t"
        PAVGB"                  %%mm1, %%mm4    \n\t"
        PAVGB"                  %%mm1, %%mm2    \n\t"
        PAVGB"                  %%mm1, %%mm4    \n\t"
        PAVGB"                  %%mm1, %%mm2    \n\t"
        "movq                   %%mm5, %%mm7    \n\t"
        "movq                   %%mm4, %%mm6    \n\t"
        "punpcklbw              %%mm3, %%mm5    \n\t"
        "punpckhbw              %%mm3, %%mm7    \n\t"
        "punpcklbw              %%mm2, %%mm4    \n\t"
        "punpckhbw              %%mm2, %%mm6    \n\t"
#if 1
        MOVNTQ"                 %%mm5,  (%2, %%"REG_a", 2)  \n\t"
        MOVNTQ"                 %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
        MOVNTQ"                 %%mm4,  (%3, %%"REG_a", 2)  \n\t"
        MOVNTQ"                 %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
#else
        "movq                   %%mm5,  (%2, %%"REG_a", 2)  \n\t"
        "movq                   %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
        "movq                   %%mm4,  (%3, %%"REG_a", 2)  \n\t"
        "movq                   %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
#endif
        "add                       $8, %%"REG_a"            \n\t"
        " js                       1b                       \n\t"
        :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
           "g" (-mmxSize)
        : "%"REG_a

        );
#else
        // C-only build: let the scalar loop below do the whole line.
        const long mmxSize=1;
#endif
        // Left edge: only vertical interpolation is possible.
        dst[0        ]= (3*src[0] +   src[srcStride])>>2;
        dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

        // Scalar tail (or full line without MMX): diagonal 3:1 weighting.
        for (x=mmxSize-1; x<srcWidth-1; x++){
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        // Right edge: only vertical interpolation is possible.
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
#if 1
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    for (x=0; x<srcWidth; x++){
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif

#if HAVE_MMX
/* Leave MMX state and flush the write-combining buffers used by MOVNTQ. */
__asm__ volatile(   EMMS"       \n\t"
                SFENCE"     \n\t"
                :::"memory");
#endif
}
2002
2003/**
2004 * Height should be a multiple of 2 and width should be a multiple of 16.
2005 * (If this is a problem for anyone then tell me, and I will fix it.)
2006 * Chrominance data is only taken from every second line, others are ignored.
2007 * FIXME: Write HQ version.
2008 */
2009static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2010                                      long width, long height,
2011                                      long lumStride, long chromStride, long srcStride)
2012{
2013    long y;
2014    const long chromWidth= width>>1;
2015    for (y=0; y<height; y+=2)
2016    {
2017#if HAVE_MMX
2018        __asm__ volatile(
2019        "xor                 %%"REG_a", %%"REG_a"   \n\t"
2020        "pcmpeqw             %%mm7, %%mm7   \n\t"
2021        "psrlw                  $8, %%mm7   \n\t" // FF,00,FF,00...
2022        ASMALIGN(4)
2023        "1:                                 \n\t"
2024        PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
2025        "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
2026        "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
2027        "movq                %%mm0, %%mm2   \n\t" // UYVY UYVY(0)
2028        "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(4)
2029        "pand                %%mm7, %%mm0   \n\t" // U0V0 U0V0(0)
2030        "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(4)
2031        "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(0)
2032        "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(4)
2033        "packuswb            %%mm1, %%mm0   \n\t" // UVUV UVUV(0)
2034        "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(0)
2035
2036        MOVNTQ"              %%mm2,  (%1, %%"REG_a", 2) \n\t"
2037
2038        "movq     16(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(8)
2039        "movq     24(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(12)
2040        "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(8)
2041        "movq                %%mm2, %%mm4   \n\t" // UYVY UYVY(12)
2042        "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(8)
2043        "pand                %%mm7, %%mm2   \n\t" // U0V0 U0V0(12)
2044        "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(8)
2045        "psrlw                  $8, %%mm4   \n\t" // Y0Y0 Y0Y0(12)
2046        "packuswb            %%mm2, %%mm1   \n\t" // UVUV UVUV(8)
2047        "packuswb            %%mm4, %%mm3   \n\t" // YYYY YYYY(8)
2048
2049        MOVNTQ"              %%mm3, 8(%1, %%"REG_a", 2) \n\t"
2050
2051        "movq                %%mm0, %%mm2   \n\t" // UVUV UVUV(0)
2052        "movq                %%mm1, %%mm3   \n\t" // UVUV UVUV(8)
2053        "psrlw                  $8, %%mm0   \n\t" // V0V0 V0V0(0)
2054        "psrlw                  $8, %%mm1   \n\t" // V0V0 V0V0(8)
2055        "pand                %%mm7, %%mm2   \n\t" // U0U0 U0U0(0)
2056        "pand                %%mm7, %%mm3   \n\t" // U0U0 U0U0(8)
2057        "packuswb            %%mm1, %%mm0   \n\t" // VVVV VVVV(0)
2058        "packuswb            %%mm3, %%mm2   \n\t" // UUUU UUUU(0)
2059
2060        MOVNTQ"              %%mm0, (%3, %%"REG_a") \n\t"
2061        MOVNTQ"              %%mm2, (%2, %%"REG_a") \n\t"
2062
2063        "add                    $8, %%"REG_a"   \n\t"
2064        "cmp                    %4, %%"REG_a"   \n\t"
2065        " jb                    1b          \n\t"
2066        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2067        : "memory", "%"REG_a
2068        );
2069
2070        ydst += lumStride;
2071        src  += srcStride;
2072
2073        __asm__ volatile(
2074        "xor                 %%"REG_a", %%"REG_a"   \n\t"
2075        ASMALIGN(4)
2076        "1:                                 \n\t"
2077        PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
2078        "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // YUYV YUYV(0)
2079        "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // YUYV YUYV(4)
2080        "movq     16(%0, %%"REG_a", 4), %%mm2   \n\t" // YUYV YUYV(8)
2081        "movq     24(%0, %%"REG_a", 4), %%mm3   \n\t" // YUYV YUYV(12)
2082        "psrlw                  $8, %%mm0   \n\t" // Y0Y0 Y0Y0(0)
2083        "psrlw                  $8, %%mm1   \n\t" // Y0Y0 Y0Y0(4)
2084        "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(8)
2085        "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(12)
2086        "packuswb            %%mm1, %%mm0   \n\t" // YYYY YYYY(0)
2087        "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(8)
2088
2089        MOVNTQ"              %%mm0,  (%1, %%"REG_a", 2) \n\t"
2090        MOVNTQ"              %%mm2, 8(%1, %%"REG_a", 2) \n\t"
2091
2092        "add                    $8, %%"REG_a"   \n\t"
2093        "cmp                    %4, %%"REG_a"   \n\t"
2094        " jb                    1b          \n\t"
2095
2096        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2097        : "memory", "%"REG_a
2098        );
2099#else
2100        long i;
2101        for (i=0; i<chromWidth; i++)
2102        {
2103            udst[i]     = src[4*i+0];
2104            ydst[2*i+0] = src[4*i+1];
2105            vdst[i]     = src[4*i+2];
2106            ydst[2*i+1] = src[4*i+3];
2107        }
2108        ydst += lumStride;
2109        src  += srcStride;
2110
2111        for (i=0; i<chromWidth; i++)
2112        {
2113            ydst[2*i+0] = src[4*i+1];
2114            ydst[2*i+1] = src[4*i+3];
2115        }
2116#endif
2117        udst += chromStride;
2118        vdst += chromStride;
2119        ydst += lumStride;
2120        src  += srcStride;
2121    }
2122#if HAVE_MMX
2123__asm__ volatile(   EMMS"       \n\t"
2124                SFENCE"     \n\t"
2125                :::"memory");
2126#endif
2127}
2128
2129/**
2130 * Height should be a multiple of 2 and width should be a multiple of 2.
2131 * (If this is a problem for anyone then tell me, and I will fix it.)
2132 * Chrominance data is only taken from every second line,
2133 * others are ignored in the C version.
2134 * FIXME: Write HQ version.
2135 */
2136static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2137                                       long width, long height,
2138                                       long lumStride, long chromStride, long srcStride)
2139{
2140    long y;
2141    const long chromWidth= width>>1;
2142#if HAVE_MMX
2143    for (y=0; y<height-2; y+=2)
2144    {
2145        long i;
2146        for (i=0; i<2; i++)
2147        {
2148            __asm__ volatile(
2149            "mov                        %2, %%"REG_a"   \n\t"
2150            "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
2151            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2152            "pxor                    %%mm7, %%mm7       \n\t"
2153            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2154            ASMALIGN(4)
2155            "1:                                         \n\t"
2156            PREFETCH"    64(%0, %%"REG_d")              \n\t"
2157            "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2158            "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
2159            "punpcklbw               %%mm7, %%mm0       \n\t"
2160            "punpcklbw               %%mm7, %%mm1       \n\t"
2161            "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
2162            "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
2163            "punpcklbw               %%mm7, %%mm2       \n\t"
2164            "punpcklbw               %%mm7, %%mm3       \n\t"
2165            "pmaddwd                 %%mm6, %%mm0       \n\t"
2166            "pmaddwd                 %%mm6, %%mm1       \n\t"
2167            "pmaddwd                 %%mm6, %%mm2       \n\t"
2168            "pmaddwd                 %%mm6, %%mm3       \n\t"
2169#ifndef FAST_BGR2YV12
2170            "psrad                      $8, %%mm0       \n\t"
2171            "psrad                      $8, %%mm1       \n\t"
2172            "psrad                      $8, %%mm2       \n\t"
2173            "psrad                      $8, %%mm3       \n\t"
2174#endif
2175            "packssdw                %%mm1, %%mm0       \n\t"
2176            "packssdw                %%mm3, %%mm2       \n\t"
2177            "pmaddwd                 %%mm5, %%mm0       \n\t"
2178            "pmaddwd                 %%mm5, %%mm2       \n\t"
2179            "packssdw                %%mm2, %%mm0       \n\t"
2180            "psraw                      $7, %%mm0       \n\t"
2181
2182            "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2183            "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
2184            "punpcklbw               %%mm7, %%mm4       \n\t"
2185            "punpcklbw               %%mm7, %%mm1       \n\t"
2186            "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
2187            "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
2188            "punpcklbw               %%mm7, %%mm2       \n\t"
2189            "punpcklbw               %%mm7, %%mm3       \n\t"
2190            "pmaddwd                 %%mm6, %%mm4       \n\t"
2191            "pmaddwd                 %%mm6, %%mm1       \n\t"
2192            "pmaddwd                 %%mm6, %%mm2       \n\t"
2193            "pmaddwd                 %%mm6, %%mm3       \n\t"
2194#ifndef FAST_BGR2YV12
2195            "psrad                      $8, %%mm4       \n\t"
2196            "psrad                      $8, %%mm1       \n\t"
2197            "psrad                      $8, %%mm2       \n\t"
2198            "psrad                      $8, %%mm3       \n\t"
2199#endif
2200            "packssdw                %%mm1, %%mm4       \n\t"
2201            "packssdw                %%mm3, %%mm2       \n\t"
2202            "pmaddwd                 %%mm5, %%mm4       \n\t"
2203            "pmaddwd                 %%mm5, %%mm2       \n\t"
2204            "add                       $24, %%"REG_d"   \n\t"
2205            "packssdw                %%mm2, %%mm4       \n\t"
2206            "psraw                      $7, %%mm4       \n\t"
2207
2208            "packuswb                %%mm4, %%mm0       \n\t"
2209            "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
2210
2211            MOVNTQ"                  %%mm0, (%1, %%"REG_a") \n\t"
2212            "add                        $8,      %%"REG_a"  \n\t"
2213            " js                        1b                  \n\t"
2214            : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2215            : "%"REG_a, "%"REG_d
2216            );
2217            ydst += lumStride;
2218            src  += srcStride;
2219        }
2220        src -= srcStride*2;
2221        __asm__ volatile(
2222        "mov                        %4, %%"REG_a"   \n\t"
2223        "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2224        "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
2225        "pxor                    %%mm7, %%mm7       \n\t"
2226        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2227        "add                 %%"REG_d", %%"REG_d"   \n\t"
2228        ASMALIGN(4)
2229        "1:                                         \n\t"
2230        PREFETCH"    64(%0, %%"REG_d")              \n\t"
2231        PREFETCH"    64(%1, %%"REG_d")              \n\t"
2232#if HAVE_MMX2 || HAVE_AMD3DNOW
2233        "movq          (%0, %%"REG_d"), %%mm0       \n\t"
2234        "movq          (%1, %%"REG_d"), %%mm1       \n\t"
2235        "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
2236        "movq         6(%1, %%"REG_d"), %%mm3       \n\t"
2237        PAVGB"                   %%mm1, %%mm0       \n\t"
2238        PAVGB"                   %%mm3, %%mm2       \n\t"
2239        "movq                    %%mm0, %%mm1       \n\t"
2240        "movq                    %%mm2, %%mm3       \n\t"
2241        "psrlq                     $24, %%mm0       \n\t"
2242        "psrlq                     $24, %%mm2       \n\t"
2243        PAVGB"                   %%mm1, %%mm0       \n\t"
2244        PAVGB"                   %%mm3, %%mm2       \n\t"
2245        "punpcklbw               %%mm7, %%mm0       \n\t"
2246        "punpcklbw               %%mm7, %%mm2       \n\t"
2247#else
2248        "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2249        "movd          (%1, %%"REG_d"), %%mm1       \n\t"
2250        "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
2251        "movd         3(%1, %%"REG_d"), %%mm3       \n\t"
2252        "punpcklbw               %%mm7, %%mm0       \n\t"
2253        "punpcklbw               %%mm7, %%mm1       \n\t"
2254        "punpcklbw               %%mm7, %%mm2       \n\t"
2255        "punpcklbw               %%mm7, %%mm3       \n\t"
2256        "paddw                   %%mm1, %%mm0       \n\t"
2257        "paddw                   %%mm3, %%mm2       \n\t"
2258        "paddw                   %%mm2, %%mm0       \n\t"
2259        "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
2260        "movd         6(%1, %%"REG_d"), %%mm1       \n\t"
2261        "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
2262        "movd         9(%1, %%"REG_d"), %%mm3       \n\t"
2263        "punpcklbw               %%mm7, %%mm4       \n\t"
2264        "punpcklbw               %%mm7, %%mm1       \n\t"
2265        "punpcklbw               %%mm7, %%mm2       \n\t"
2266        "punpcklbw               %%mm7, %%mm3       \n\t"
2267        "paddw                   %%mm1, %%mm4       \n\t"
2268        "paddw                   %%mm3, %%mm2       \n\t"
2269        "paddw                   %%mm4, %%mm2       \n\t"
2270        "psrlw                      $2, %%mm0       \n\t"
2271        "psrlw                      $2, %%mm2       \n\t"
2272#endif
2273        "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2274        "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2275
2276        "pmaddwd                 %%mm0, %%mm1       \n\t"
2277        "pmaddwd                 %%mm2, %%mm3       \n\t"
2278        "pmaddwd                 %%mm6, %%mm0       \n\t"
2279        "pmaddwd                 %%mm6, %%mm2       \n\t"
2280#ifndef FAST_BGR2YV12
2281        "psrad                      $8, %%mm0       \n\t"
2282        "psrad                      $8, %%mm1       \n\t"
2283        "psrad                      $8, %%mm2       \n\t"
2284        "psrad                      $8, %%mm3       \n\t"
2285#endif
2286        "packssdw                %%mm2, %%mm0       \n\t"
2287        "packssdw                %%mm3, %%mm1       \n\t"
2288        "pmaddwd                 %%mm5, %%mm0       \n\t"
2289        "pmaddwd                 %%mm5, %%mm1       \n\t"
2290        "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
2291        "psraw                      $7, %%mm0       \n\t"
2292
2293#if HAVE_MMX2 || HAVE_AMD3DNOW
2294        "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
2295        "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
2296        "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
2297        "movq        18(%1, %%"REG_d"), %%mm3       \n\t"
2298        PAVGB"                   %%mm1, %%mm4       \n\t"
2299        PAVGB"                   %%mm3, %%mm2       \n\t"
2300        "movq                    %%mm4, %%mm1       \n\t"
2301        "movq                    %%mm2, %%mm3       \n\t"
2302        "psrlq                     $24, %%mm4       \n\t"
2303        "psrlq                     $24, %%mm2       \n\t"
2304        PAVGB"                   %%mm1, %%mm4       \n\t"
2305        PAVGB"                   %%mm3, %%mm2       \n\t"
2306        "punpcklbw               %%mm7, %%mm4       \n\t"
2307        "punpcklbw               %%mm7, %%mm2       \n\t"
2308#else
2309        "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2310        "movd        12(%1, %%"REG_d"), %%mm1       \n\t"
2311        "movd        15(%0, %%"REG_d"), %%mm2       \n\t"
2312        "movd        15(%1, %%"REG_d"), %%mm3       \n\t"
2313        "punpcklbw               %%mm7, %%mm4       \n\t"
2314        "punpcklbw               %%mm7, %%mm1       \n\t"
2315        "punpcklbw               %%mm7, %%mm2       \n\t"
2316        "punpcklbw               %%mm7, %%mm3       \n\t"
2317        "paddw                   %%mm1, %%mm4       \n\t"
2318        "paddw                   %%mm3, %%mm2       \n\t"
2319        "paddw                   %%mm2, %%mm4       \n\t"
2320        "movd        18(%0, %%"REG_d"), %%mm5       \n\t"
2321        "movd        18(%1, %%"REG_d"), %%mm1       \n\t"
2322        "movd        21(%0, %%"REG_d"), %%mm2       \n\t"
2323        "movd        21(%1, %%"REG_d"), %%mm3       \n\t"
2324        "punpcklbw               %%mm7, %%mm5       \n\t"
2325        "punpcklbw               %%mm7, %%mm1       \n\t"
2326        "punpcklbw               %%mm7, %%mm2       \n\t"
2327        "punpcklbw               %%mm7, %%mm3       \n\t"
2328        "paddw                   %%mm1, %%mm5       \n\t"
2329        "paddw                   %%mm3, %%mm2       \n\t"
2330        "paddw                   %%mm5, %%mm2       \n\t"
2331        "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2332        "psrlw                      $2, %%mm4       \n\t"
2333        "psrlw                      $2, %%mm2       \n\t"
2334#endif
2335        "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2336        "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2337
2338        "pmaddwd                 %%mm4, %%mm1       \n\t"
2339        "pmaddwd                 %%mm2, %%mm3       \n\t"
2340        "pmaddwd                 %%mm6, %%mm4       \n\t"
2341        "pmaddwd                 %%mm6, %%mm2       \n\t"
2342#ifndef FAST_BGR2YV12
2343        "psrad                      $8, %%mm4       \n\t"
2344        "psrad                      $8, %%mm1       \n\t"
2345        "psrad                      $8, %%mm2       \n\t"
2346        "psrad                      $8, %%mm3       \n\t"
2347#endif
2348        "packssdw                %%mm2, %%mm4       \n\t"
2349        "packssdw                %%mm3, %%mm1       \n\t"
2350        "pmaddwd                 %%mm5, %%mm4       \n\t"
2351        "pmaddwd                 %%mm5, %%mm1       \n\t"
2352        "add                       $24, %%"REG_d"   \n\t"
2353        "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
2354        "psraw                      $7, %%mm4       \n\t"
2355
2356        "movq                    %%mm0, %%mm1           \n\t"
2357        "punpckldq               %%mm4, %%mm0           \n\t"
2358        "punpckhdq               %%mm4, %%mm1           \n\t"
2359        "packsswb                %%mm1, %%mm0           \n\t"
2360        "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0         \n\t"
2361        "movd                    %%mm0, (%2, %%"REG_a") \n\t"
2362        "punpckhdq               %%mm0, %%mm0           \n\t"
2363        "movd                    %%mm0, (%3, %%"REG_a") \n\t"
2364        "add                        $4, %%"REG_a"       \n\t"
2365        " js                        1b                  \n\t"
2366        : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2367        : "%"REG_a, "%"REG_d
2368        );
2369
2370        udst += chromStride;
2371        vdst += chromStride;
2372        src  += srcStride*2;
2373    }
2374
2375    __asm__ volatile(   EMMS"       \n\t"
2376                    SFENCE"     \n\t"
2377                    :::"memory");
2378#else
2379    y=0;
2380#endif
2381    for (; y<height; y+=2)
2382    {
2383        long i;
2384        for (i=0; i<chromWidth; i++)
2385        {
2386            unsigned int b = src[6*i+0];
2387            unsigned int g = src[6*i+1];
2388            unsigned int r = src[6*i+2];
2389
2390            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2391            unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2392            unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2393
2394            udst[i]     = U;
2395            vdst[i]     = V;
2396            ydst[2*i]   = Y;
2397
2398            b = src[6*i+3];
2399            g = src[6*i+4];
2400            r = src[6*i+5];
2401
2402            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2403            ydst[2*i+1]     = Y;
2404        }
2405        ydst += lumStride;
2406        src  += srcStride;
2407
2408        for (i=0; i<chromWidth; i++)
2409        {
2410            unsigned int b = src[6*i+0];
2411            unsigned int g = src[6*i+1];
2412            unsigned int r = src[6*i+2];
2413
2414            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2415
2416            ydst[2*i]     = Y;
2417
2418            b = src[6*i+3];
2419            g = src[6*i+4];
2420            r = src[6*i+5];
2421
2422            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2423            ydst[2*i+1]     = Y;
2424        }
2425        udst += chromStride;
2426        vdst += chromStride;
2427        ydst += lumStride;
2428        src  += srcStride;
2429    }
2430}
2431
2432static void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2433                             long width, long height, long src1Stride,
2434                             long src2Stride, long dstStride){
2435    long h;
2436
2437    for (h=0; h < height; h++)
2438    {
2439        long w;
2440
2441#if HAVE_MMX
2442#if HAVE_SSE2
2443        __asm__(
2444        "xor              %%"REG_a", %%"REG_a"  \n\t"
2445        "1:                                     \n\t"
2446        PREFETCH" 64(%1, %%"REG_a")             \n\t"
2447        PREFETCH" 64(%2, %%"REG_a")             \n\t"
2448        "movdqa     (%1, %%"REG_a"), %%xmm0     \n\t"
2449        "movdqa     (%1, %%"REG_a"), %%xmm1     \n\t"
2450        "movdqa     (%2, %%"REG_a"), %%xmm2     \n\t"
2451        "punpcklbw           %%xmm2, %%xmm0     \n\t"
2452        "punpckhbw           %%xmm2, %%xmm1     \n\t"
2453        "movntdq             %%xmm0,   (%0, %%"REG_a", 2)   \n\t"
2454        "movntdq             %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
2455        "add                    $16, %%"REG_a"  \n\t"
2456        "cmp                     %3, %%"REG_a"  \n\t"
2457        " jb                     1b             \n\t"
2458        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2459        : "memory", "%"REG_a""
2460        );
2461#else
2462        __asm__(
2463        "xor %%"REG_a", %%"REG_a"               \n\t"
2464        "1:                                     \n\t"
2465        PREFETCH" 64(%1, %%"REG_a")             \n\t"
2466        PREFETCH" 64(%2, %%"REG_a")             \n\t"
2467        "movq       (%1, %%"REG_a"), %%mm0      \n\t"
2468        "movq      8(%1, %%"REG_a"), %%mm2      \n\t"
2469        "movq                 %%mm0, %%mm1      \n\t"
2470        "movq                 %%mm2, %%mm3      \n\t"
2471        "movq       (%2, %%"REG_a"), %%mm4      \n\t"
2472        "movq      8(%2, %%"REG_a"), %%mm5      \n\t"
2473        "punpcklbw            %%mm4, %%mm0      \n\t"
2474        "punpckhbw            %%mm4, %%mm1      \n\t"
2475        "punpcklbw            %%mm5, %%mm2      \n\t"
2476        "punpckhbw            %%mm5, %%mm3      \n\t"
2477        MOVNTQ"               %%mm0,   (%0, %%"REG_a", 2)   \n\t"
2478        MOVNTQ"               %%mm1,  8(%0, %%"REG_a", 2)   \n\t"
2479        MOVNTQ"               %%mm2, 16(%0, %%"REG_a", 2)   \n\t"
2480        MOVNTQ"               %%mm3, 24(%0, %%"REG_a", 2)   \n\t"
2481        "add                    $16, %%"REG_a"  \n\t"
2482        "cmp                     %3, %%"REG_a"  \n\t"
2483        " jb                     1b             \n\t"
2484        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2485        : "memory", "%"REG_a
2486        );
2487#endif
2488        for (w= (width&(~15)); w < width; w++)
2489        {
2490            dest[2*w+0] = src1[w];
2491            dest[2*w+1] = src2[w];
2492        }
2493#else
2494        for (w=0; w < width; w++)
2495        {
2496            dest[2*w+0] = src1[w];
2497            dest[2*w+1] = src2[w];
2498        }
2499#endif
2500        dest += dstStride;
2501                src1 += src1Stride;
2502                src2 += src2Stride;
2503    }
2504#if HAVE_MMX
2505    __asm__(
2506        EMMS"       \n\t"
2507        SFENCE"     \n\t"
2508        ::: "memory"
2509        );
2510#endif
2511}
2512
/**
 * Upsample two quarter-resolution chroma planes (src1, src2) by 2x in
 * each dimension into dst1/dst2: every source byte is written to a
 * 2x2 destination block (horizontal doubling via byte duplication,
 * vertical doubling via the (y>>1) source-row index).
 * All strides are in bytes.
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       long width, long height,
                                       long srcStride1, long srcStride2,
                                       long dstStride1, long dstStride2)
{
    long y,x,w,h;
    w=width/2; h=height/2;  /* source dimensions: half the destination's */
#if HAVE_MMX
    /* Warm the cache with the second source row of each plane. */
    __asm__ volatile(
    PREFETCH" %0    \n\t"
    PREFETCH" %1    \n\t"
    ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    /* First plane: src1 -> dst1. */
    for (y=0;y<h;y++){
    const uint8_t* s1=src1+srcStride1*(y>>1);  /* each source row feeds two output rows */
    uint8_t* d=dst1+dstStride1*y;
    x=0;
#if HAVE_MMX
    /* Process 32 source bytes (64 output bytes) per iteration:
     * punpcklbw/punpckhbw of a register with itself duplicates each byte. */
    for (;x<w-31;x+=32)
    {
        __asm__ volatile(
        PREFETCH"   32%1        \n\t"
        "movq         %1, %%mm0 \n\t"
        "movq        8%1, %%mm2 \n\t"
        "movq       16%1, %%mm4 \n\t"
        "movq       24%1, %%mm6 \n\t"
        "movq      %%mm0, %%mm1 \n\t"
        "movq      %%mm2, %%mm3 \n\t"
        "movq      %%mm4, %%mm5 \n\t"
        "movq      %%mm6, %%mm7 \n\t"
        "punpcklbw %%mm0, %%mm0 \n\t"
        "punpckhbw %%mm1, %%mm1 \n\t"
        "punpcklbw %%mm2, %%mm2 \n\t"
        "punpckhbw %%mm3, %%mm3 \n\t"
        "punpcklbw %%mm4, %%mm4 \n\t"
        "punpckhbw %%mm5, %%mm5 \n\t"
        "punpcklbw %%mm6, %%mm6 \n\t"
        "punpckhbw %%mm7, %%mm7 \n\t"
        MOVNTQ"    %%mm0,   %0  \n\t"
        MOVNTQ"    %%mm1,  8%0  \n\t"
        MOVNTQ"    %%mm2, 16%0  \n\t"
        MOVNTQ"    %%mm3, 24%0  \n\t"
        MOVNTQ"    %%mm4, 32%0  \n\t"
        MOVNTQ"    %%mm5, 40%0  \n\t"
        MOVNTQ"    %%mm6, 48%0  \n\t"
        MOVNTQ"    %%mm7, 56%0"
        :"=m"(d[2*x])
        :"m"(s1[x])
        :"memory");
    }
#endif
    /* Scalar tail (or full row without MMX): duplicate each byte. */
    for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    /* Second plane: src2 -> dst2, same scheme as above. */
    for (y=0;y<h;y++){
    const uint8_t* s2=src2+srcStride2*(y>>1);
    uint8_t* d=dst2+dstStride2*y;
    x=0;
#if HAVE_MMX
    for (;x<w-31;x+=32)
    {
        __asm__ volatile(
        PREFETCH"   32%1        \n\t"
        "movq         %1, %%mm0 \n\t"
        "movq        8%1, %%mm2 \n\t"
        "movq       16%1, %%mm4 \n\t"
        "movq       24%1, %%mm6 \n\t"
        "movq      %%mm0, %%mm1 \n\t"
        "movq      %%mm2, %%mm3 \n\t"
        "movq      %%mm4, %%mm5 \n\t"
        "movq      %%mm6, %%mm7 \n\t"
        "punpcklbw %%mm0, %%mm0 \n\t"
        "punpckhbw %%mm1, %%mm1 \n\t"
        "punpcklbw %%mm2, %%mm2 \n\t"
        "punpckhbw %%mm3, %%mm3 \n\t"
        "punpcklbw %%mm4, %%mm4 \n\t"
        "punpckhbw %%mm5, %%mm5 \n\t"
        "punpcklbw %%mm6, %%mm6 \n\t"
        "punpckhbw %%mm7, %%mm7 \n\t"
        MOVNTQ"    %%mm0,   %0  \n\t"
        MOVNTQ"    %%mm1,  8%0  \n\t"
        MOVNTQ"    %%mm2, 16%0  \n\t"
        MOVNTQ"    %%mm3, 24%0  \n\t"
        MOVNTQ"    %%mm4, 32%0  \n\t"
        MOVNTQ"    %%mm5, 40%0  \n\t"
        MOVNTQ"    %%mm6, 48%0  \n\t"
        MOVNTQ"    %%mm7, 56%0"
        :"=m"(d[2*x])
        :"m"(s2[x])
        :"memory");
    }
#endif
    for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#if HAVE_MMX
    /* Leave MMX state and flush the non-temporal write buffers. */
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
        );
#endif
}
2615
/**
 * Convert planar YVU9 (src1 = Y, src2 = U, src3 = V; chroma planes
 * subsampled by 4 in both directions, see the y>>2 row index and the
 * x vs. 4*x column mapping) to packed YUYV (Y0 U0 Y1 V0 ...) in dst.
 * Each chroma sample is reused for 4 consecutive luma samples
 * horizontally and 4 rows vertically.  Strides are in bytes.
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        long width, long height,
                                        long srcStride1, long srcStride2,
                                        long srcStride3, long dstStride)
{
    long y,x,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++){
    const uint8_t* yp=src1+srcStride1*y;
    const uint8_t* up=src2+srcStride2*(y>>2);  /* chroma: one row per 4 luma rows */
    const uint8_t* vp=src3+srcStride3*(y>>2);
    uint8_t* d=dst+dstStride*y;
    x=0;
#if HAVE_MMX
    /* 8 chroma samples (32 luma samples, 64 output bytes) per iteration. */
    for (;x<w-7;x+=8)
    {
        __asm__ volatile(
        PREFETCH"   32(%1, %0)          \n\t"
        PREFETCH"   32(%2, %0)          \n\t"
        PREFETCH"   32(%3, %0)          \n\t"
        "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
        "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
        "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
        "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
        "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
        "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
        "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
        "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
        "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
        "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */

        "movq            %%mm1, %%mm6   \n\t"
        "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
        "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
        "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
        MOVNTQ"          %%mm0,  (%4, %0, 8)    \n\t"
        MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"

        "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
        "movq     8(%1, %0, 4), %%mm0   \n\t"
        "movq            %%mm0, %%mm3   \n\t"
        "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
        "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
        MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
        MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"

        "movq            %%mm4, %%mm6   \n\t"
        "movq    16(%1, %0, 4), %%mm0   \n\t"
        "movq            %%mm0, %%mm3   \n\t"
        "punpcklbw       %%mm5, %%mm4   \n\t"
        "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
        "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
        MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
        MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"

        "punpckhbw       %%mm5, %%mm6   \n\t"
        "movq    24(%1, %0, 4), %%mm0   \n\t"
        "movq            %%mm0, %%mm3   \n\t"
        "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
        "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
        MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
        MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"

        : "+r" (x)
        : "r"(yp), "r" (up), "r"(vp), "r"(d)
        :"memory");
    }
#endif
    /* Scalar tail (or full row without MMX): emit 8 output bytes
     * (4 luma + the same U/V pair twice) per chroma index x. */
    for (; x<w; x++)
    {
        const long x2 = x<<2;  /* 4 luma samples per chroma sample */
        d[8*x+0] = yp[x2];
        d[8*x+1] = up[x];
        d[8*x+2] = yp[x2+1];
        d[8*x+3] = vp[x];
        d[8*x+4] = yp[x2+2];
        d[8*x+5] = up[x];
        d[8*x+6] = yp[x2+3];
        d[8*x+7] = vp[x];
    }
    }
#if HAVE_MMX
    /* Leave MMX state and flush the non-temporal write buffers. */
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
        );
#endif
}
2706
2707static inline void RENAME(rgb2rgb_init)(void){
2708    rgb15to16       = RENAME(rgb15to16);
2709    rgb15tobgr24    = RENAME(rgb15tobgr24);
2710    rgb15to32       = RENAME(rgb15to32);
2711    rgb16tobgr24    = RENAME(rgb16tobgr24);
2712    rgb16to32       = RENAME(rgb16to32);
2713    rgb16to15       = RENAME(rgb16to15);
2714    rgb24tobgr16    = RENAME(rgb24tobgr16);
2715    rgb24tobgr15    = RENAME(rgb24tobgr15);
2716    rgb24tobgr32    = RENAME(rgb24tobgr32);
2717    rgb32to16       = RENAME(rgb32to16);
2718    rgb32to15       = RENAME(rgb32to15);
2719    rgb32tobgr24    = RENAME(rgb32tobgr24);
2720    rgb24to15       = RENAME(rgb24to15);
2721    rgb24to16       = RENAME(rgb24to16);
2722    rgb24tobgr24    = RENAME(rgb24tobgr24);
2723    rgb32tobgr32    = RENAME(rgb32tobgr32);
2724    rgb32tobgr16    = RENAME(rgb32tobgr16);
2725    rgb32tobgr15    = RENAME(rgb32tobgr15);
2726    yv12toyuy2      = RENAME(yv12toyuy2);
2727    yv12touyvy      = RENAME(yv12touyvy);
2728    yuv422ptoyuy2   = RENAME(yuv422ptoyuy2);
2729    yuv422ptouyvy   = RENAME(yuv422ptouyvy);
2730    yuy2toyv12      = RENAME(yuy2toyv12);
2731//    uyvytoyv12      = RENAME(uyvytoyv12);
2732//    yvu9toyv12      = RENAME(yvu9toyv12);
2733    planar2x        = RENAME(planar2x);
2734    rgb24toyv12     = RENAME(rgb24toyv12);
2735    interleaveBytes = RENAME(interleaveBytes);
2736    vu9_to_vu12     = RENAME(vu9_to_vu12);
2737    yvu9_to_yuy2    = RENAME(yvu9_to_yuy2);
2738}
2739