1/*
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
/* Reset the instruction-selection macros so this template can be
 * re-#included with different COMPILE_TEMPLATE_* settings (MMX vs
 * MMX2 vs 3DNow!) without redefinition warnings. */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
25
/* Pick the prefetch mnemonic for the current template flavor.
 * The macro expands to the bare mnemonic; callers append the operand. */
#if COMPILE_TEMPLATE_AMD3DNOW
/* 3DNow! prefetch */
#define PREFETCH  "prefetch"
#elif COMPILE_TEMPLATE_MMX2
/* SSE non-temporal prefetch (minimizes cache pollution) */
#define PREFETCH "prefetchnta"
#else
/* plain MMX has no prefetch instruction; emit an asm comment no-op */
#define PREFETCH  " # nop"
#endif
33
/* Packed unsigned byte average: pavgb on MMX2, the 3DNow! equivalent
 * pavgusb otherwise.
 * NOTE(review): deliberately left undefined for plain MMX — presumably
 * PAVGB is only referenced from MMX2/3DNow! code paths; confirm before
 * using it in a shared path. */
#if COMPILE_TEMPLATE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif COMPILE_TEMPLATE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
39
/* 64-bit store: non-temporal movntq on MMX2 (bypasses the cache for
 * write-once destinations), plain movq otherwise.  MOVNTQ is a second
 * expansion layer so that macro arguments are expanded before
 * stringization. */
#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
46
/* The AltiVec implementation is pulled in as template source (not a
 * standalone translation unit) when building the AltiVec flavor. */
#if COMPILE_TEMPLATE_ALTIVEC
#include "ppc/swscale_altivec_template.c"
#endif
50
/*
 * Multi-tap vertical scale to 8-bit planar output, low-precision variant.
 * Walks the filter list at " offset "(%0) — 16-byte entries of
 * {source pointer, coefficient} terminated by a NULL pointer (test/jnz) —
 * accumulating pmulhw products into mm3/mm4, which are pre-seeded with the
 * rounding constant at VROUNDER_OFFSET.  The sums are then >>3, packed with
 * unsigned saturation, and 8 output bytes are stored per outer iteration;
 * the outer loop runs while the pixel index in REG_a is below `width`.
 *   x      - extra constant byte offset applied to every source read
 *   offset - byte offset of the MMX filter block inside *%0 (&c->redDither)
 * NOTE(review): the MOVNTQ address uses %%REGa (no underscore) — assumed
 * to be an alias of REG_a provided by the surrounding headers; confirm.
 */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
        "movq                             %%mm3, %%mm4      \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        ASMALIGN(4) /* FIXME Unroll? */\
        "1:                                                 \n\t"\
        "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
        "add                                $16, %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
        "pmulhw                           %%mm0, %%mm2      \n\t"\
        "pmulhw                           %%mm0, %%mm5      \n\t"\
        "paddw                            %%mm2, %%mm3      \n\t"\
        "paddw                            %%mm5, %%mm4      \n\t"\
        " jnz                                1b             \n\t"\
        "psraw                               $3, %%mm3      \n\t"\
        "psraw                               $3, %%mm4      \n\t"\
        "packuswb                         %%mm4, %%mm3      \n\t"\
        MOVNTQ(%%mm3, (%1, %%REGa))\
        "add                                 $8, %%"REG_a"  \n\t"\
        "cmp                                 %2, %%"REG_a"  \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
        "movq                             %%mm3, %%mm4      \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "jb                                  1b             \n\t"\
        :: "r" (&c->redDither),\
        "r" (dest), "g" (width)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
86
/*
 * High-precision variant of YSCALEYUV2YV12X.  Instead of pmulhw per tap,
 * it processes taps in pairs: samples from tap N and tap N+1 (the second
 * source pointer is read at APCK_PTR2, the paired coefficients at
 * APCK_COEF) are interleaved with punpcklwd/punpckhwd and multiplied with
 * pmaddwd, accumulating full 32-bit sums in mm4..mm7.  After the list
 * terminates (NULL pointer at APCK_SIZE, test/jnz), the sums are >>16,
 * repacked to words, biased by the rounder at VROUNDER_OFFSET, >>3, and
 * packed with unsigned saturation to 8 output bytes per outer iteration.
 * Operand/argument layout is the same as YSCALEYUV2YV12X.
 */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
        "pxor                             %%mm4, %%mm4      \n\t"\
        "pxor                             %%mm5, %%mm5      \n\t"\
        "pxor                             %%mm6, %%mm6      \n\t"\
        "pxor                             %%mm7, %%mm7      \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        ASMALIGN(4) \
        "1:                                                 \n\t"\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
        "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
        "movq                             %%mm0, %%mm3      \n\t"\
        "punpcklwd                        %%mm1, %%mm0      \n\t"\
        "punpckhwd                        %%mm1, %%mm3      \n\t"\
        "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
        "pmaddwd                          %%mm1, %%mm3      \n\t"\
        "paddd                            %%mm0, %%mm4      \n\t"\
        "paddd                            %%mm3, %%mm5      \n\t"\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
        "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
        "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
        "movq                             %%mm2, %%mm0      \n\t"\
        "punpcklwd                        %%mm3, %%mm2      \n\t"\
        "punpckhwd                        %%mm3, %%mm0      \n\t"\
        "pmaddwd                          %%mm1, %%mm2      \n\t"\
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
        "paddd                            %%mm2, %%mm6      \n\t"\
        "paddd                            %%mm0, %%mm7      \n\t"\
        " jnz                                1b             \n\t"\
        "psrad                              $16, %%mm4      \n\t"\
        "psrad                              $16, %%mm5      \n\t"\
        "psrad                              $16, %%mm6      \n\t"\
        "psrad                              $16, %%mm7      \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
        "packssdw                         %%mm5, %%mm4      \n\t"\
        "packssdw                         %%mm7, %%mm6      \n\t"\
        "paddw                            %%mm0, %%mm4      \n\t"\
        "paddw                            %%mm0, %%mm6      \n\t"\
        "psraw                               $3, %%mm4      \n\t"\
        "psraw                               $3, %%mm6      \n\t"\
        "packuswb                         %%mm6, %%mm4      \n\t"\
        MOVNTQ(%%mm4, (%1, %%REGa))\
        "add                                 $8, %%"REG_a"  \n\t"\
        "cmp                                 %2, %%"REG_a"  \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "pxor                             %%mm4, %%mm4      \n\t"\
        "pxor                             %%mm5, %%mm5      \n\t"\
        "pxor                             %%mm6, %%mm6      \n\t"\
        "pxor                             %%mm7, %%mm7      \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "jb                                  1b             \n\t"\
        :: "r" (&c->redDither),\
        "r" (dest), "g" (width)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
148
/*
 * 1:1 vertical case: convert a 16-bit intermediate buffer straight to
 * 8-bit output — arithmetic >>7 then unsigned-saturating pack, 8 pixels
 * per iteration.  Positional operands: %0 = source, %1 = dest,
 * %2 = loop counter.
 * NOTE(review): the add/jnc loop implies %2 starts negative (-width)
 * with %0/%1 addressed from the buffer ends — confirm against callers.
 */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
161
/*
 * Rounding variant of YSCALEYUV2YV121: builds the per-word constant 64
 * without a memory load (pcmpeqw -> 0xFFFF, psrlw $15 -> 1, psllw $6
 * -> 0x0040) and adds it with signed saturation before the >>7, so the
 * 16->8 bit conversion rounds to nearest instead of truncating.
 * Same positional operands as YSCALEYUV2YV121.
 */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
    "psrlw                 $15, %%mm7     \n\t"\
    "psllw                  $6, %%mm7     \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddsw             %%mm7, %%mm0      \n\t"\
    "paddsw             %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
179
180/*
181    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
182       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
183       "r" (dest), "m" (dstW),
184       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
185    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
186*/
/*
 * Chroma half of the multi-tap packed-pixel vertical scaler.  Opens the
 * asm statement and the outer "1:" per-pixel-group loop, then runs the
 * chroma filter list at CHR_MMX_FILTER_OFFSET (NULL-pointer terminated),
 * accumulating U into mm3 and V into mm4 via pmulhw; V samples sit VOF
 * bytes after U in each source line.  Both accumulators are seeded with
 * the rounder at VROUNDER_OFFSET.  Must be paired with a *_YA body and
 * closed by YSCALEYUV2PACKEDX_END.
 */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor                   %%"REG_a", %%"REG_a"     \n\t"\
        ASMALIGN(4)\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
        "movq                      %%mm3, %%mm4         \n\t"\
        ASMALIGN(4)\
        "2:                                             \n\t"\
        "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
        "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
        "add                         $16, %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "pmulhw                    %%mm0, %%mm2         \n\t"\
        "pmulhw                    %%mm0, %%mm5         \n\t"\
        "paddw                     %%mm2, %%mm3         \n\t"\
        "paddw                     %%mm5, %%mm4         \n\t"\
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
        " jnz                         2b                \n\t"\
210
/*
 * Luma half of the multi-tap packed-pixel scaler: runs the filter list
 * at `offset`, accumulating two groups of 4 Y samples into dst1/dst2
 * (seeded with the rounder).  All working registers are passed in so the
 * same body can be reused with a second register set — presumably for an
 * alpha plane; confirm at the call sites.  Continues the asm opened by
 * YSCALEYUV2PACKEDX_UV (reuses local label "2:").
 */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
    "movq                    "#dst1", "#dst2"       \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                 "#coeff", "#src1"       \n\t"\
    "pmulhw                 "#coeff", "#src2"       \n\t"\
    "paddw                   "#src1", "#dst1"       \n\t"\
    "paddw                   "#src2", "#dst2"       \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\
229
/* Standard combination: chroma pass, then luma into mm1 (Y1) / mm7 (Y2)
 * — the register layout YSCALEYUV2RGBX expects. */
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
233
/* Closes the asm statement opened by YSCALEYUV2PACKEDX(_ACCURATE)_UV.
 * The three "m"(dummy) operands pad the numbering so that dest is %4 and
 * dstW is %5 for the code emitted between open and close. */
#define YSCALEYUV2PACKEDX_END                     \
        :: "r" (&c->redDither),                   \
            "m" (dummy), "m" (dummy), "m" (dummy),\
            "r" (dest), "m" (dstW)                \
        : "%"REG_a, "%"REG_d, "%"REG_S            \
    );
240
/*
 * High-precision chroma pass for packed output: same paired-tap pmaddwd
 * scheme as YSCALEYUV2YV12X_ACCURATE (APCK_PTR2/APCK_COEF layout, 32-bit
 * accumulation in mm4..mm7, >>16, packssdw, rounder add).  The finished
 * U (mm4) and V (mm6) words are parked in the context's U_TEMP/V_TEMP
 * scratch slots because the luma pass needs every MMX register.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a"                       \n\t"\
        ASMALIGN(4)\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "pxor                      %%mm4, %%mm4         \n\t"\
        "pxor                      %%mm5, %%mm5         \n\t"\
        "pxor                      %%mm6, %%mm6         \n\t"\
        "pxor                      %%mm7, %%mm7         \n\t"\
        ASMALIGN(4)\
        "2:                                             \n\t"\
        "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
        "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
        "movq                      %%mm0, %%mm3         \n\t"\
        "punpcklwd                 %%mm1, %%mm0         \n\t"\
        "punpckhwd                 %%mm1, %%mm3         \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm3         \n\t"\
        "paddd                     %%mm0, %%mm4         \n\t"\
        "paddd                     %%mm3, %%mm5         \n\t"\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
        "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
        "movq                      %%mm2, %%mm0         \n\t"\
        "punpcklwd                 %%mm3, %%mm2         \n\t"\
        "punpckhwd                 %%mm3, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm2         \n\t"\
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "paddd                     %%mm2, %%mm6         \n\t"\
        "paddd                     %%mm0, %%mm7         \n\t"\
        " jnz                         2b                \n\t"\
        "psrad                       $16, %%mm4         \n\t"\
        "psrad                       $16, %%mm5         \n\t"\
        "psrad                       $16, %%mm6         \n\t"\
        "psrad                       $16, %%mm7         \n\t"\
        "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
        "packssdw                  %%mm5, %%mm4         \n\t"\
        "packssdw                  %%mm7, %%mm6         \n\t"\
        "paddw                     %%mm0, %%mm4         \n\t"\
        "paddw                     %%mm0, %%mm6         \n\t"\
        "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
        "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
290
/*
 * High-precision luma pass: paired-tap pmaddwd accumulation (same APCK
 * scheme as the UV pass) into mm1/mm5 and mm7/mm6, then >>16, packssdw
 * and rounder add, yielding Y1 in mm1 and Y2 in mm7.  Finally reloads
 * the chroma saved by the UV pass from U_TEMP/V_TEMP into mm3/mm4, so
 * the register state (mm1=Y1, mm7=Y2, mm3=U, mm4=V) matches what
 * YSCALEYUV2RGBX consumes.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\
335
/* High-precision combination: chroma pass (results via U_TEMP/V_TEMP),
 * then luma pass, leaving mm1=Y1, mm7=Y2, mm3=U, mm4=V. */
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
339
/*
 * YUV -> RGB matrix for 2x4 pixels.  Input: mm1 = Y1 (4 samples),
 * mm7 = Y2 (next 4), mm3 = U, mm4 = V; coefficient/offset table lives in
 * the context at %0 (U/V/Y offsets and UB/UG/VG/VR/Y coefficients).
 * U/V are de-biased, multiplied into blue/green/red contributions,
 * duplicated per luma sample with punpck{l,h}wd and added to the scaled
 * luma.  Output (per the trailing comment): packed bytes with B in mm2,
 * R in mm5, G in mm4.
 */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"\
375
/*
 * 2-tap vertical blend for packed-YUV output (no RGB conversion):
 * out = buf1 + ((buf0-buf1)*alpha >> 16), computed with psubw/pmulhw/
 * paddw for both chroma (uvbuf0/uvbuf1, V at +VOF) and luma (buf0/buf1).
 * NOTE: the prologue destructively rewrites the blend coefficients at
 * CHR/LUM_MMX_FILTER_OFFSET+8 in the context, dividing them by 8
 * (psraw $3 then store back) to match the >>7 sample scaling here.
 * Leaves mm3=U, mm4=V, mm1=Y1, mm7=Y2 at the top of loop "1:".
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

/* argument-expanding wrapper */
#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
413
/*
 * 2-tap chroma blend for the RGB path: interpolates U/V between uvbuf0
 * (%2) and uvbuf1 (%3) using the coefficient at CHR_MMX_FILTER_OFFSET+8,
 * keeping >>4 scaling, then de-biases with U/V_OFFSET and premultiplies
 * the green contributions.  Opens loop "1:"; leaves mm2=(U-128)8,
 * mm5=(V-128)8, mm3=ug, mm4=vg for REAL_YSCALEYUV2RGB_COEFF.
 */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
438
/*
 * 2-tap luma blend for the RGB path: interpolates between buffers b1 and
 * b2 with the coefficient at LUM_MMX_FILTER_OFFSET+8, yielding Y1 in mm1
 * and Y2 in mm7 (>>4 scaled) for REAL_YSCALEYUV2RGB_COEFF.
 */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
452
/*
 * Final RGB matrix stage (context register parameterized as `c`): same
 * math as YSCALEYUV2RGBX — scales blue/red chroma contributions, applies
 * Y offset/coefficient to mm1/mm7, fans chroma out per luma sample and
 * packs to bytes: B in mm2, R in mm5, G in mm4.
 */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\

/* argument-expanding wrapper */
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

/* Full 2-tap YUV->RGB pipeline: chroma blend, luma blend from %0/%1,
 * then the matrix stage. */
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
488
/*
 * 1-tap packed-YUV load: no vertical blend — reads a single chroma buffer
 * (%2, V at +VOF) and a single luma buffer (%0), scaling each >>7 into
 * mm3/mm4 (U/V) and mm1/mm7 (Y1/Y2).  Opens loop "1:".
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t" \

/* argument-expanding wrapper */
#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
503
/*
 * 1-tap YUV->RGB: single chroma buffer (%2, V at +VOF) and single luma
 * buffer (%0), both >>4 instead of a 2-tap blend, followed by the same
 * de-bias/coefficient/fan-out/pack sequence as the blended path.
 * Output packed bytes: B in mm2, R in mm5, G in mm4.  Opens loop "1:".
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\

/* argument-expanding wrapper */
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
552
/*
 * Packed-YUV load with chroma from the mean of two buffers: adds
 * uvbuf0 (%2) and uvbuf1 (%3) samples (V at +VOF) and shifts the sum
 * >>8 (one more than the single-buffer >>7, i.e. averaging), while
 * luma still comes from buf0 (%0) alone, >>7.  Opens loop "1:".
 * NOTE(review): presumably selected when both chroma lines carry equal
 * weight — confirm against the caller's uvalpha handling.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
/* argument-expanding wrapper */
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
570
// do vertical chrominance interpolation
/* Like YSCALEYUV2RGB1 but averages two chroma lines (uvbuf0+uvbuf1);
 * converts YUV to RGB coefficient products and finishes with packed
 * bytes: mm2=B, mm4=G, mm5=R (as consumed by the WRITE* macros). */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\

#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
624
/* Load 8 alpha samples from abuf0 (%1), scale them >>7 back to 0..255
 * and pack them to bytes in mm7 (mm1 is clobbered as scratch). */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
    "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
    "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
    "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
    "packuswb          %%mm1, %%mm7     \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
632
/* Interleave the packed byte registers b (blue), g (green), r (red) and
 * a (alpha) into four ARGB qwords and store 32 bytes at dst+index*4;
 * q0/q2/q3/t are scratch registers.  Advances index by 8 pixels and
 * loops back to label "1" while index < dstw. */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq       "#b", "#q2"     \n\t" /* B */\
    "movq       "#r", "#t"      \n\t" /* R */\
    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
656
/* Pack mm2=B, mm4=G, mm5=R (packed bytes) into RGB565 words and store
 * 16 bytes (8 pixels) at dst+index*2.  Callers must have zeroed mm7
 * beforehand (used for byte->word widening).  Loops via label "1"
 * until index >= dstw. */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G (6 bits) */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
684
/* Same as WRITERGB16 but packs into RGB555 (5 bits per component, top
 * bit unused): mm2=B, mm4=G, mm5=R packed bytes in, 8 pixels stored at
 * dst+index*2.  mm7 must be zero on entry.  Loops via label "1". */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
713
/* Older shift/mask-based 24-bit writer (kept for reference; WRITEBGR24
 * maps to the MMX/MMX2 variants below).  Expands mm2=B, mm4=G, mm5=R
 * (mm7=0) into four 0RGB qwords, then squeezes them into three qwords
 * of packed 24bpp pixels.  dst must be a register: it is advanced by
 * 24 bytes per iteration, index by 8 pixels. */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"
769
/* Plain-MMX 24bpp writer: expands mm2=B, mm4=G, mm5=R (mm7=0) into four
 * 0RGBRGB0 qwords via punpckhdq, then shifts/ORs them into three qwords
 * of packed pixels.  dst must be a register; advanced by 24 bytes per
 * 8-pixel iteration. */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
822
/* MMX2 24bpp writer: uses pshufw (MMX2-only) plus the ff_M24A/B/C masks
 * to gather B/G/R bytes directly into the three output qwords, avoiding
 * the long shift/OR chains of the plain-MMX version.  Inputs: mm2=B,
 * mm4=G, mm5=R packed bytes; mm0/mm7 are loaded with constant masks.
 * dst must be a register; advanced 24 bytes per 8-pixel iteration. */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"
870
/* Select the BGR24 writer for the target CPU: the MMX2 variant relies on
 * pshufw, which plain MMX does not have. */
#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
878
/* Pack mm3=U, mm4=V and mm1/mm7=luma (as set up by the callers) into
 * interleaved YUYV and store 16 bytes at dst+index*2; loops back to
 * label "1" while index < dstw. */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t" /* UVUVUVUV */\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t" /* YUYV low half */\
    "punpckhbw %%mm3, %%mm7     \n\t" /* YUYV high half */\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
895
896
/**
 * Vertically filter the 16-bit intermediate planes into planar YV12
 * output (Y, optionally U/V, optionally alpha).  Uses the MMX paths
 * unless bit-exact output was requested; otherwise falls back to the
 * AltiVec or plain-C implementation.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (uDest) {
                /* U plane, then V plane (V lives at offset VOF in the chroma buffers) */
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        } else {
            if (uDest) {
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    /* NOTE: the AltiVec path does not take alpSrc/aDest — no alpha support there */
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!COMPILE_TEMPLATE_ALTIVEC
}
937
/**
 * Vertically filter the intermediate planes into an interleaved-chroma
 * (NV12-style) layout, selected by dstFormat.  No SIMD implementation
 * exists for this path, so it always forwards to the C version.
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
946
947static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
948                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
949{
950    int i;
951#if COMPILE_TEMPLATE_MMX
952    if(!(c->flags & SWS_BITEXACT)) {
953        long p= 4;
954        const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
955        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
956        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
957
958        if (c->flags & SWS_ACCURATE_RND) {
959            while(p--) {
960                if (dst[p]) {
961                    __asm__ volatile(
962                        YSCALEYUV2YV121_ACCURATE
963                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
964                        "g" (-counter[p])
965                        : "%"REG_a
966                    );
967                }
968            }
969        } else {
970            while(p--) {
971                if (dst[p]) {
972                    __asm__ volatile(
973                        YSCALEYUV2YV121
974                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
975                        "g" (-counter[p])
976                        : "%"REG_a
977                    );
978                }
979            }
980        }
981        return;
982    }
983#endif
984    for (i=0; i<dstW; i++) {
985        int val= (lumSrc[i]+64)>>7;
986
987        if (val&256) {
988            if (val<0) val=0;
989            else       val=255;
990        }
991
992        dest[i]= val;
993    }
994
995    if (uDest)
996        for (i=0; i<chrDstW; i++) {
997            int u=(chrSrc[i       ]+64)>>7;
998            int v=(chrSrc[i + VOFW]+64)>>7;
999
1000            if ((u|v)&256) {
1001                if (u<0)        u=0;
1002                else if (u>255) u=255;
1003                if (v<0)        v=0;
1004                else if (v>255) v=255;
1005            }
1006
1007            uDest[i]= u;
1008            vDest[i]= v;
1009        }
1010
1011    if (CONFIG_SWSCALE_ALPHA && aDest)
1012        for (i=0; i<dstW; i++) {
1013            int val= (alpSrc[i]+64)>>7;
1014            aDest[i]= av_clip_uint8(val);
1015        }
1016}
1017
1018
/**
 * vertical scale YV12 to RGB
 * Vertically filter the 16-bit intermediate planes and pack the result
 * into c->dstFormat (RGB32/BGR24/RGB555/RGB565/YUYV on the MMX paths).
 * Falls back to AltiVec or plain C when no matching MMX path exists or
 * bit-exact output was requested.
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
{
#if COMPILE_TEMPLATE_MMX
    x86_reg dummy=0;
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            /* accurate-rounding variants */
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    /* spill B/G/R so the alpha pass below can reuse the registers */
                    "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
                    "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
                    "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
                    "movq               "Y_TEMP"(%0), %%mm5         \n\t"
                    "psraw                        $3, %%mm1         \n\t"
                    "psraw                        $3, %%mm7         \n\t"
                    "packuswb                  %%mm7, %%mm1         \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)

                    YSCALEYUV2PACKEDX_END
                } else {
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha (all ones) */
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)

                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
                "add %4, %%"REG_c"                        \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)


                :: "r" (&c->redDither),
                "m" (dummy), "m" (dummy), "m" (dummy),
                "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX_ACCURATE
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3    \n\t"
                "psraw $3, %%mm4    \n\t"
                "psraw $3, %%mm1    \n\t"
                "psraw $3, %%mm7    \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        } else {
            /* standard (faster, less accurate rounding) variants */
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
                    "psraw                        $3, %%mm1         \n\t"
                    "psraw                        $3, %%mm7         \n\t"
                    "packuswb                  %%mm7, %%mm1         \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                } else {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha (all ones) */
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor                    %%mm7, %%mm7       \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
                "add                        %4, %%"REG_c"   \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                "m" (dummy), "m" (dummy), "m" (dummy),
                "r" (dest),  "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3    \n\t"
                "psraw $3, %%mm4    \n\t"
                "psraw $3, %%mm1    \n\t"
                "psraw $3, %%mm7    \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }
    }
#endif /* COMPILE_TEMPLATE_MMX */
#if COMPILE_TEMPLATE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of ff_yuv2packedX_altivec() */
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
         (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
          c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
          c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
            ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
                                   chrFilter, chrSrc, chrFilterSize,
                                   dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       alpSrc, dest, dstW, dstY);
}
1203
1204/**
1205 * vertical bilinear scale YV12 to RGB
1206 */
1207static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1208                          const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1209{
1210    int  yalpha1=4095- yalpha;
1211    int uvalpha1=4095-uvalpha;
1212    int i;
1213
1214#if COMPILE_TEMPLATE_MMX
1215    if(!(c->flags & SWS_BITEXACT)) {
1216        switch(c->dstFormat) {
1217        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1218        case PIX_FMT_RGB32:
1219            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1220#if ARCH_X86_64
1221                __asm__ volatile(
1222                    YSCALEYUV2RGB(%%r8, %5)
1223                    YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
1224                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1225                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1226                    "packuswb            %%mm7, %%mm1       \n\t"
1227                    WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1228
1229                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1230                    "a" (&c->redDither)
1231                    ,"r" (abuf0), "r" (abuf1)
1232                    : "%r8"
1233                );
1234#else
1235                *(const uint16_t **)(&c->u_temp)=abuf0;
1236                *(const uint16_t **)(&c->v_temp)=abuf1;
1237                __asm__ volatile(
1238                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1239                    "mov        %4, %%"REG_b"               \n\t"
1240                    "push %%"REG_BP"                        \n\t"
1241                    YSCALEYUV2RGB(%%REGBP, %5)
1242                    "push                   %0              \n\t"
1243                    "push                   %1              \n\t"
1244                    "mov          "U_TEMP"(%5), %0          \n\t"
1245                    "mov          "V_TEMP"(%5), %1          \n\t"
1246                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1247                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1248                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1249                    "packuswb            %%mm7, %%mm1       \n\t"
1250                    "pop                    %1              \n\t"
1251                    "pop                    %0              \n\t"
1252                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1253                    "pop %%"REG_BP"                         \n\t"
1254                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1255
1256                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1257                    "a" (&c->redDither)
1258                );
1259#endif
1260            } else {
1261                __asm__ volatile(
1262                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1263                    "mov        %4, %%"REG_b"               \n\t"
1264                    "push %%"REG_BP"                        \n\t"
1265                    YSCALEYUV2RGB(%%REGBP, %5)
1266                    "pcmpeqd %%mm7, %%mm7                   \n\t"
1267                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1268                    "pop %%"REG_BP"                         \n\t"
1269                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1270
1271                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1272                    "a" (&c->redDither)
1273                );
1274            }
1275            return;
1276        case PIX_FMT_BGR24:
1277            __asm__ volatile(
1278                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1279                "mov        %4, %%"REG_b"               \n\t"
1280                "push %%"REG_BP"                        \n\t"
1281                YSCALEYUV2RGB(%%REGBP, %5)
1282                "pxor    %%mm7, %%mm7                   \n\t"
1283                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1284                "pop %%"REG_BP"                         \n\t"
1285                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1286                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1287                "a" (&c->redDither)
1288            );
1289            return;
1290        case PIX_FMT_RGB555:
1291            __asm__ volatile(
1292                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1293                "mov        %4, %%"REG_b"               \n\t"
1294                "push %%"REG_BP"                        \n\t"
1295                YSCALEYUV2RGB(%%REGBP, %5)
1296                "pxor    %%mm7, %%mm7                   \n\t"
1297                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1298#ifdef DITHER1XBPP
1299                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1300                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1301                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1302#endif
1303
1304                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1305                "pop %%"REG_BP"                         \n\t"
1306                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1307
1308                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1309                "a" (&c->redDither)
1310            );
1311            return;
1312        case PIX_FMT_RGB565:
1313            __asm__ volatile(
1314                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1315                "mov        %4, %%"REG_b"               \n\t"
1316                "push %%"REG_BP"                        \n\t"
1317                YSCALEYUV2RGB(%%REGBP, %5)
1318                "pxor    %%mm7, %%mm7                   \n\t"
1319                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1320#ifdef DITHER1XBPP
1321                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1322                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1323                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1324#endif
1325
1326                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1327                "pop %%"REG_BP"                         \n\t"
1328                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1329                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1330                "a" (&c->redDither)
1331            );
1332            return;
1333        case PIX_FMT_YUYV422:
1334            __asm__ volatile(
1335                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1336                "mov %4, %%"REG_b"                        \n\t"
1337                "push %%"REG_BP"                        \n\t"
1338                YSCALEYUV2PACKED(%%REGBP, %5)
1339                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1340                "pop %%"REG_BP"                         \n\t"
1341                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1342                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1343                "a" (&c->redDither)
1344            );
1345            return;
1346        default: break;
1347        }
1348    }
1349#endif //COMPILE_TEMPLATE_MMX
1350    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1351}
1352
1353/**
1354 * YV12 to RGB without scaling or interpolating
1355 */
1356static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1357                          const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
1358{
1359    const int yalpha1=0;
1360    int i;
1361
1362    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1363    const int yalpha= 4096; //FIXME ...
1364
1365    if (flags&SWS_FULL_CHR_H_INT) {
1366        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1367        return;
1368    }
1369
1370#if COMPILE_TEMPLATE_MMX
1371    if(!(flags & SWS_BITEXACT)) {
1372        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1373            switch(dstFormat) {
1374            case PIX_FMT_RGB32:
1375                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1376                    __asm__ volatile(
1377                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1378                        "mov        %4, %%"REG_b"               \n\t"
1379                        "push %%"REG_BP"                        \n\t"
1380                        YSCALEYUV2RGB1(%%REGBP, %5)
1381                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
1382                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1383                        "pop %%"REG_BP"                         \n\t"
1384                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1385
1386                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1387                        "a" (&c->redDither)
1388                    );
1389                } else {
1390                    __asm__ volatile(
1391                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1392                        "mov        %4, %%"REG_b"               \n\t"
1393                        "push %%"REG_BP"                        \n\t"
1394                        YSCALEYUV2RGB1(%%REGBP, %5)
1395                        "pcmpeqd %%mm7, %%mm7                   \n\t"
1396                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1397                        "pop %%"REG_BP"                         \n\t"
1398                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1399
1400                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1401                        "a" (&c->redDither)
1402                    );
1403                }
1404                return;
1405            case PIX_FMT_BGR24:
1406                __asm__ volatile(
1407                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1408                    "mov        %4, %%"REG_b"               \n\t"
1409                    "push %%"REG_BP"                        \n\t"
1410                    YSCALEYUV2RGB1(%%REGBP, %5)
1411                    "pxor    %%mm7, %%mm7                   \n\t"
1412                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1413                    "pop %%"REG_BP"                         \n\t"
1414                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1415
1416                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1417                    "a" (&c->redDither)
1418                );
1419                return;
1420            case PIX_FMT_RGB555:
1421                __asm__ volatile(
1422                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1423                    "mov        %4, %%"REG_b"               \n\t"
1424                    "push %%"REG_BP"                        \n\t"
1425                    YSCALEYUV2RGB1(%%REGBP, %5)
1426                    "pxor    %%mm7, %%mm7                   \n\t"
1427                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1428#ifdef DITHER1XBPP
1429                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1430                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1431                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1432#endif
1433                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1434                    "pop %%"REG_BP"                         \n\t"
1435                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1436
1437                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1438                    "a" (&c->redDither)
1439                );
1440                return;
1441            case PIX_FMT_RGB565:
1442                __asm__ volatile(
1443                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1444                    "mov        %4, %%"REG_b"               \n\t"
1445                    "push %%"REG_BP"                        \n\t"
1446                    YSCALEYUV2RGB1(%%REGBP, %5)
1447                    "pxor    %%mm7, %%mm7                   \n\t"
1448                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1449#ifdef DITHER1XBPP
1450                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1451                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1452                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1453#endif
1454
1455                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1456                    "pop %%"REG_BP"                         \n\t"
1457                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1458
1459                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1460                    "a" (&c->redDither)
1461                );
1462                return;
1463            case PIX_FMT_YUYV422:
1464                __asm__ volatile(
1465                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1466                    "mov        %4, %%"REG_b"               \n\t"
1467                    "push %%"REG_BP"                        \n\t"
1468                    YSCALEYUV2PACKED1(%%REGBP, %5)
1469                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1470                    "pop %%"REG_BP"                         \n\t"
1471                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1472
1473                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1474                    "a" (&c->redDither)
1475                );
1476                return;
1477            }
1478        } else {
1479            switch(dstFormat) {
1480            case PIX_FMT_RGB32:
1481                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1482                    __asm__ volatile(
1483                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1484                        "mov        %4, %%"REG_b"               \n\t"
1485                        "push %%"REG_BP"                        \n\t"
1486                        YSCALEYUV2RGB1b(%%REGBP, %5)
1487                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
1488                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1489                        "pop %%"REG_BP"                         \n\t"
1490                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1491
1492                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1493                        "a" (&c->redDither)
1494                    );
1495                } else {
1496                    __asm__ volatile(
1497                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1498                        "mov        %4, %%"REG_b"               \n\t"
1499                        "push %%"REG_BP"                        \n\t"
1500                        YSCALEYUV2RGB1b(%%REGBP, %5)
1501                        "pcmpeqd %%mm7, %%mm7                   \n\t"
1502                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1503                        "pop %%"REG_BP"                         \n\t"
1504                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1505
1506                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1507                        "a" (&c->redDither)
1508                    );
1509                }
1510                return;
1511            case PIX_FMT_BGR24:
1512                __asm__ volatile(
1513                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1514                    "mov        %4, %%"REG_b"               \n\t"
1515                    "push %%"REG_BP"                        \n\t"
1516                    YSCALEYUV2RGB1b(%%REGBP, %5)
1517                    "pxor    %%mm7, %%mm7                   \n\t"
1518                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1519                    "pop %%"REG_BP"                         \n\t"
1520                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1521
1522                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1523                    "a" (&c->redDither)
1524                );
1525                return;
1526            case PIX_FMT_RGB555:
1527                __asm__ volatile(
1528                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1529                    "mov        %4, %%"REG_b"               \n\t"
1530                    "push %%"REG_BP"                        \n\t"
1531                    YSCALEYUV2RGB1b(%%REGBP, %5)
1532                    "pxor    %%mm7, %%mm7                   \n\t"
1533                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1534#ifdef DITHER1XBPP
1535                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1536                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1537                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1538#endif
1539                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1540                    "pop %%"REG_BP"                         \n\t"
1541                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1542
1543                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1544                    "a" (&c->redDither)
1545                );
1546                return;
1547            case PIX_FMT_RGB565:
1548                __asm__ volatile(
1549                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1550                    "mov        %4, %%"REG_b"               \n\t"
1551                    "push %%"REG_BP"                        \n\t"
1552                    YSCALEYUV2RGB1b(%%REGBP, %5)
1553                    "pxor    %%mm7, %%mm7                   \n\t"
1554                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1555#ifdef DITHER1XBPP
1556                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1557                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1558                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1559#endif
1560
1561                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1562                    "pop %%"REG_BP"                         \n\t"
1563                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1564
1565                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1566                    "a" (&c->redDither)
1567                );
1568                return;
1569            case PIX_FMT_YUYV422:
1570                __asm__ volatile(
1571                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1572                    "mov        %4, %%"REG_b"               \n\t"
1573                    "push %%"REG_BP"                        \n\t"
1574                    YSCALEYUV2PACKED1b(%%REGBP, %5)
1575                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1576                    "pop %%"REG_BP"                         \n\t"
1577                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1578
1579                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1580                    "a" (&c->redDither)
1581                );
1582                return;
1583            }
1584        }
1585    }
1586#endif /* COMPILE_TEMPLATE_MMX */
1587    if (uvalpha < 2048) {
1588        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1589    } else {
1590        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1591    }
1592}
1593
1594//FIXME yuy2* can read up to 7 samples too much
1595
/**
 * Extract the luma component from packed YUY2 (YUYV) input:
 * dst[i] = src[2*i] for i in [0, width).  "unused" is ignored.
 */
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* 8 luma samples per iteration: mask off the chroma bytes with the
     * bm01010101 constant, then pack the remaining bytes together.
     * The loop counter runs from -width up to 0. */
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "pand                %%mm2, %%mm0           \n\t"
        "pand                %%mm2, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
1620
/**
 * Extract the chroma components from packed YUY2 (YUYV) input:
 * dstU[i] = src1[4*i+1], dstV[i] = src1[4*i+3].
 * src1 and src2 must point to the same line (asserted at the end).
 */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* 4 U+V pairs per iteration: shift out the luma bytes, pack the
     * interleaved UVUV bytes, then split them into U (masked even bytes)
     * and V (shifted odd bytes) before storing 4 bytes to each plane. */
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t"
        "psrlw                  $8, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "packuswb            %%mm0, %%mm0           \n\t"
        "packuswb            %%mm1, %%mm1           \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
        "add                    $4, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
1654
/**
 * Take the high byte of each 16-bit little-endian sample from two
 * separate chroma planes: dstU[i] = src1[2*i+1], dstV[i] = src2[2*i+1].
 */
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* 8 samples per plane per iteration: shift each word right by 8 to
     * keep the high byte, then pack and store. */
    __asm__ volatile(
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
        "psrlw                  $8, %%mm0           \n\t"
        "psrlw                  $8, %%mm1           \n\t"
        "psrlw                  $8, %%mm2           \n\t"
        "psrlw                  $8, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i + 1];
        dstV[i]= src2[2*i + 1];
    }
#endif
}
1686
/* This is almost identical to the previous function, and exists only because
 * yuy2To(Y/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Extract the luma component from packed UYVY input:
 * dst[i] = src[2*i+1] for i in [0, width).  "unused" is ignored.
 */
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* 8 luma samples per iteration: shift each word right by 8 to drop
     * the chroma byte, then pack the luma bytes together. */
    __asm__ volatile(
        "mov                  %0, %%"REG_a"         \n\t"
        "1:                                         \n\t"
        "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
        "psrlw                $8, %%mm0             \n\t"
        "psrlw                $8, %%mm1             \n\t"
        "packuswb          %%mm1, %%mm0             \n\t"
        "movq              %%mm0, (%2, %%"REG_a")   \n\t"
        "add                  $8, %%"REG_a"         \n\t"
        " js                  1b                    \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
1712
/**
 * Extract the chroma components from packed UYVY input:
 * dstU[i] = src1[4*i+0], dstV[i] = src1[4*i+2].
 * src1 and src2 must point to the same line (asserted at the end).
 */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* 4 U+V pairs per iteration: mask off luma with bm01010101, pack the
     * interleaved UVUV bytes, then split them into U (masked even bytes)
     * and V (shifted odd bytes) before storing 4 bytes to each plane. */
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
        "pand                %%mm4, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "packuswb            %%mm0, %%mm0           \n\t"
        "packuswb            %%mm1, %%mm1           \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
        "add                    $4, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
1746
/**
 * Take the first byte (the MSB for big-endian data) of each 16-bit sample
 * from two separate chroma planes: dstU[i] = src1[2*i], dstV[i] = src2[2*i].
 */
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* 8 samples per plane per iteration: mask the even bytes with
     * bm01010101, then pack and store. */
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
        "pand                %%mm4, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "pand                %%mm4, %%mm2           \n\t"
        "pand                %%mm4, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i];
        dstV[i]= src2[2*i];
    }
#endif
}
1779
/**
 * De-interleave a packed chroma line into two planes:
 * dst1[i] = src[2*i+0], dst2[i] = src[2*i+1].
 * Shared helper for the NV12/NV21 wrappers below, which choose which
 * destination receives U and which receives V.
 */
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                    const uint8_t *src, long width)
{
#if COMPILE_TEMPLATE_MMX
    /* 8 samples per plane per iteration: even bytes via the bm01010101
     * mask, odd bytes via a right shift, then pack and store. */
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq                %%mm0, %%mm2           \n\t"
        "movq                %%mm1, %%mm3           \n\t"
        "pand                %%mm4, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "psrlw                  $8, %%mm2           \n\t"
        "psrlw                  $8, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "movq                %%mm2, (%3, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
#else
    int i;
    for (i = 0; i < width; i++) {
        dst1[i] = src[2*i+0];
        dst2[i] = src[2*i+1];
    }
#endif
}
1813
/**
 * NV12 chroma: the interleaved bytes are U,V,U,V,..., so the even bytes
 * go to dstU and the odd bytes to dstV.  src2 and unused are ignored.
 */
static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}
1820
/**
 * NV21 chroma: the interleaved bytes are V,U,V,U,..., so the destination
 * planes are swapped relative to NV12.  src2 and unused are ignored.
 */
static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
1827
1828#if COMPILE_TEMPLATE_MMX
/**
 * MMX conversion of packed 24-bit BGR or RGB to the Y (luma) plane.
 * srcFormat selects the coefficient set (ff_bgr24toY* vs ff_rgb24toY*);
 * the main loop consumes 12 input bytes (4 pixels) and emits 4 luma bytes
 * per iteration via pmaddwd dot products.
 */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
{

    /* Preload the per-format multiplier constants into mm5/mm6; they are
     * expected to survive into the main loop below. */
    if(srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }

    /* Main loop: widen 4 pixels to 16-bit, multiply-accumulate against
     * mm5/mm6, add the rounding/offset constant, shift down by 15 and
     * pack to 4 unsigned bytes. */
    __asm__ volatile(
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
        "mov                        %2, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "movd                    6(%0), %%mm2       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm2       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "pmaddwd                 %%mm5, %%mm0       \n\t"
        "pmaddwd                 %%mm6, %%mm1       \n\t"
        "pmaddwd                 %%mm5, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm4, %%mm0       \n\t"
        "paddd                   %%mm4, %%mm2       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "packssdw                %%mm2, %%mm0       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dst+width), "g" ((x86_reg)-width)
    : "%"REG_a
    );
}
1881
/**
 * MMX conversion of packed 24-bit BGR/RGB to the U and V chroma planes.
 * The coefficient table for the current byte order is passed as memory
 * operand %4 (ff_bgr24toUV[srcFormat == PIX_FMT_RGB24]); mm6 caches its
 * fourth quadword (offset 24).  Processes 4 pixels (12 input bytes) per
 * iteration, writing 4 bytes to each of dstU and dstV.
 */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq                    24+%4, %%mm6       \n\t"
        "mov                        %3, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        /* First two pixels: widen, then dot-product against the U
         * (%4, 8+%4) and V (16+%4, mm6) coefficient quadwords. */
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "movq                    %%mm0, %%mm2       \n\t"
        "movq                    %%mm1, %%mm3       \n\t"
        "pmaddwd                    %4, %%mm0       \n\t"
        "pmaddwd                  8+%4, %%mm1       \n\t"
        "pmaddwd                 16+%4, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"

        /* Second two pixels, same scheme. */
        "movd                    6(%0), %%mm1       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "movq                    %%mm1, %%mm4       \n\t"
        "movq                    %%mm3, %%mm5       \n\t"
        "pmaddwd                    %4, %%mm1       \n\t"
        "pmaddwd                  8+%4, %%mm3       \n\t"
        "pmaddwd                 16+%4, %%mm4       \n\t"
        "pmaddwd                 %%mm6, %%mm5       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm5, %%mm4       \n\t"

        /* Add the chroma offset, shift down by 15, and pack the four
         * results of each plane into 4 unsigned bytes. */
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
        "paddd                   %%mm3, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm3, %%mm4       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "psrad                     $15, %%mm1       \n\t"
        "psrad                     $15, %%mm4       \n\t"
        "packssdw                %%mm1, %%mm0       \n\t"
        "packssdw                %%mm4, %%mm2       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "packuswb                %%mm2, %%mm2       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "movd                %%mm2, (%2, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
    : "%"REG_a
    );
}
1939#endif
1940
1941static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1942{
1943#if COMPILE_TEMPLATE_MMX
1944    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1945#else
1946    int i;
1947    for (i=0; i<width; i++) {
1948        int b= src[i*3+0];
1949        int g= src[i*3+1];
1950        int r= src[i*3+2];
1951
1952        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1953    }
1954#endif /* COMPILE_TEMPLATE_MMX */
1955}
1956
1957static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1958{
1959#if COMPILE_TEMPLATE_MMX
1960    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1961#else
1962    int i;
1963    for (i=0; i<width; i++) {
1964        int b= src1[3*i + 0];
1965        int g= src1[3*i + 1];
1966        int r= src1[3*i + 2];
1967
1968        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1969        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1970    }
1971#endif /* COMPILE_TEMPLATE_MMX */
1972    assert(src1 == src2);
1973}
1974
1975static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1976{
1977    int i;
1978    for (i=0; i<width; i++) {
1979        int b= src1[6*i + 0] + src1[6*i + 3];
1980        int g= src1[6*i + 1] + src1[6*i + 4];
1981        int r= src1[6*i + 2] + src1[6*i + 5];
1982
1983        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1984        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1985    }
1986    assert(src1 == src2);
1987}
1988
1989static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1990{
1991#if COMPILE_TEMPLATE_MMX
1992    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1993#else
1994    int i;
1995    for (i=0; i<width; i++) {
1996        int r= src[i*3+0];
1997        int g= src[i*3+1];
1998        int b= src[i*3+2];
1999
2000        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2001    }
2002#endif
2003}
2004
2005static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2006{
2007#if COMPILE_TEMPLATE_MMX
2008    assert(src1==src2);
2009    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2010#else
2011    int i;
2012    assert(src1==src2);
2013    for (i=0; i<width; i++) {
2014        int r= src1[3*i + 0];
2015        int g= src1[3*i + 1];
2016        int b= src1[3*i + 2];
2017
2018        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2019        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2020    }
2021#endif
2022}
2023
2024static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2025{
2026    int i;
2027    assert(src1==src2);
2028    for (i=0; i<width; i++) {
2029        int r= src1[6*i + 0] + src1[6*i + 3];
2030        int g= src1[6*i + 1] + src1[6*i + 4];
2031        int b= src1[6*i + 2] + src1[6*i + 5];
2032
2033        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2034        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2035    }
2036}
2037
2038
2039// bilinear / bicubic scaling
/* Generic horizontal FIR scaler: for each output pixel i,
 *   dst[i] = min( (sum_j src[filterPos[i]+j] * filter[i*filterSize+j]) >> 7, 32767 )
 * The MMX paths get the same >>7 plus signed saturation from
 * psrad $7 / packssdw.  xInc and srcW are unused here; they are kept for
 * the shared hScale function-pointer signature. */
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
{
#if COMPILE_TEMPLATE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
        /* Two output pixels per iteration; counter runs from -2*dstW up to 0
         * and filter/filterPos/dst are pre-biased so that indexing with the
         * negative counter lands at the start of each array. */
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            /* ebx cannot be clobbered under PIC, so save/restore it by hand */
            "push            %%"REG_b"              \n\t"
#endif
            "pxor                %%mm7, %%mm7       \n\t"
            "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
            "mov             %%"REG_a", %%"REG_BP"  \n\t"
            ASMALIGN(4)
            "1:                                     \n\t"
            "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
            "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
            "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
            "movd      (%3, %%"REG_a"), %%mm0       \n\t"
            "movd      (%3, %%"REG_b"), %%mm2       \n\t"
            "punpcklbw           %%mm7, %%mm0       \n\t"
            "punpcklbw           %%mm7, %%mm2       \n\t"
            "pmaddwd             %%mm1, %%mm0       \n\t"
            "pmaddwd             %%mm2, %%mm3       \n\t"
            "movq                %%mm0, %%mm4       \n\t"
            "punpckldq           %%mm3, %%mm0       \n\t"
            "punpckhdq           %%mm3, %%mm4       \n\t"
            "paddd               %%mm4, %%mm0       \n\t"
            "psrad                  $7, %%mm0       \n\t"
            "packssdw            %%mm0, %%mm0       \n\t"
            "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
            "add                    $4, %%"REG_BP"  \n\t"
            " jnc                   1b              \n\t"

            "pop            %%"REG_BP"              \n\t"
#if defined(PIC)
            "pop             %%"REG_b"              \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8) {
        /* Same scheme as the filterSize==4 case, but two taps of 4 per
         * output pixel (filter stride 8). */
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push             %%"REG_b"             \n\t"
#endif
            "pxor                 %%mm7, %%mm7      \n\t"
            "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
            "mov              %%"REG_a", %%"REG_BP" \n\t"
            ASMALIGN(4)
            "1:                                     \n\t"
            "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
            "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
            "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
            "movd       (%3, %%"REG_a"), %%mm0      \n\t"
            "movd       (%3, %%"REG_b"), %%mm2      \n\t"
            "punpcklbw            %%mm7, %%mm0      \n\t"
            "punpcklbw            %%mm7, %%mm2      \n\t"
            "pmaddwd              %%mm1, %%mm0      \n\t"
            "pmaddwd              %%mm2, %%mm3      \n\t"

            "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
            "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
            "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
            "punpcklbw            %%mm7, %%mm4      \n\t"
            "punpcklbw            %%mm7, %%mm2      \n\t"
            "pmaddwd              %%mm1, %%mm4      \n\t"
            "pmaddwd              %%mm2, %%mm5      \n\t"
            "paddd                %%mm4, %%mm0      \n\t"
            "paddd                %%mm5, %%mm3      \n\t"
            "movq                 %%mm0, %%mm4      \n\t"
            "punpckldq            %%mm3, %%mm0      \n\t"
            "punpckhdq            %%mm3, %%mm4      \n\t"
            "paddd                %%mm4, %%mm0      \n\t"
            "psrad                   $7, %%mm0      \n\t"
            "packssdw             %%mm0, %%mm0      \n\t"
            "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
            "add                     $4, %%"REG_BP" \n\t"
            " jnc                    1b             \n\t"

            "pop             %%"REG_BP"             \n\t"
#if defined(PIC)
            "pop              %%"REG_b"             \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        /* Arbitrary filterSize: inner loop (label 2) accumulates 4 taps per
         * step until the source pointer in REG_c reaches src+filterSize. */
        const uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor                  %%mm7, %%mm7     \n\t"
            ASMALIGN(4)
            "1:                                     \n\t"
            "mov                      %2, %%"REG_c" \n\t"
            "movzwl      (%%"REG_c", %0), %%eax     \n\t"
            "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
            "mov                      %5, %%"REG_c" \n\t"
            "pxor                  %%mm4, %%mm4     \n\t"
            "pxor                  %%mm5, %%mm5     \n\t"
            "2:                                     \n\t"
            "movq                   (%1), %%mm1     \n\t"
            "movq               (%1, %6), %%mm3     \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
            "punpcklbw             %%mm7, %%mm0     \n\t"
            "punpcklbw             %%mm7, %%mm2     \n\t"
            "pmaddwd               %%mm1, %%mm0     \n\t"
            "pmaddwd               %%mm2, %%mm3     \n\t"
            "paddd                 %%mm3, %%mm5     \n\t"
            "paddd                 %%mm0, %%mm4     \n\t"
            "add                      $8, %1        \n\t"
            "add                      $4, %%"REG_c" \n\t"
            "cmp                      %4, %%"REG_c" \n\t"
            " jb                      2b            \n\t"
            "add                      %6, %1        \n\t"
            "movq                  %%mm4, %%mm0     \n\t"
            "punpckldq             %%mm5, %%mm4     \n\t"
            "punpckhdq             %%mm5, %%mm0     \n\t"
            "paddd                 %%mm0, %%mm4     \n\t"
            "psrad                    $7, %%mm4     \n\t"
            "packssdw              %%mm4, %%mm4     \n\t"
            "mov                      %3, %%"REG_a" \n\t"
            "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
            "add                      $4, %0        \n\t"
            " jnc                     1b            \n\t"

            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
            "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#if COMPILE_TEMPLATE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    /* portable scalar reference implementation */
    int i;
    for (i=0; i<dstW; i++) {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++) {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif /* COMPILE_TEMPLATE_ALTIVEC */
#endif /* COMPILE_TEMPLATE_MMX */
}
2214
//FIXME all pal and rgb srcFormats could do this conversion as well
2216//FIXME all scalers more complex than bilinear could do half of this transform
2217static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
2218{
2219    int i;
2220    for (i = 0; i < width; i++) {
2221        dst[i     ] = (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
2222        dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2223    }
2224}
2225static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
2226{
2227    int i;
2228    for (i = 0; i < width; i++) {
2229        dst[i     ] = (dst[i     ]*1799 + 4081085)>>11; //1469
2230        dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2231    }
2232}
2233static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
2234{
2235    int i;
2236    for (i = 0; i < width; i++)
2237        dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2238}
2239static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
2240{
2241    int i;
2242    for (i = 0; i < width; i++)
2243        dst[i] = (dst[i]*14071 + 33561947)>>14;
2244}
2245
/* Shared scalar-x86 interpolation core used by hyscale_fast/hcscale_fast.
 * On entry: %%edi = src[xx], %%esi = src[xx+1], %%ecx = xalpha (16.16
 * fraction, low 16 bits significant).  Leaves the interpolated value
 * (src[xx]<<16 + (src[xx+1]-src[xx])*xalpha) >> 9 in %%esi and reloads the
 * destination base pointer (operand %1) into REG_D. */
#define FAST_BILINEAR_X86 \
    "subl    %%edi, %%esi    \n\t" /*  src[xx+1] - src[xx] */                   \
    "imull   %%ecx, %%esi    \n\t" /* (src[xx+1] - src[xx])*xalpha */           \
    "shll      $16, %%edi    \n\t"                                              \
    "addl    %%edi, %%esi    \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */  \
    "mov        %1, %%"REG_D"\n\t"                                              \
    "shrl       $9, %%esi    \n\t"                                              \
/* Fast bilinear horizontal luma scale.  On x86 with MMX2 it calls the
 * runtime-generated filter code in c->lumMmx2FilterCode (when
 * canMMX2BeUsed); otherwise it runs a plain-x86 asm loop, and on non-x86 a
 * portable C loop.  Output is 7-bit-upscaled (dst values are src<<7 range). */
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
#if ARCH_X86
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    /* ebx must be preserved under PIC; spilled to memory around the asm */
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#if ARCH_X86_64

#define CALL_MMX2_FILTER_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
            "m" (mmx2FilterCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        /* patch the tail: outputs whose source index would pass srcW-1 get
         * the replicated last source pixel (in 7-bit fixed point) */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = xInc >> 16;
    uint16_t xInc_mask = xInc & 0xffff;
    //NO MMX just normal asm ...
    /* two output pixels per loop, stepping xx via add-with-carry on the
     * 16.16 fixed-point position */
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry


        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
    );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    /* portable C fallback: 16.16 fixed-point bilinear interpolation */
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
2375
2376      // *** horizontal scale Y line to temp buffer
2377static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2378                                   const int16_t *hLumFilter,
2379                                   const int16_t *hLumFilterPos, int hLumFilterSize,
2380                                   uint8_t *formatConvBuffer,
2381                                   uint32_t *pal, int isAlpha)
2382{
2383    void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2384    void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2385
2386    src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
2387
2388    if (toYV12) {
2389        toYV12(formatConvBuffer, src, srcW, pal);
2390        src= formatConvBuffer;
2391    }
2392
2393    if (!c->hyscale_fast) {
2394        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2395    } else { // fast bilinear upscale / crap downscale
2396        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2397    }
2398
2399    if (convertRange)
2400        convertRange(dst, dstWidth);
2401}
2402
/* Fast bilinear horizontal chroma scale: same scheme as hyscale_fast but
 * processes two input lines (src1 -> dst, src2 -> dst+VOFW) per call,
 * using the runtime-generated c->chrMmx2FilterCode on MMX2. */
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
#if ARCH_X86
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    /* ebx must be preserved under PIC; spilled to memory around the asm */
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov          %%"REG_b", %6         \n\t"
#endif
            "pxor             %%mm7, %%mm7      \n\t"
            "mov                 %0, %%"REG_c"  \n\t"
            "mov                 %1, %%"REG_D"  \n\t"
            "mov                 %2, %%"REG_d"  \n\t"
            "mov                 %3, %%"REG_b"  \n\t"
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            /* second plane: restart with src2, writing at dst+VOF */
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            "mov                 %5, %%"REG_c"  \n\t" // src
            "mov                 %1, %%"REG_D"  \n\t" // buf1
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %6, %%"REG_b"    \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
            "m" (mmx2FilterCode), "m" (src2)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        /* patch the tail: replicate the last source pixel (7-bit fixed
         * point) for outputs that would read past srcW-1 */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
            //printf("%d %d %d\n", dstWidth, i, srcW);
            dst[i] = src1[srcW-1]*128;
            dst[i+VOFW] = src2[srcW-1]*128;
        }
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
        x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
        uint16_t xInc_mask = xInc & 0xffff;
        /* one output pixel per loop for each of the two planes, stepping xx
         * via add-with-carry on the 16.16 fixed-point position */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
            "xorl    %%ecx, %%ecx                   \n\t" // xalpha
            ASMALIGN(4)
            "1:                                     \n\t"
            "mov        %0, %%"REG_S"               \n\t"
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"

            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"

            "addw       %4, %%cx                    \n\t" //xalpha += xInc&0xFFFF
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>16 + carry
            "add        $1, %%"REG_a"               \n\t"
            "cmp        %2, %%"REG_a"               \n\t"
            " jb        1b                          \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
which is needed to support GCC 4.0. */
#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
            :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
            :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
            "r" (src2)
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    /* portable C fallback: 16.16 fixed-point bilinear interpolation */
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
        dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
        /* slower
        dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
        dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
        */
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
2523
2524inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2525                                   int srcW, int xInc, const int16_t *hChrFilter,
2526                                   const int16_t *hChrFilterPos, int hChrFilterSize,
2527                                   uint8_t *formatConvBuffer,
2528                                   uint32_t *pal)
2529{
2530
2531    src1 += c->chrSrcOffset;
2532    src2 += c->chrSrcOffset;
2533
2534    if (c->chrToYV12) {
2535        c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2536        src1= formatConvBuffer;
2537        src2= formatConvBuffer+VOFW;
2538    }
2539
2540    if (!c->hcscale_fast) {
2541        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2542        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2543    } else { // fast bilinear upscale / crap downscale
2544        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2545    }
2546
2547    if (c->chrConvertRange)
2548        c->chrConvertRange(dst, dstWidth);
2549}
2550
/* Set DEBUG_SWSCALE_BUFFERS to 1 to log slice/line-buffer bookkeeping from
 * swScale() via av_log at AV_LOG_DEBUG level. */
#define DEBUG_SWSCALE_BUFFERS 0
#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2553
2554static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2555                           int srcSliceH, uint8_t* dst[], int dstStride[])
2556{
2557    /* load a few things into local vars to make the code more readable? and faster */
2558    const int srcW= c->srcW;
2559    const int dstW= c->dstW;
2560    const int dstH= c->dstH;
2561    const int chrDstW= c->chrDstW;
2562    const int chrSrcW= c->chrSrcW;
2563    const int lumXInc= c->lumXInc;
2564    const int chrXInc= c->chrXInc;
2565    const enum PixelFormat dstFormat= c->dstFormat;
2566    const int flags= c->flags;
2567    int16_t *vLumFilterPos= c->vLumFilterPos;
2568    int16_t *vChrFilterPos= c->vChrFilterPos;
2569    int16_t *hLumFilterPos= c->hLumFilterPos;
2570    int16_t *hChrFilterPos= c->hChrFilterPos;
2571    int16_t *vLumFilter= c->vLumFilter;
2572    int16_t *vChrFilter= c->vChrFilter;
2573    int16_t *hLumFilter= c->hLumFilter;
2574    int16_t *hChrFilter= c->hChrFilter;
2575    int32_t *lumMmxFilter= c->lumMmxFilter;
2576    int32_t *chrMmxFilter= c->chrMmxFilter;
2577    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2578    const int vLumFilterSize= c->vLumFilterSize;
2579    const int vChrFilterSize= c->vChrFilterSize;
2580    const int hLumFilterSize= c->hLumFilterSize;
2581    const int hChrFilterSize= c->hChrFilterSize;
2582    int16_t **lumPixBuf= c->lumPixBuf;
2583    int16_t **chrPixBuf= c->chrPixBuf;
2584    int16_t **alpPixBuf= c->alpPixBuf;
2585    const int vLumBufSize= c->vLumBufSize;
2586    const int vChrBufSize= c->vChrBufSize;
2587    uint8_t *formatConvBuffer= c->formatConvBuffer;
2588    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2589    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2590    int lastDstY;
2591    uint32_t *pal=c->pal_yuv;
2592
2593    /* vars which will change and which we need to store back in the context */
2594    int dstY= c->dstY;
2595    int lumBufIndex= c->lumBufIndex;
2596    int chrBufIndex= c->chrBufIndex;
2597    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    /* Packed source formats keep all their data in plane 0; mirror the
       pointer and stride into the other plane slots so the per-plane code
       below can index planes 1..3 safely. */
    if (isPacked(c->srcFormat)) {
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    /* Enlarging the chroma strides makes every read below skip
       2^vChrDrop - 1 of each 2^vChrDrop chroma input lines. */
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                   srcSliceY,    srcSliceH,    dstY,    dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0) {
        /* First slice of the frame: reset the ring-buffer bookkeeping. */
        lumBufIndex=-1;
        chrBufIndex=-1;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    /* Main loop: emit one destination line per iteration, horizontally
       scaling any not-yet-buffered source lines it needs first. */
    for (;dstY < dstH; dstY++) {
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        // First input line needed by the last luma line of this chroma group
        // (dstY rounded up to the end of its 2^chrDstVSubSample block)
        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        int enough_lines;

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);

        // Do we have enough lines in this slice to output the dstY line
        // (-((-x)>>s) is ceil(x / 2^s): chroma lines covered by the slice)
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);

        if (!enough_lines) {
            /* Clamp to what this slice provides; those lines still get
               horizontally scaled into the ring buffers, then we break
               out below and wait for the next slice. */
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
                                            lastLumSrcY, lastChrSrcY);
        }

        //Do horizontal scaling
        while(lastInLumBuf < lastLumSrcY) {
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                            hLumFilter, hLumFilterPos, hLumFilterSize,
                            formatConvBuffer,
                            pal, 0);
            /* Alpha shares the luma geometry; the trailing flag selects the
               alpha path inside hyscale. */
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                hLumFilter, hLumFilterPos, hLumFilterSize,
                                formatConvBuffer,
                                pal, 1);
            lastInLumBuf++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                               lumBufIndex,    lastInLumBuf);
        }
        while(lastInChrBuf < lastChrSrcY) {
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)

            /* needs_hcscale is 0 for gray/mono paths (set in sws_init_swScale),
               where no chroma needs to be produced. */
            if (c->needs_hcscale)
                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                hChrFilter, hChrFilterPos, hChrFilterSize,
                                formatConvBuffer,
                                pal);
            lastInChrBuf++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                               chrBufIndex,    lastInChrBuf);
        }
        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice

#if COMPILE_TEMPLATE_MMX
        /* Per-line dither tables for the RGB output paths; 555 formats get
           the coarser green dither (ff_dither8), others ff_dither4. */
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2) {
            /* Window into the ring buffers: first line the vertical filter
               needs. Indices can exceed vLumBufSize/vChrBufSize; the asserts
               further down check we stay inside the doubled allocation. */
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
#if COMPILE_TEMPLATE_MMX
            int i;
            if (flags & SWS_ACCURATE_RND) {
                /* Pack (line ptr, next line ptr, coeff pair) records in the
                   APCK_* layout expected by the accurate-rounding asm; two
                   16-bit coefficients are fused into one 32-bit value. */
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2) {
                    *(const void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                    *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                              lumMmxFilter[s*i+APCK_COEF/4  ]=
                              lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                        *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                                  alpMmxFilter[s*i+APCK_COEF/4  ]=
                                  alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                    }
                }
                for (i=0; i<vChrFilterSize; i+=2) {
                    *(const void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                    *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                              chrMmxFilter[s*i+APCK_COEF/4  ]=
                              chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            } else {
                /* Default layout: slots 0/1 hold the low/high 32 bits of the
                   line pointer, slots 2/3 the coefficient replicated into
                   both 16-bit halves for the MMX multiplies. */
                for (i=0; i<vLumFilterSize; i++) {
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                        alpMmxFilter[4*i+2]=
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                    }
                }
                for (i=0; i<vChrFilterSize; i++) {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
            /* Vertical scale + output dispatch, using the (possibly
               arch-optimized) function pointers set in sws_init_swScale. */
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                c->yuv2nv12X(c,
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    const int16_t *lumBuf = lumSrcPtr[0];
                    const int16_t *chrBuf= chrSrcPtr[0];
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                } else { //General YV12
                    c->yuv2yuvX(c,
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL,
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                } else { //general RGB
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packedX(c,
                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                       alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
            /* Last two lines: same dispatch as above but only the plain C
               output functions, to avoid the MMX tail overwrite. */
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else {
                    yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT) {
                    yuv2rgbXinC_full(c,
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                } else {
                    yuv2packedXinC(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }

    /* Destination has an alpha plane but the source provided no alpha:
       fill the written rows of the alpha plane with 255 (opaque). */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if COMPILE_TEMPLATE_MMX
    /* sfence orders the non-temporal MOVNTQ stores used above. */
    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    if (flags & SWS_CPU_CAPS_3DNOW)  __asm__ volatile("femms" :::"memory");
    else                             __asm__ volatile("emms"  :::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    /* number of destination lines actually output for this slice */
    return dstY - lastDstY;
}
2919
2920static void RENAME(sws_init_swScale)(SwsContext *c)
2921{
2922    enum PixelFormat srcFormat = c->srcFormat;
2923
2924    c->yuv2nv12X    = RENAME(yuv2nv12X   );
2925    c->yuv2yuv1     = RENAME(yuv2yuv1    );
2926    c->yuv2yuvX     = RENAME(yuv2yuvX    );
2927    c->yuv2packed1  = RENAME(yuv2packed1 );
2928    c->yuv2packed2  = RENAME(yuv2packed2 );
2929    c->yuv2packedX  = RENAME(yuv2packedX );
2930
2931    c->hScale       = RENAME(hScale      );
2932
2933#if COMPILE_TEMPLATE_MMX
2934    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2935    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
2936#else
2937    if (c->flags & SWS_FAST_BILINEAR)
2938#endif
2939    {
2940        c->hyscale_fast = RENAME(hyscale_fast);
2941        c->hcscale_fast = RENAME(hcscale_fast);
2942    }
2943
2944    c->chrToYV12 = NULL;
2945    switch(srcFormat) {
2946        case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
2947        case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
2948        case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
2949        case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
2950        case PIX_FMT_RGB8     :
2951        case PIX_FMT_BGR8     :
2952        case PIX_FMT_PAL8     :
2953        case PIX_FMT_BGR4_BYTE:
2954        case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
2955        case PIX_FMT_YUV420P16BE:
2956        case PIX_FMT_YUV422P16BE:
2957        case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
2958        case PIX_FMT_YUV420P16LE:
2959        case PIX_FMT_YUV422P16LE:
2960        case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
2961    }
2962    if (c->chrSrcHSubSample) {
2963        switch(srcFormat) {
2964        case PIX_FMT_RGB48BE:
2965        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
2966        case PIX_FMT_RGB32  :
2967        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV_half; break;
2968        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
2969        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
2970        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
2971        case PIX_FMT_BGR32  :
2972        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV_half; break;
2973        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
2974        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
2975        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
2976        }
2977    } else {
2978        switch(srcFormat) {
2979        case PIX_FMT_RGB48BE:
2980        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
2981        case PIX_FMT_RGB32  :
2982        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV; break;
2983        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
2984        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
2985        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
2986        case PIX_FMT_BGR32  :
2987        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV; break;
2988        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
2989        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
2990        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
2991        }
2992    }
2993
2994    c->lumToYV12 = NULL;
2995    c->alpToYV12 = NULL;
2996    switch (srcFormat) {
2997    case PIX_FMT_YUYV422  :
2998    case PIX_FMT_YUV420P16BE:
2999    case PIX_FMT_YUV422P16BE:
3000    case PIX_FMT_YUV444P16BE:
3001    case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
3002    case PIX_FMT_UYVY422  :
3003    case PIX_FMT_YUV420P16LE:
3004    case PIX_FMT_YUV422P16LE:
3005    case PIX_FMT_YUV444P16LE:
3006    case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
3007    case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
3008    case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY; break;
3009    case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY; break;
3010    case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
3011    case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY; break;
3012    case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY; break;
3013    case PIX_FMT_RGB8     :
3014    case PIX_FMT_BGR8     :
3015    case PIX_FMT_PAL8     :
3016    case PIX_FMT_BGR4_BYTE:
3017    case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
3018    case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
3019    case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
3020    case PIX_FMT_RGB32  :
3021    case PIX_FMT_RGB32_1: c->lumToYV12 = bgr32ToY; break;
3022    case PIX_FMT_BGR32  :
3023    case PIX_FMT_BGR32_1: c->lumToYV12 = rgb32ToY; break;
3024    case PIX_FMT_RGB48BE:
3025    case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
3026    }
3027    if (c->alpPixBuf) {
3028        switch (srcFormat) {
3029        case PIX_FMT_RGB32  :
3030        case PIX_FMT_RGB32_1:
3031        case PIX_FMT_BGR32  :
3032        case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
3033        }
3034    }
3035
3036    switch (srcFormat) {
3037    case PIX_FMT_RGB32  :
3038    case PIX_FMT_BGR32  :
3039        c->alpSrcOffset = 3;
3040        break;
3041    case PIX_FMT_RGB32_1:
3042    case PIX_FMT_BGR32_1:
3043        c->lumSrcOffset = ALT32_CORR;
3044        c->chrSrcOffset = ALT32_CORR;
3045        break;
3046    case PIX_FMT_RGB48LE:
3047        c->lumSrcOffset = 1;
3048        c->chrSrcOffset = 1;
3049        c->alpSrcOffset = 1;
3050        break;
3051    }
3052
3053    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
3054        if (c->srcRange) {
3055            c->lumConvertRange = RENAME(lumRangeFromJpeg);
3056            c->chrConvertRange = RENAME(chrRangeFromJpeg);
3057        } else {
3058            c->lumConvertRange = RENAME(lumRangeToJpeg);
3059            c->chrConvertRange = RENAME(chrRangeToJpeg);
3060        }
3061    }
3062
3063    if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
3064          srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
3065        c->needs_hcscale = 1;
3066}
3067