1/*
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
22 */
23
/* This file is a template that is included multiple times, once per CPU
 * flavor (MMX, MMX2, 3DNow!, ...).  Undo any definitions left over from a
 * previous inclusion so each pass can redefine them for the current flavor. */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE
31
/* EMMS: instruction string used to leave MMX state before FPU code runs. */
#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif
38
/* PREFETCH/PREFETCHW: read/write prefetch hint instructions.
 * 3DNow! provides prefetch/prefetchw; MMX2 (SSE) provides
 * prefetchnta/prefetcht0; otherwise expand to a harmless asm comment
 * ("# nop") so call sites need no conditionals. */
#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif
49
/* SFENCE: store fence, needed after the non-temporal stores done by MOVNTQ
 * below; expands to an asm comment when MMX2 is not available. */
#if HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif
55
/* PAVGB(a,b): packed byte average with rounding, b = avg(a,b).
 * Uses pavgb on MMX2, the equivalent pavgusb on 3DNow!.
 * Note: deliberately left undefined when neither is available (there is no
 * #else), so plain-MMX code paths must not use it. */
#if HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif HAVE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
61
/* MOVNTQ(a,b): store MMX register a to memory operand b; a non-temporal
 * (cache-bypassing) movntq on MMX2, a plain movq otherwise.
 * The two-level MOVNTQ -> REAL_MOVNTQ indirection is required so that the
 * arguments are macro-expanded (e.g. register-name macros inside b) before
 * REAL_MOVNTQ stringifies them with the # operator. */
#if HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
68
/* Pull in the AltiVec (PowerPC) template when building with AltiVec support. */
#if HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
72
/* Vertical scaling to planar YV12, fast (pmulhw) variant.
 * For each 8-pixel output group, walks the filter-tap list at offset(%0)
 * (16 bytes per entry: source-line pointer then coefficient, terminated by a
 * NULL pointer — hence the test/jnz), accumulating coeff*src into a sum that
 * starts from the rounder constant, then >>3, packs to unsigned bytes and
 * stores with MOVNTQ.
 * %0 = &c->redDither, base pointer for the context offsets (VROUNDER_OFFSET,
 *      the filter list at "offset"); %1 = dest; %2 = width (in pixels).
 * x is a byte offset added to the per-line source address.
 * Clobbers REG_a (pixel index), REG_d (tap cursor), REG_S (src pointer)
 * and mm0-mm5. */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                                 \n\t"\
    "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
    "add                                $16, %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "pmulhw                           %%mm0, %%mm2      \n\t"\
    "pmulhw                           %%mm0, %%mm5      \n\t"\
    "paddw                            %%mm2, %%mm3      \n\t"\
    "paddw                            %%mm5, %%mm4      \n\t"\
    " jnz                                1b             \n\t"\
    "psraw                               $3, %%mm3      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "packuswb                         %%mm4, %%mm3      \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
108
/* Vertical scaling to planar YV12, accurate variant.
 * Processes filter taps two at a time: interleaves words from the current and
 * the APCK_PTR2 source lines (punpcklwd/punpckhwd) and multiply-accumulates
 * with paired coefficients via pmaddwd into four 32-bit accumulators
 * (mm4-mm7), for full intermediate precision.  Loop ends when the next
 * source pointer (loaded from APCK_SIZE ahead) is NULL.  Results are >>16,
 * packed back to words, rounded (VROUNDER_OFFSET), >>3, packed to bytes and
 * stored with MOVNTQ.
 * Operands as in YSCALEYUV2YV12X: %0 = &c->redDither (context base),
 * %1 = dest, %2 = width; clobbers REG_a/REG_d/REG_S and mm0-mm7. */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) \
    "1:                                                 \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
    "movq                             %%mm0, %%mm3      \n\t"\
    "punpcklwd                        %%mm1, %%mm0      \n\t"\
    "punpckhwd                        %%mm1, %%mm3      \n\t"\
    "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm3      \n\t"\
    "paddd                            %%mm0, %%mm4      \n\t"\
    "paddd                            %%mm3, %%mm5      \n\t"\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
    "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
    "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "movq                             %%mm2, %%mm0      \n\t"\
    "punpcklwd                        %%mm3, %%mm2      \n\t"\
    "punpckhwd                        %%mm3, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm2      \n\t"\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "paddd                            %%mm2, %%mm6      \n\t"\
    "paddd                            %%mm0, %%mm7      \n\t"\
    " jnz                                1b             \n\t"\
    "psrad                              $16, %%mm4      \n\t"\
    "psrad                              $16, %%mm5      \n\t"\
    "psrad                              $16, %%mm6      \n\t"\
    "psrad                              $16, %%mm7      \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
    "packssdw                         %%mm5, %%mm4      \n\t"\
    "packssdw                         %%mm7, %%mm6      \n\t"\
    "paddw                            %%mm0, %%mm4      \n\t"\
    "paddw                            %%mm0, %%mm6      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "psraw                               $3, %%mm6      \n\t"\
    "packuswb                         %%mm6, %%mm4      \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
170
/* 1:1 vertical copy (no filtering): convert 16-bit intermediate samples to
 * 8-bit output by a plain >>7, pack and store 8 pixels per iteration.
 * %0 = src (16-bit samples), %1 = dest, %2 = loop start index in REG_a;
 * the loop runs while add $8 does not carry (jnc) — i.e. %2 is presumably a
 * negative count so indices run up to 0 (caller-supplied; confirm at call
 * site). */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
183
/* Like YSCALEYUV2YV121 but rounds instead of truncating: mm7 is built as
 * 0x0040 in every word lane (pcmpeqw -> all ones, psrlw $15 -> 1,
 * psllw $6 -> 64), added with saturation before the >>7.
 * Same operands and loop structure as YSCALEYUV2YV121. */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
    "psrlw                 $15, %%mm7     \n\t"\
    "psllw                  $6, %%mm7     \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddsw             %%mm7, %%mm0      \n\t"\
    "paddsw             %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
201
202/*
203    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205       "r" (dest), "m" (dstW),
206       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
208*/
/* Chroma half of the packed-output vertical scaler (fast variant).
 * Opens the asm block and the outer per-pixel loop (label 1), then runs the
 * inner tap loop (label 2) over the chroma filter list at
 * CHR_MMX_FILTER_OFFSET, accumulating U into mm3 and V into mm4 (V source
 * data lives VOF bytes past U).  Sums start from the rounder constant; the
 * tap list is NULL-terminated (test/jnz).  Must be paired with a
 * ..._YA body and YSCALEYUV2PACKEDX_END, which closes the asm statement. */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor                   %%"REG_a", %%"REG_a"     \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
    "movq                      %%mm3, %%mm4         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
    "add                         $16, %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm3         \n\t"\
    "paddw                     %%mm5, %%mm4         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\
232
/* Luma half of the packed-output vertical scaler (fast variant).
 * Same tap-loop structure as the _UV half, but over the filter list at
 * "offset" (LUM_MMX_FILTER_OFFSET when used via YSCALEYUV2PACKEDX;
 * parameterized so other filter lists can be scaled too), with two samples
 * per pixel step: Y1 accumulates in mm1, Y2 in mm7.
 * Reuses label 2, so it must follow a _UV body inside the same asm block. */
#define YSCALEYUV2PACKEDX_YA(offset) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm1         \n\t"\
    "movq                      %%mm1, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5         \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm1         \n\t"\
    "paddw                     %%mm5, %%mm7         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\
251
/* Convenience wrapper: chroma pass followed by the luma pass over the
 * standard luma filter list. */
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET) \
255
/* Closes the asm statement opened by YSCALEYUV2PACKEDX(_ACCURATE)_UV:
 * supplies the operand list (%0 = &c->redDither context base, %4 = dest,
 * %5 = dstW; the "dummy" operands keep operand numbering stable) and the
 * clobber list. */
#define YSCALEYUV2PACKEDX_END                 \
    :: "r" (&c->redDither),                   \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW)                \
    : "%"REG_a, "%"REG_d, "%"REG_S            \
    );
262
/* Chroma half of the packed-output vertical scaler, accurate variant.
 * Like YSCALEYUV2YV12X_ACCURATE: taps are processed in pairs with
 * punpck+pmaddwd into 32-bit accumulators (U in mm4/mm5, V in mm6/mm7),
 * then >>16, packed to words and rounded.  Because the luma pass needs the
 * MMX registers, the finished U and V words are parked in the context at
 * U_TEMP/V_TEMP for the RGB stage to reload.  NULL source pointer ends the
 * tap loop.  Must be paired with ..._ACCURATE_YA and
 * YSCALEYUV2PACKEDX_END. */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a"                       \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm4, %%mm4         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm1, %%mm0         \n\t"\
    "punpckhwd                 %%mm1, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm4         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm2         \n\t"\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm6         \n\t"\
    "paddd                     %%mm0, %%mm7         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm4         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm4         \n\t"\
    "packssdw                  %%mm7, %%mm6         \n\t"\
    "paddw                     %%mm0, %%mm4         \n\t"\
    "paddw                     %%mm0, %%mm6         \n\t"\
    "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
    "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
312
/* Luma half of the packed-output vertical scaler, accurate variant.
 * Paired-tap pmaddwd accumulation like the _UV half: Y1 in mm1/mm5, Y2 in
 * mm7/mm6; then >>16, pack, round.  Finally reloads the U/V words the _UV
 * half parked at U_TEMP/V_TEMP into mm3/mm4, leaving registers arranged for
 * the YSCALEYUV2RGBX stage (Y1=mm1, Y2=mm7, U=mm3, V=mm4).
 * Reuses label 2, so it must follow a _UV body inside the same asm block. */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\
357
/* Convenience wrapper: accurate chroma pass followed by the accurate luma
 * pass over the standard luma filter list. */
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
361
/* YUV -> RGB matrix stage for the X (multi-tap) scalers.
 * Input: Y1=mm1, Y2=mm7, U=mm3, V=mm4 (as produced by the _YA halves).
 * Subtracts the U/V and Y offsets, multiplies by the per-context conversion
 * coefficients (UB/UG/VG/VR/Y_COEFF at offsets from %0), duplicates the
 * chroma contributions per luma sample (punpcklwd/punpckhwd with self),
 * adds them to the two luma halves and packs with unsigned saturation.
 * Output: B1|B2 in mm2, G1|G2 in mm4, R1|R2 in mm5 (see inline comments). */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"\
397
/* Two-line vertical bilinear interpolation for packed YUV output.
 * First pre-shifts the chroma and luma blend coefficients in the context by
 * >>3 and stores them back (done once, before the loop at label 1).  Per
 * iteration it blends uvbuf0/uvbuf1 (%2/%3) and buf0/buf1 (%0/%1) as
 * a + (b-a)*alpha via psubw/pmulhw/paddw.  Results: U=mm3, V=mm4 (V data is
 * VOF bytes past U), Y1=mm1, Y2=mm7.  index is the loop register. */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf1[eax] >>7*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf1[eax] >>7*/\
    "psraw                $7, %%mm7     \n\t" /* buf1[eax] >>7*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
435
/* Chroma half of the two-line bilinear YUV->RGB path: blends
 * uvbuf0/uvbuf1 (%2/%3) with the chroma coefficient from the context,
 * subtracts the U/V offsets and applies the green coefficients, leaving
 * mm2=(U-128)8, mm3=ug, mm4=vg, mm5=(V-128)8 for the _YA/_COEFF halves.
 * Opens loop label 1; index is the loop register, c the context base. */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
460
/* Luma half of the two-line bilinear YUV->RGB path: blends buf0/buf1
 * (%0/%1) with the luma coefficient from the context, producing Y1 in mm1
 * and Y2 in mm7 for REAL_YSCALEYUV2RGB_COEFF. */
#define REAL_YSCALEYUV2RGB_YA(index, c) \
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
474
/* Final matrix stage of the bilinear YUV->RGB path.  Same math as
 * YSCALEYUV2RGBX but with context offsets relative to c instead of %0:
 * takes mm1=Y1, mm7=Y2, mm2/mm5=(U/V-128), mm3/mm4=ug/vg from the _UV and
 * _YA halves and produces packed B in mm2, G in mm4, R in mm5. */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
503
/* Expansion-forcing wrapper for the luma half (see MOVNTQ note above). */
#define YSCALEYUV2RGB_YA(index, c) REAL_YSCALEYUV2RGB_YA(index, c)

/* Full two-line bilinear YUV->RGB pipeline: chroma blend, luma blend,
 * matrix/pack stages in sequence. */
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c) \
    REAL_YSCALEYUV2RGB_COEFF(c)
510
/* Single-line (no vertical interpolation) variant for packed YUV output:
 * reads only uvbuf0 (%2) and buf0 (%0) and converts 16-bit intermediates to
 * the output range with >>7.  Leaves U=mm3, V=mm4, Y1=mm1, Y2=mm7. */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t" \

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
525
/* Single-line (no vertical interpolation) YUV->RGB: reads only uvbuf0 (%2)
 * and buf0 (%0), >>4 instead of blending, then the same offset/coefficient
 * matrix and packing as the bilinear path.  Output: B=mm2, G=mm4, R=mm5. */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\

#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
574
/* Packed-YUV variant that averages the chroma of two source lines
 * (uvbuf0 + uvbuf1, then >>8 via psrlw) while taking luma from buf0 only
 * (>>7).  Leaves U=mm3, V=mm4, Y1=mm1, Y2=mm7. */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
592
// do vertical chrominance interpolation: like the one-buffer RGB case but
// chroma is averaged from uvbuf0 (%2) and uvbuf1 (%3); luma still comes from
// buf0 (%0) only. Ends with packed bytes: B in mm2/mm0, R in mm5/mm6, G in mm4/mm3.
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] >> 4 */\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax+4] >> 4 */\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\

#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
646
/* Interleave byte-packed b/g/r/a registers into four 32-bit-pixel quadwords
 * and store 8 pixels to dst with MOVNTQ (non-temporal on MMX2); then advance
 * index by 8 and loop back to label "1" while index < dstw.
 * q0/q2/q3/t are scratch registers; b and r are clobbered. */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq       "#b", "#q2"     \n\t" /* B */\
    "movq       "#r", "#t"      \n\t" /* R */\
    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
670
/* Pack byte B/G/R (mm2/mm4/mm5) into 16-bit 5:6:5 pixels and store 8 pixels
 * per iteration. Expects mm7 = 0 (used as the zero half in punpck).
 * Masks: bF8 keeps 5 MSBs (B/R), bFC keeps 6 MSBs (G). */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
698
/* Pack byte B/G/R (mm2/mm4/mm5) into 16-bit 5:5:5 pixels and store 8 pixels
 * per iteration. Expects mm7 = 0. bF8 keeps the 5 MSBs of each component;
 * the extra "psrlq $1, mm5" drops red's sixth bit for the 15-bit layout. */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
727
/* Legacy 24-bit writer: expands 8 pixels to 0RGB dwords, then squeezes the
 * padding bytes out with shift/mask/or sequences to emit 24 packed bytes.
 * Kept for reference; superseded by WRITEBGR24MMX / WRITEBGR24MMX2 below. */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"
783
/* Plain-MMX 24-bit writer: builds four 0RGBRGB0 quadwords via punpckhdq,
 * then merges them into three tightly packed quadwords (24 bytes = 8 pixels).
 * Note: clobbers mm7, so the "mm7 = 0" precondition no longer holds after. */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
836
/* MMX2 24-bit writer using pshufw to replicate component bytes, then masking
 * with the ff_M24A/B/C byte-select constants and OR-merging B/G/R lanes into
 * three packed quadwords (24 bytes = 8 pixels). Clobbers mm0 and mm7. */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"
884
/* Select the fastest available 24-bit writer for the current template:
 * the pshufw-based variant on MMX2, the plain MMX variant otherwise. */
#if HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
892
/* Interleave Y (mm1/mm7), U (mm3), V (mm4) into YUYV-packed bytes and store
 * 8 pixels (16 bytes) per iteration; loops back to label "1". */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
909
910
/* Vertically scale/filter multiple source lines into one planar YV12 output
 * line (Y into dest, U/V into uDest/vDest; uDest may be NULL to skip chroma).
 * Uses the MMX YSCALEYUV2YV12X(_ACCURATE) kernels unless SWS_BITEXACT is set,
 * otherwise falls back to AltiVec or plain C. */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            if (uDest){
                /* V plane lives at offset VOF from the U plane in chrSrc. */
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }else{
            if (uDest){
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                      chrFilter, chrSrc, chrFilterSize,
                      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
}
945
/* NV12/NV21 output: no SIMD implementation in this template, so simply
 * delegate to the generic C version (chroma is interleaved into uDest). */
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
             chrFilter, chrSrc, chrFilterSize,
             dest, uDest, dstW, chrDstW, dstFormat);
}
954
/* 1:1 vertical copy of one planar YV12 line: round the 16-bit intermediate
 * samples down to 8 bits and clip. uDest == NULL skips the chroma planes.
 * MMX path runs the YSCALEYUV2YV121(_ACCURATE) kernel per plane unless
 * SWS_BITEXACT forces the C fallback. */
static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
    int i;
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        long p= uDest ? 3 : 1;   /* plane count: Y only, or Y+U+V */
        uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[3]= {dest, uDest, vDest};
        long counter[3] = {dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND){
            while(p--){
                /* src/dst are passed pointing at the END of the line; the asm
                 * kernel indexes them with a negative counter. */
                __asm__ volatile(
                    YSCALEYUV2YV121_ACCURATE
                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
                    "g" (-counter[p])
                    : "%"REG_a
                );
            }
        }else{
            while(p--){
                __asm__ volatile(
                    YSCALEYUV2YV121
                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
                    "g" (-counter[p])
                    : "%"REG_a
                );
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++)
    {
        int val= (lumSrc[i]+64)>>7;   /* round 7-bit fixed point to 8-bit */

        /* Bit 8 set means val fell outside [0,255]; clamp by sign. */
        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;   /* V plane at offset VOFW */

            if ((u|v)&256){
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
}
1017
1018
1019/**
1020 * vertical scale YV12 to RGB
1021 */
1022static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1023                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1024                                       uint8_t *dest, long dstW, long dstY)
1025{
1026#if HAVE_MMX
1027    long dummy=0;
1028    if(!(c->flags & SWS_BITEXACT)){
1029        if (c->flags & SWS_ACCURATE_RND){
1030            switch(c->dstFormat){
1031            case PIX_FMT_RGB32:
1032                YSCALEYUV2PACKEDX_ACCURATE
1033                YSCALEYUV2RGBX
1034                "pcmpeqd %%mm7, %%mm7 \n\t"
1035                WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1036
1037                YSCALEYUV2PACKEDX_END
1038                return;
1039            case PIX_FMT_BGR24:
1040                YSCALEYUV2PACKEDX_ACCURATE
1041                YSCALEYUV2RGBX
1042                "pxor %%mm7, %%mm7 \n\t"
1043                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1044                "add %4, %%"REG_c"                        \n\t"
1045                WRITEBGR24(%%REGc, %5, %%REGa)
1046
1047
1048                :: "r" (&c->redDither),
1049                "m" (dummy), "m" (dummy), "m" (dummy),
1050                "r" (dest), "m" (dstW)
1051                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1052                );
1053                return;
1054            case PIX_FMT_RGB555:
1055                YSCALEYUV2PACKEDX_ACCURATE
1056                YSCALEYUV2RGBX
1057                "pxor %%mm7, %%mm7 \n\t"
1058                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1059#ifdef DITHER1XBPP
1060                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1061                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1062                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1063#endif
1064
1065                WRITERGB15(%4, %5, %%REGa)
1066                YSCALEYUV2PACKEDX_END
1067                return;
1068            case PIX_FMT_RGB565:
1069                YSCALEYUV2PACKEDX_ACCURATE
1070                YSCALEYUV2RGBX
1071                "pxor %%mm7, %%mm7 \n\t"
1072                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1073#ifdef DITHER1XBPP
1074                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1075                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1076                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1077#endif
1078
1079                WRITERGB16(%4, %5, %%REGa)
1080                YSCALEYUV2PACKEDX_END
1081                return;
1082            case PIX_FMT_YUYV422:
1083                YSCALEYUV2PACKEDX_ACCURATE
1084                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1085
1086                "psraw $3, %%mm3    \n\t"
1087                "psraw $3, %%mm4    \n\t"
1088                "psraw $3, %%mm1    \n\t"
1089                "psraw $3, %%mm7    \n\t"
1090                WRITEYUY2(%4, %5, %%REGa)
1091                YSCALEYUV2PACKEDX_END
1092                return;
1093            }
1094        }else{
1095            switch(c->dstFormat)
1096            {
1097            case PIX_FMT_RGB32:
1098                YSCALEYUV2PACKEDX
1099                YSCALEYUV2RGBX
1100                "pcmpeqd %%mm7, %%mm7 \n\t"
1101                WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1102                YSCALEYUV2PACKEDX_END
1103                return;
1104            case PIX_FMT_BGR24:
1105                YSCALEYUV2PACKEDX
1106                YSCALEYUV2RGBX
1107                "pxor                    %%mm7, %%mm7       \n\t"
1108                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1109                "add                        %4, %%"REG_c"   \n\t"
1110                WRITEBGR24(%%REGc, %5, %%REGa)
1111
1112                :: "r" (&c->redDither),
1113                "m" (dummy), "m" (dummy), "m" (dummy),
1114                "r" (dest),  "m" (dstW)
1115                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1116                );
1117                return;
1118            case PIX_FMT_RGB555:
1119                YSCALEYUV2PACKEDX
1120                YSCALEYUV2RGBX
1121                "pxor %%mm7, %%mm7 \n\t"
1122                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1123#ifdef DITHER1XBPP
1124                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1125                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1126                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1127#endif
1128
1129                WRITERGB15(%4, %5, %%REGa)
1130                YSCALEYUV2PACKEDX_END
1131                return;
1132            case PIX_FMT_RGB565:
1133                YSCALEYUV2PACKEDX
1134                YSCALEYUV2RGBX
1135                "pxor %%mm7, %%mm7 \n\t"
1136                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1137#ifdef DITHER1XBPP
1138                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1139                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1140                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1141#endif
1142
1143                WRITERGB16(%4, %5, %%REGa)
1144                YSCALEYUV2PACKEDX_END
1145                return;
1146            case PIX_FMT_YUYV422:
1147                YSCALEYUV2PACKEDX
1148                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1149
1150                "psraw $3, %%mm3    \n\t"
1151                "psraw $3, %%mm4    \n\t"
1152                "psraw $3, %%mm1    \n\t"
1153                "psraw $3, %%mm7    \n\t"
1154                WRITEYUY2(%4, %5, %%REGa)
1155                YSCALEYUV2PACKEDX_END
1156                return;
1157            }
1158        }
1159    }
1160#endif /* HAVE_MMX */
1161#if HAVE_ALTIVEC
1162    /* The following list of supported dstFormat values should
1163       match what's found in the body of altivec_yuv2packedX() */
1164    if (!(c->flags & SWS_BITEXACT) &&
1165       (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1166        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1167        c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
1168            altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1169                                 chrFilter, chrSrc, chrFilterSize,
1170                                 dest, dstW, dstY);
1171    else
1172#endif
1173        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1174                       chrFilter, chrSrc, chrFilterSize,
1175                       dest, dstW, dstY);
1176}
1177
1178/**
1179 * vertical bilinear scale YV12 to RGB
1180 */
1181static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1182                          uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1183{
1184    int  yalpha1=4095- yalpha;
1185    int uvalpha1=4095-uvalpha;
1186    int i;
1187
1188#if HAVE_MMX
1189    if(!(c->flags & SWS_BITEXACT)){
1190        switch(c->dstFormat)
1191        {
1192            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1193            case PIX_FMT_RGB32:
1194                __asm__ volatile(
1195                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1196                "mov        %4, %%"REG_b"               \n\t"
1197                "push %%"REG_BP"                        \n\t"
1198                YSCALEYUV2RGB(%%REGBP, %5)
1199                "pcmpeqd %%mm7, %%mm7                   \n\t"
1200                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1201                "pop %%"REG_BP"                         \n\t"
1202                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1203
1204                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1205                "a" (&c->redDither)
1206                );
1207                return;
1208            case PIX_FMT_BGR24:
1209                __asm__ volatile(
1210                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1211                "mov        %4, %%"REG_b"               \n\t"
1212                "push %%"REG_BP"                        \n\t"
1213                YSCALEYUV2RGB(%%REGBP, %5)
1214                "pxor    %%mm7, %%mm7                   \n\t"
1215                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1216                "pop %%"REG_BP"                         \n\t"
1217                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1218                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1219                "a" (&c->redDither)
1220                );
1221                return;
1222            case PIX_FMT_RGB555:
1223                __asm__ volatile(
1224                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1225                "mov        %4, %%"REG_b"               \n\t"
1226                "push %%"REG_BP"                        \n\t"
1227                YSCALEYUV2RGB(%%REGBP, %5)
1228                "pxor    %%mm7, %%mm7                   \n\t"
1229                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1230#ifdef DITHER1XBPP
1231                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1232                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1233                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1234#endif
1235
1236                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1237                "pop %%"REG_BP"                         \n\t"
1238                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1239
1240                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1241                "a" (&c->redDither)
1242                );
1243                return;
1244            case PIX_FMT_RGB565:
1245                __asm__ volatile(
1246                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1247                "mov        %4, %%"REG_b"               \n\t"
1248                "push %%"REG_BP"                        \n\t"
1249                YSCALEYUV2RGB(%%REGBP, %5)
1250                "pxor    %%mm7, %%mm7                   \n\t"
1251                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1252#ifdef DITHER1XBPP
1253                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1254                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1255                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1256#endif
1257
1258                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1259                "pop %%"REG_BP"                         \n\t"
1260                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1261                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1262                "a" (&c->redDither)
1263                );
1264                return;
1265            case PIX_FMT_YUYV422:
1266                __asm__ volatile(
1267                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1268                "mov %4, %%"REG_b"                        \n\t"
1269                "push %%"REG_BP"                        \n\t"
1270                YSCALEYUV2PACKED(%%REGBP, %5)
1271                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1272                "pop %%"REG_BP"                         \n\t"
1273                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1274                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1275                "a" (&c->redDither)
1276                );
1277                return;
1278            default: break;
1279        }
1280    }
1281#endif //HAVE_MMX
1282YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1283}
1284
1285/**
1286 * YV12 to RGB without scaling or interpolating
1287 */
1288static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1289                          uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1290{
1291    const int yalpha1=0;
1292    int i;
1293
1294    uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1295    const int yalpha= 4096; //FIXME ...
1296
1297    if (flags&SWS_FULL_CHR_H_INT)
1298    {
1299        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1300        return;
1301    }
1302
1303#if HAVE_MMX
1304    if(!(flags & SWS_BITEXACT)){
1305        if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1306        {
1307            switch(dstFormat)
1308            {
1309            case PIX_FMT_RGB32:
1310                __asm__ volatile(
1311                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1312                "mov        %4, %%"REG_b"               \n\t"
1313                "push %%"REG_BP"                        \n\t"
1314                YSCALEYUV2RGB1(%%REGBP, %5)
1315                "pcmpeqd %%mm7, %%mm7                   \n\t"
1316                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1317                "pop %%"REG_BP"                         \n\t"
1318                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1319
1320                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1321                "a" (&c->redDither)
1322                );
1323                return;
1324            case PIX_FMT_BGR24:
1325                __asm__ volatile(
1326                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1327                "mov        %4, %%"REG_b"               \n\t"
1328                "push %%"REG_BP"                        \n\t"
1329                YSCALEYUV2RGB1(%%REGBP, %5)
1330                "pxor    %%mm7, %%mm7                   \n\t"
1331                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1332                "pop %%"REG_BP"                         \n\t"
1333                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1334
1335                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1336                "a" (&c->redDither)
1337                );
1338                return;
1339            case PIX_FMT_RGB555:
1340                __asm__ volatile(
1341                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1342                "mov        %4, %%"REG_b"               \n\t"
1343                "push %%"REG_BP"                        \n\t"
1344                YSCALEYUV2RGB1(%%REGBP, %5)
1345                "pxor    %%mm7, %%mm7                   \n\t"
1346                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1347#ifdef DITHER1XBPP
1348                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1349                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1350                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1351#endif
1352                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1353                "pop %%"REG_BP"                         \n\t"
1354                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1355
1356                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1357                "a" (&c->redDither)
1358                );
1359                return;
1360            case PIX_FMT_RGB565:
1361                __asm__ volatile(
1362                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1363                "mov        %4, %%"REG_b"               \n\t"
1364                "push %%"REG_BP"                        \n\t"
1365                YSCALEYUV2RGB1(%%REGBP, %5)
1366                "pxor    %%mm7, %%mm7                   \n\t"
1367                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1368#ifdef DITHER1XBPP
1369                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1370                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1371                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1372#endif
1373
1374                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1375                "pop %%"REG_BP"                         \n\t"
1376                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1377
1378                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1379                "a" (&c->redDither)
1380                );
1381                return;
1382            case PIX_FMT_YUYV422:
1383                __asm__ volatile(
1384                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1385                "mov        %4, %%"REG_b"               \n\t"
1386                "push %%"REG_BP"                        \n\t"
1387                YSCALEYUV2PACKED1(%%REGBP, %5)
1388                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1389                "pop %%"REG_BP"                         \n\t"
1390                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1391
1392                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1393                "a" (&c->redDither)
1394                );
1395                return;
1396            }
1397        }
1398        else
1399        {
1400            switch(dstFormat)
1401            {
1402            case PIX_FMT_RGB32:
1403                __asm__ volatile(
1404                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1405                "mov        %4, %%"REG_b"               \n\t"
1406                "push %%"REG_BP"                        \n\t"
1407                YSCALEYUV2RGB1b(%%REGBP, %5)
1408                "pcmpeqd %%mm7, %%mm7                   \n\t"
1409                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1410                "pop %%"REG_BP"                         \n\t"
1411                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1412
1413                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1414                "a" (&c->redDither)
1415                );
1416                return;
1417            case PIX_FMT_BGR24:
1418                __asm__ volatile(
1419                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1420                "mov        %4, %%"REG_b"               \n\t"
1421                "push %%"REG_BP"                        \n\t"
1422                YSCALEYUV2RGB1b(%%REGBP, %5)
1423                "pxor    %%mm7, %%mm7                   \n\t"
1424                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1425                "pop %%"REG_BP"                         \n\t"
1426                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1427
1428                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1429                "a" (&c->redDither)
1430                );
1431                return;
1432            case PIX_FMT_RGB555:
1433                __asm__ volatile(
1434                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1435                "mov        %4, %%"REG_b"               \n\t"
1436                "push %%"REG_BP"                        \n\t"
1437                YSCALEYUV2RGB1b(%%REGBP, %5)
1438                "pxor    %%mm7, %%mm7                   \n\t"
1439                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1440#ifdef DITHER1XBPP
1441                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1442                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1443                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1444#endif
1445                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1446                "pop %%"REG_BP"                         \n\t"
1447                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1448
1449                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1450                "a" (&c->redDither)
1451                );
1452                return;
1453            case PIX_FMT_RGB565:
1454                __asm__ volatile(
1455                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1456                "mov        %4, %%"REG_b"               \n\t"
1457                "push %%"REG_BP"                        \n\t"
1458                YSCALEYUV2RGB1b(%%REGBP, %5)
1459                "pxor    %%mm7, %%mm7                   \n\t"
1460                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1461#ifdef DITHER1XBPP
1462                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1463                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1464                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1465#endif
1466
1467                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1468                "pop %%"REG_BP"                         \n\t"
1469                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1470
1471                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1472                "a" (&c->redDither)
1473                );
1474                return;
1475            case PIX_FMT_YUYV422:
1476                __asm__ volatile(
1477                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1478                "mov        %4, %%"REG_b"               \n\t"
1479                "push %%"REG_BP"                        \n\t"
1480                YSCALEYUV2PACKED1b(%%REGBP, %5)
1481                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1482                "pop %%"REG_BP"                         \n\t"
1483                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1484
1485                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1486                "a" (&c->redDither)
1487                );
1488                return;
1489            }
1490        }
1491    }
1492#endif /* HAVE_MMX */
1493    if (uvalpha < 2048)
1494    {
1495        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1496    }else{
1497        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1498    }
1499}
1500
1501//FIXME yuy2* can read up to 7 samples too much
1502
/* Extract the luma plane from packed YUYV: keep the low (even) byte of
 * each 16-bit Y/C pair.  Writes width bytes to dst, reads 2*width from src. */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    /* src/dst are pre-biased by width and indexed with a negative counter
     * in REG_a that runs up towards zero; 8 pixels per iteration.
     * NOTE(review): processes width rounded up to 8, so it can read/write
     * up to 7 samples too much -- see the FIXME above. */
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm2           \n\t"
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
    "pand                %%mm2, %%mm0           \n\t"
    "pand                %%mm2, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t"
    "movq                %%mm0, (%2, %%"REG_a") \n\t"
    "add                    $8, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
1527
/* Deinterleave the chroma of packed YUYV (U at byte 1, V at byte 3 of
 * every 4-byte group) into separate U and V planes of width bytes each. */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    /* psrlw $8 keeps the odd (chroma) bytes; after packuswb, the second
     * psrlw isolates V (mm0) and the bm01010101 mask isolates U (mm1).
     * 4 U/V pairs per iteration, negative-counter addressing as in yuy2ToY. */
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t"
    "psrlw                  $8, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t"
    "movq                %%mm0, %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t"
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm0, %%mm0           \n\t"
    "packuswb            %%mm1, %%mm1           \n\t"
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
    "add                    $4, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    /* only a single interleaved source line is supported */
    assert(src1 == src2);
}
1562
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/* Extract the luma plane from packed UYVY: keep the high (odd) byte of
 * each 16-bit C/Y pair. */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    /* Same loop shape as yuy2ToY, but psrlw $8 selects the odd bytes
     * (luma lives at odd offsets in UYVY). */
    __asm__ volatile(
    "mov                  %0, %%"REG_a"         \n\t"
    "1:                                         \n\t"
    "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
    "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
    "psrlw                $8, %%mm0             \n\t"
    "psrlw                $8, %%mm1             \n\t"
    "packuswb          %%mm1, %%mm0             \n\t"
    "movq              %%mm0, (%2, %%"REG_a")   \n\t"
    "add                  $8, %%"REG_a"         \n\t"
    " js                  1b                    \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
1588
/* Deinterleave the chroma of packed UYVY (U at byte 0, V at byte 2 of
 * every 4-byte group) into separate U and V planes. */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    /* The bm01010101 mask keeps the even (chroma) bytes; the rest mirrors
     * yuy2ToUV: V ends up in mm0 (-> %3), U in mm1 (-> %2). */
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
    "pand                %%mm4, %%mm0           \n\t"
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t"
    "movq                %%mm0, %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t"
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm0, %%mm0           \n\t"
    "packuswb            %%mm1, %%mm1           \n\t"
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
    "add                    $4, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    /* only a single interleaved source line is supported */
    assert(src1 == src2);
}
1623
/*
 * BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)
 *
 * Expands to a converter from one row of packed RGB/BGR pixels (each
 * pixel read as one 'type' word) to 8-bit luma.  A channel value is
 * (pixel >> sh) & mask, so the shift/mask pairs describe where each
 * channel sits in the pixel word; RY/GY/BY are the (pre-shifted) luma
 * coefficients and S the final right shift.  The constant 33<<(S-1)
 * equals 16.5<<S, i.e. the +16 luma offset plus 0.5 for rounding.
 */
#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int b= (((type*)src)[i]>>shb)&maskb;\
        int g= (((type*)src)[i]>>shg)&maskg;\
        int r= (((type*)src)[i]>>shr)&maskr;\
\
        dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
    }\
}
1637
/* Instantiate the luma converters for the packed 32/16/15-bit formats.
 * The coefficient shifts compensate for channels that are not extracted
 * down to bit 0 (e.g. 16-bit red stays at bits 11-15, so RY is <<11). */
BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1644
/*
 * BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb,
 *        RU, GU, BU, RV, GV, BV, S)
 *
 * Expands to two converters from packed RGB/BGR to chroma:
 *  - name:       one U/V sample per input pixel;
 *  - name_half:  one U/V sample per *two* horizontally adjacent pixels
 *                (their channel values are summed, hence the extra >>1).
 * The constant 257<<(S-1) is 128.5<<S: the +128 chroma offset plus
 * rounding.  In the _half variant, maska is folded into the green mask
 * so that the alpha byte cancels out of (pix0+pix1-g) before b and r
 * are extracted.
 */
#define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int b= (((type*)src)[i]&maskb)>>shb;\
        int g= (((type*)src)[i]&maskg)>>shg;\
        int r= (((type*)src)[i]&maskr)>>shr;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
    }\
}\
static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int pix0= ((type*)src)[2*i+0];\
        int pix1= ((type*)src)[2*i+1];\
        int g= (pix0&(maskg|maska))+(pix1&(maskg|maska));\
        int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
        int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
        g&= maskg|(2*maskg);\
\
        g>>=shg;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
    }\
}
1677
/* Instantiate the chroma converters; maska is only non-zero for the
 * 32-bit formats, where the alpha byte must be cancelled in the _half
 * variant (see the BGR2UV macro comment). */
BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0,          0,   0x001F, 0x07E0,   0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0,          0,   0x001F, 0x03E0,   0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0,          0,   0xF800, 0x07E0,   0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0,          0,   0x7C00, 0x03E0,   0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1684
1685#if HAVE_MMX
/* MMX 24-bit RGB/BGR -> luma, 4 pixels (12 input bytes) per loop
 * iteration.  mm5/mm6 are loaded up front with the pmaddwd coefficient
 * pairs for the requested byte order; mm4 holds the rounding/offset
 * constant added before the final >>15. */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
{

    /* select BGR or RGB coefficient sets; kept in registers across the
     * following asm block (no clobber list mentions mm5/mm6) */
    if(srcFormat == PIX_FMT_BGR24){
        __asm__ volatile(
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }else{
        __asm__ volatile(
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }

    /* the 12 source bytes are split into four 4-byte loads (offsets
     * 0/2/6/8) so each pmaddwd sees one pixel's worth of channels */
    __asm__ volatile(
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
        "mov                        %2, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "movd                    6(%0), %%mm2       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm2       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "pmaddwd                 %%mm5, %%mm0       \n\t"
        "pmaddwd                 %%mm6, %%mm1       \n\t"
        "pmaddwd                 %%mm5, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm4, %%mm0       \n\t"
        "paddd                   %%mm4, %%mm2       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "packssdw                %%mm2, %%mm0       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dst+width), "g" (-width)
    : "%"REG_a
    );
}
1738
/* MMX 24-bit RGB/BGR -> chroma, 4 U and 4 V samples per loop iteration.
 * The four 8-byte pmaddwd coefficient rows come from the 32-byte
 * ff_bgr24toUV table (row picked by srcFormat) passed as memory
 * operand %4; the last row (24+%4) is cached in mm6. */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
{
    __asm__ volatile(
        "movq                    24+%4, %%mm6       \n\t"
        "mov                        %3, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "movq                    %%mm0, %%mm2       \n\t"
        "movq                    %%mm1, %%mm3       \n\t"
        "pmaddwd                    %4, %%mm0       \n\t"
        "pmaddwd                  8+%4, %%mm1       \n\t"
        "pmaddwd                 16+%4, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"

        /* second pixel pair (input offsets 6/8; 12 bytes consumed total) */
        "movd                    6(%0), %%mm1       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "movq                    %%mm1, %%mm4       \n\t"
        "movq                    %%mm3, %%mm5       \n\t"
        "pmaddwd                    %4, %%mm1       \n\t"
        "pmaddwd                  8+%4, %%mm3       \n\t"
        "pmaddwd                 16+%4, %%mm4       \n\t"
        "pmaddwd                 %%mm6, %%mm5       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm5, %%mm4       \n\t"

        /* add the chroma offset/rounding constant, scale and pack */
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
        "paddd                   %%mm3, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm3, %%mm4       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "psrad                     $15, %%mm1       \n\t"
        "psrad                     $15, %%mm4       \n\t"
        "packssdw                %%mm1, %%mm0       \n\t"
        "packssdw                %%mm4, %%mm2       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "packuswb                %%mm2, %%mm2       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "movd                %%mm2, (%2, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
    : "%"REG_a
    );
}
1796#endif
1797
1798static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1799{
1800#if HAVE_MMX
1801    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1802#else
1803    int i;
1804    for (i=0; i<width; i++)
1805    {
1806        int b= src[i*3+0];
1807        int g= src[i*3+1];
1808        int r= src[i*3+2];
1809
1810        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1811    }
1812#endif /* HAVE_MMX */
1813}
1814
1815static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1816{
1817#if HAVE_MMX
1818    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1819#else
1820    int i;
1821    for (i=0; i<width; i++)
1822    {
1823        int b= src1[3*i + 0];
1824        int g= src1[3*i + 1];
1825        int r= src1[3*i + 2];
1826
1827        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1828        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1829    }
1830#endif /* HAVE_MMX */
1831    assert(src1 == src2);
1832}
1833
1834static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1835{
1836    int i;
1837    for (i=0; i<width; i++)
1838    {
1839        int b= src1[6*i + 0] + src1[6*i + 3];
1840        int g= src1[6*i + 1] + src1[6*i + 4];
1841        int r= src1[6*i + 2] + src1[6*i + 5];
1842
1843        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1844        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1845    }
1846    assert(src1 == src2);
1847}
1848
1849static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1850{
1851#if HAVE_MMX
1852    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1853#else
1854    int i;
1855    for (i=0; i<width; i++)
1856    {
1857        int r= src[i*3+0];
1858        int g= src[i*3+1];
1859        int b= src[i*3+2];
1860
1861        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1862    }
1863#endif
1864}
1865
1866static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1867{
1868#if HAVE_MMX
1869    assert(src1==src2);
1870    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
1871#else
1872    int i;
1873    assert(src1==src2);
1874    for (i=0; i<width; i++)
1875    {
1876        int r= src1[3*i + 0];
1877        int g= src1[3*i + 1];
1878        int b= src1[3*i + 2];
1879
1880        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1881        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1882    }
1883#endif
1884}
1885
1886static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1887{
1888    int i;
1889    assert(src1==src2);
1890    for (i=0; i<width; i++)
1891    {
1892        int r= src1[6*i + 0] + src1[6*i + 3];
1893        int g= src1[6*i + 1] + src1[6*i + 4];
1894        int b= src1[6*i + 2] + src1[6*i + 5];
1895
1896        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1897        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1898    }
1899}
1900
1901
1902static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
1903{
1904    int i;
1905    for (i=0; i<width; i++)
1906    {
1907        int d= src[i];
1908
1909        dst[i]= pal[d] & 0xFF;
1910    }
1911}
1912
1913static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
1914{
1915    int i;
1916    assert(src1 == src2);
1917    for (i=0; i<width; i++)
1918    {
1919        int p= pal[src1[i]];
1920
1921        dstU[i]= p>>8;
1922        dstV[i]= p>>16;
1923    }
1924}
1925
1926static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1927{
1928    int i, j;
1929    for (i=0; i<width/8; i++){
1930        int d= ~src[i];
1931        for(j=0; j<8; j++)
1932            dst[8*i+j]= ((d>>(7-j))&1)*255;
1933    }
1934}
1935
1936static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1937{
1938    int i, j;
1939    for (i=0; i<width/8; i++){
1940        int d= src[i];
1941        for(j=0; j<8; j++)
1942            dst[8*i+j]= ((d>>(7-j))&1)*255;
1943    }
1944}
1945
1946// bilinear / bicubic scaling
1947static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
1948                                  int16_t *filter, int16_t *filterPos, long filterSize)
1949{
1950#if HAVE_MMX
1951    assert(filterSize % 4 == 0 && filterSize>0);
1952    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
1953    {
1954        long counter= -2*dstW;
1955        filter-= counter*2;
1956        filterPos-= counter/2;
1957        dst-= counter/2;
1958        __asm__ volatile(
1959#if defined(PIC)
1960        "push            %%"REG_b"              \n\t"
1961#endif
1962        "pxor                %%mm7, %%mm7       \n\t"
1963        "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
1964        "mov             %%"REG_a", %%"REG_BP"  \n\t"
1965        ASMALIGN(4)
1966        "1:                                     \n\t"
1967        "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
1968        "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
1969        "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
1970        "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
1971        "movd      (%3, %%"REG_a"), %%mm0       \n\t"
1972        "movd      (%3, %%"REG_b"), %%mm2       \n\t"
1973        "punpcklbw           %%mm7, %%mm0       \n\t"
1974        "punpcklbw           %%mm7, %%mm2       \n\t"
1975        "pmaddwd             %%mm1, %%mm0       \n\t"
1976        "pmaddwd             %%mm2, %%mm3       \n\t"
1977        "movq                %%mm0, %%mm4       \n\t"
1978        "punpckldq           %%mm3, %%mm0       \n\t"
1979        "punpckhdq           %%mm3, %%mm4       \n\t"
1980        "paddd               %%mm4, %%mm0       \n\t"
1981        "psrad                  $7, %%mm0       \n\t"
1982        "packssdw            %%mm0, %%mm0       \n\t"
1983        "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
1984        "add                    $4, %%"REG_BP"  \n\t"
1985        " jnc                   1b              \n\t"
1986
1987        "pop            %%"REG_BP"              \n\t"
1988#if defined(PIC)
1989        "pop             %%"REG_b"              \n\t"
1990#endif
1991        : "+a" (counter)
1992        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1993#if !defined(PIC)
1994        : "%"REG_b
1995#endif
1996        );
1997    }
1998    else if (filterSize==8)
1999    {
2000        long counter= -2*dstW;
2001        filter-= counter*4;
2002        filterPos-= counter/2;
2003        dst-= counter/2;
2004        __asm__ volatile(
2005#if defined(PIC)
2006        "push             %%"REG_b"             \n\t"
2007#endif
2008        "pxor                 %%mm7, %%mm7      \n\t"
2009        "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
2010        "mov              %%"REG_a", %%"REG_BP" \n\t"
2011        ASMALIGN(4)
2012        "1:                                     \n\t"
2013        "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
2014        "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
2015        "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
2016        "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
2017        "movd       (%3, %%"REG_a"), %%mm0      \n\t"
2018        "movd       (%3, %%"REG_b"), %%mm2      \n\t"
2019        "punpcklbw            %%mm7, %%mm0      \n\t"
2020        "punpcklbw            %%mm7, %%mm2      \n\t"
2021        "pmaddwd              %%mm1, %%mm0      \n\t"
2022        "pmaddwd              %%mm2, %%mm3      \n\t"
2023
2024        "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
2025        "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
2026        "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
2027        "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
2028        "punpcklbw            %%mm7, %%mm4      \n\t"
2029        "punpcklbw            %%mm7, %%mm2      \n\t"
2030        "pmaddwd              %%mm1, %%mm4      \n\t"
2031        "pmaddwd              %%mm2, %%mm5      \n\t"
2032        "paddd                %%mm4, %%mm0      \n\t"
2033        "paddd                %%mm5, %%mm3      \n\t"
2034        "movq                 %%mm0, %%mm4      \n\t"
2035        "punpckldq            %%mm3, %%mm0      \n\t"
2036        "punpckhdq            %%mm3, %%mm4      \n\t"
2037        "paddd                %%mm4, %%mm0      \n\t"
2038        "psrad                   $7, %%mm0      \n\t"
2039        "packssdw             %%mm0, %%mm0      \n\t"
2040        "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
2041        "add                     $4, %%"REG_BP" \n\t"
2042        " jnc                    1b             \n\t"
2043
2044        "pop             %%"REG_BP"             \n\t"
2045#if defined(PIC)
2046        "pop              %%"REG_b"             \n\t"
2047#endif
2048        : "+a" (counter)
2049        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2050#if !defined(PIC)
2051        : "%"REG_b
2052#endif
2053        );
2054    }
2055    else
2056    {
2057        uint8_t *offset = src+filterSize;
2058        long counter= -2*dstW;
2059        //filter-= counter*filterSize/2;
2060        filterPos-= counter/2;
2061        dst-= counter/2;
2062        __asm__ volatile(
2063        "pxor                  %%mm7, %%mm7     \n\t"
2064        ASMALIGN(4)
2065        "1:                                     \n\t"
2066        "mov                      %2, %%"REG_c" \n\t"
2067        "movzwl      (%%"REG_c", %0), %%eax     \n\t"
2068        "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
2069        "mov                      %5, %%"REG_c" \n\t"
2070        "pxor                  %%mm4, %%mm4     \n\t"
2071        "pxor                  %%mm5, %%mm5     \n\t"
2072        "2:                                     \n\t"
2073        "movq                   (%1), %%mm1     \n\t"
2074        "movq               (%1, %6), %%mm3     \n\t"
2075        "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
2076        "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
2077        "punpcklbw             %%mm7, %%mm0     \n\t"
2078        "punpcklbw             %%mm7, %%mm2     \n\t"
2079        "pmaddwd               %%mm1, %%mm0     \n\t"
2080        "pmaddwd               %%mm2, %%mm3     \n\t"
2081        "paddd                 %%mm3, %%mm5     \n\t"
2082        "paddd                 %%mm0, %%mm4     \n\t"
2083        "add                      $8, %1        \n\t"
2084        "add                      $4, %%"REG_c" \n\t"
2085        "cmp                      %4, %%"REG_c" \n\t"
2086        " jb                      2b            \n\t"
2087        "add                      %6, %1        \n\t"
2088        "movq                  %%mm4, %%mm0     \n\t"
2089        "punpckldq             %%mm5, %%mm4     \n\t"
2090        "punpckhdq             %%mm5, %%mm0     \n\t"
2091        "paddd                 %%mm0, %%mm4     \n\t"
2092        "psrad                    $7, %%mm4     \n\t"
2093        "packssdw              %%mm4, %%mm4     \n\t"
2094        "mov                      %3, %%"REG_a" \n\t"
2095        "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
2096        "add                      $4, %0        \n\t"
2097        " jnc                     1b            \n\t"
2098
2099        : "+r" (counter), "+r" (filter)
2100        : "m" (filterPos), "m" (dst), "m"(offset),
2101          "m" (src), "r" (filterSize*2)
2102        : "%"REG_a, "%"REG_c, "%"REG_d
2103        );
2104    }
2105#else
2106#if HAVE_ALTIVEC
2107    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2108#else
2109    int i;
2110    for (i=0; i<dstW; i++)
2111    {
2112        int j;
2113        int srcPos= filterPos[i];
2114        int val=0;
2115        //printf("filterPos: %d\n", filterPos[i]);
2116        for (j=0; j<filterSize; j++)
2117        {
2118            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2119            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2120        }
2121        //filter += hFilterSize;
2122        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2123        //dst[i] = val>>7;
2124    }
2125#endif /* HAVE_ALTIVEC */
2126#endif /* HAVE_MMX */
2127}
2128      // *** horizontal scale Y line to temp buffer
/*
 * Horizontally scale one luma (Y) line from 'src' (srcW 8-bit samples) into
 * 'dst' (dstWidth uint16_t samples).  Output samples carry 7 extra fractional
 * bits (values scaled by 128; cf. the "*128" edge fill below and the ">>7" in
 * hScale).
 *
 * Non-natively-readable source formats are first converted to an 8-bit luma
 * line in formatConvBuffer.  Then one of three scaling paths runs:
 *   - RENAME(hScale): generic filter-based scaler (non-fast-bilinear, or when
 *     the MMX2 path cannot be used),
 *   - the MMX2 path calling 'funnyYCode' (fast bilinear),
 *   - a plain x86-asm or C fast-bilinear fallback.
 * Finally, if source and destination ranges differ, the line is
 * range-converted in place (skipped for RGB/BGR destinations).
 */
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint32_t *pal)
{
    // Convert the input line into an 8-bit luma line in formatConvBuffer when
    // the source format is not directly usable; 'src' is then redirected there.
    if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32_1)
    {
        // Same converter as RGB32 but skip the leading alpha byte (ALT32_CORR).
        RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32_1)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        // Palettized / low-depth formats go through the palette lookup.
        RENAME(palToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_MONOBLACK)
    {
        RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_MONOWHITE)
    {
        RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }

#if HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if ARCH_X86 && CONFIG_GPL
#if HAVE_MMX2
        int i;
#if defined(PIC)
        // EBX must be preserved across the asm in PIC builds, so it is saved
        // to ebxsave manually (it cannot be listed in the clobber list).
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            // NOTE(review): funnyYCode appears to be runtime-generated scaler
            // code; it is invoked via "call *%4" below with a custom register
            // convention (mmx2Filter in REG_d, mmx2FilterPos in REG_b,
            // src in REG_c, dst in REG_D) — do not reorder the setup moves.
            __asm__ volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#if ARCH_X86_64

#define FUNNY_Y_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (funnyYCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
            );
            // Right-edge fixup: for output positions whose source position
            // reaches the last input pixel, replicate src[srcW-1] (scaled by
            // 128 to match the 7-bit fractional output format).
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif /* HAVE_MMX2 */
        // Split the 16.16 fixed-point increment for the add/adc stepping below.
        long xInc_shr16 = xInc >> 16;
        uint16_t xInc_mask = xInc & 0xffff;
        //NO MMX just normal asm ...
        __asm__ volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // 2*xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry


        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        // Portable C fast-bilinear fallback: 16.16 fixed-point source walk,
        // 7-bit interpolation weight (xalpha).
        int i;
        unsigned int xpos=0;
        for (i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
            xpos+=xInc;
        }
#endif /* ARCH_X86 */
    }

    // In-place luma range conversion when source and destination ranges differ
    // (not applied to RGB/BGR destinations).
    // NOTE(review): the constants presumably map between full (JPEG) and
    // limited (MPEG) luma ranges in the *128 fixed-point domain — confirm.
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
        int i;
        //FIXME all pal and rgb srcFormats could do this conversion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if(c->srcRange){
            for (i=0; i<dstWidth; i++)
                dst[i]= (dst[i]*14071 + 33561947)>>14;
        }else{
            for (i=0; i<dstWidth; i++)
                dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
        }
    }
}
2363
/*
 * Horizontally scale one chroma line pair: src1 (U) and src2 (V), each srcW
 * 8-bit samples, into 'dst' — U results at dst[0..dstWidth-1], V results at
 * dst[VOFW..VOFW+dstWidth-1].  Output samples carry 7 extra fractional bits
 * (values scaled by 128), matching hyscale.
 *
 * Non-natively-readable source formats are first converted into two 8-bit
 * chroma lines at formatConvBuffer and formatConvBuffer+VOFW.  Gray and
 * monochrome inputs have no chroma and return early.  Scaling then follows
 * the same three paths as hyscale (hScale / MMX2 funnyUVCode / asm or C
 * fast bilinear), followed by an optional in-place range conversion.
 */
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint32_t *pal)
{
    // Convert the input into two 8-bit chroma lines (U, then V at +VOFW) in
    // formatConvBuffer when needed; src1/src2 are then redirected there.
    // For RGB-family inputs, the *_half variants are used when the chroma
    // planes are horizontally subsampled (chrSrcHSubSample).
    if (srcFormat==PIX_FMT_YUYV422)
    {
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_UYVY422)
    {
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB32_1)
    {
        // Same as RGB32 but skip the leading alpha byte (ALT32_CORR).
        if(c->chrSrcHSubSample)
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
        else
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR32_1)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
        else
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
    {
        // No chroma in grayscale/monochrome input; leave dst untouched.
        return;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }

#if HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        // Scale U and V independently with the generic filter scaler.
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if ARCH_X86 && CONFIG_GPL
#if HAVE_MMX2
        int i;
#if defined(PIC)
        // EBX must be preserved across the asm in PIC builds, so it is saved
        // to ebxsave manually (it cannot be listed in the clobber list).
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            // NOTE(review): funnyUVCode appears to be runtime-generated scaler
            // code invoked via "call *%4" with a custom register convention.
            // It is run twice: first on src1 writing dst[0..], then registers
            // are re-seeded for src2 writing dst[VOF..] — do not reorder.
            __asm__ volatile(
#if defined(PIC)
            "mov          %%"REG_b", %6         \n\t"
#endif
            "pxor             %%mm7, %%mm7      \n\t"
            "mov                 %0, %%"REG_c"  \n\t"
            "mov                 %1, %%"REG_D"  \n\t"
            "mov                 %2, %%"REG_d"  \n\t"
            "mov                 %3, %%"REG_b"  \n\t"
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

#if ARCH_X86_64

#define FUNNY_UV_CODE \
            "movl       (%%"REG_b"), %%esi      \n\t"\
            "call               *%4             \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
            "add          %%"REG_S", %%"REG_c"  \n\t"\
            "add          %%"REG_a", %%"REG_D"  \n\t"\
            "xor          %%"REG_a", %%"REG_a"  \n\t"\

#else

#define FUNNY_UV_CODE \
            "movl       (%%"REG_b"), %%esi      \n\t"\
            "call               *%4             \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add          %%"REG_a", %%"REG_D"  \n\t"\
            "xor          %%"REG_a", %%"REG_a"  \n\t"\

#endif /* ARCH_X86_64 */

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            "mov                 %5, %%"REG_c"  \n\t" // src
            "mov                 %1, %%"REG_D"  \n\t" // buf1
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE

#if defined(PIC)
            "mov %6, %%"REG_b"    \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (funnyUVCode), "m" (src2)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
            );
            // Right-edge fixup: replicate the last input pixel of each plane
            // (scaled by 128) for outputs whose source position reaches it.
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
            {
                //printf("%d %d %d\n", dstWidth, i, srcW);
                dst[i] = src1[srcW-1]*128;
                dst[i+VOFW] = src2[srcW-1]*128;
            }
        }
        else
        {
#endif /* HAVE_MMX2 */
            // Split the 16.16 fixed-point increment for the add/adc stepping below.
            long xInc_shr16 = (long) (xInc >> 16);
            uint16_t xInc_mask = xInc & 0xffff;
            // Plain x86 asm fast bilinear: interpolates U and V with a shared
            // position (REG_d) and weight (ecx) per iteration.
            __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
            "xorl    %%ecx, %%ecx                   \n\t" // 2*xalpha
            ASMALIGN(4)
            "1:                                     \n\t"
            "mov        %0, %%"REG_S"               \n\t"
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
            "shll      $16, %%edi                   \n\t"
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
            "mov        %1, %%"REG_D"               \n\t"
            "shrl       $9, %%esi                   \n\t"
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"

            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
            "shll      $16, %%edi                   \n\t"
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
            "mov        %1, %%"REG_D"               \n\t"
            "shrl       $9, %%esi                   \n\t"
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"

            "addw       %4, %%cx                    \n\t" //2*xalpha += xInc&0xFF
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>8 + carry
            "add        $1, %%"REG_a"               \n\t"
            "cmp        %2, %%"REG_a"               \n\t"
            " jb        1b                          \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
   which is needed to support GCC 4.0. */
#if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
            :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
            :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
            "r" (src2)
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
            );
#if HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        // Portable C fast-bilinear fallback: 16.16 fixed-point source walk,
        // 7-bit interpolation weight; note the (xalpha^127) weight form.
        int i;
        unsigned int xpos=0;
        for (i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
            dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
            /* slower
            dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
            dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
            */
            xpos+=xInc;
        }
#endif /* ARCH_X86 */
    }
    // In-place chroma range conversion when source and destination ranges
    // differ (not applied to RGB/BGR destinations).
    // NOTE(review): constants presumably map between full and limited chroma
    // ranges in the *128 fixed-point domain — confirm.
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
        int i;
        //FIXME all pal and rgb srcFormats could do this conversion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if(c->srcRange){
            for (i=0; i<dstWidth; i++){
                dst[i     ]= (dst[i     ]*1799 + 4081085)>>11; //1469
                dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
            }
        }else{
            for (i=0; i<dstWidth; i++){
                dst[i     ]= (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
                dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
            }
        }
    }
}
2660
2661static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2662                           int srcSliceH, uint8_t* dst[], int dstStride[]){
2663
2664    /* load a few things into local vars to make the code more readable? and faster */
2665    const int srcW= c->srcW;
2666    const int dstW= c->dstW;
2667    const int dstH= c->dstH;
2668    const int chrDstW= c->chrDstW;
2669    const int chrSrcW= c->chrSrcW;
2670    const int lumXInc= c->lumXInc;
2671    const int chrXInc= c->chrXInc;
2672    const int dstFormat= c->dstFormat;
2673    const int srcFormat= c->srcFormat;
2674    const int flags= c->flags;
2675    const int canMMX2BeUsed= c->canMMX2BeUsed;
2676    int16_t *vLumFilterPos= c->vLumFilterPos;
2677    int16_t *vChrFilterPos= c->vChrFilterPos;
2678    int16_t *hLumFilterPos= c->hLumFilterPos;
2679    int16_t *hChrFilterPos= c->hChrFilterPos;
2680    int16_t *vLumFilter= c->vLumFilter;
2681    int16_t *vChrFilter= c->vChrFilter;
2682    int16_t *hLumFilter= c->hLumFilter;
2683    int16_t *hChrFilter= c->hChrFilter;
2684    int32_t *lumMmxFilter= c->lumMmxFilter;
2685    int32_t *chrMmxFilter= c->chrMmxFilter;
2686    const int vLumFilterSize= c->vLumFilterSize;
2687    const int vChrFilterSize= c->vChrFilterSize;
2688    const int hLumFilterSize= c->hLumFilterSize;
2689    const int hChrFilterSize= c->hChrFilterSize;
2690    int16_t **lumPixBuf= c->lumPixBuf;
2691    int16_t **chrPixBuf= c->chrPixBuf;
2692    const int vLumBufSize= c->vLumBufSize;
2693    const int vChrBufSize= c->vChrBufSize;
2694    uint8_t *funnyYCode= c->funnyYCode;
2695    uint8_t *funnyUVCode= c->funnyUVCode;
2696    uint8_t *formatConvBuffer= c->formatConvBuffer;
2697    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2698    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2699    int lastDstY;
2700    uint32_t *pal=c->pal_yuv;
2701
2702    /* vars which will change and which we need to store back in the context */
2703    int dstY= c->dstY;
2704    int lumBufIndex= c->lumBufIndex;
2705    int chrBufIndex= c->chrBufIndex;
2706    int lastInLumBuf= c->lastInLumBuf;
2707    int lastInChrBuf= c->lastInChrBuf;
2708
2709    if (isPacked(c->srcFormat)){
2710        src[0]=
2711        src[1]=
2712        src[2]= src[0];
2713        srcStride[0]=
2714        srcStride[1]=
2715        srcStride[2]= srcStride[0];
2716    }
2717    srcStride[1]<<= c->vChrDrop;
2718    srcStride[2]<<= c->vChrDrop;
2719
2720    //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2721    //       (int)dst[0], (int)dst[1], (int)dst[2]);
2722
2723#if 0 //self test FIXME move to a vfilter or something
2724    {
2725    static volatile int i=0;
2726    i++;
2727    if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2728        selfTest(src, srcStride, c->srcW, c->srcH);
2729    i--;
2730    }
2731#endif
2732
2733    //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2734    //dstStride[0],dstStride[1],dstStride[2]);
2735
2736    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2737    {
2738        static int warnedAlready=0; //FIXME move this into the context perhaps
2739        if (flags & SWS_PRINT_INFO && !warnedAlready)
2740        {
2741            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2742                   "         ->cannot do aligned memory accesses anymore\n");
2743            warnedAlready=1;
2744        }
2745    }
2746
2747    /* Note the user might start scaling the picture in the middle so this
2748       will not get executed. This is not really intended but works
2749       currently, so people might do it. */
2750    if (srcSliceY ==0){
2751        lumBufIndex=0;
        /* NOTE(review): tail of the per-image state reset — presumably entered
           when a new frame starts (condition is above this view; confirm).
           Ring-buffer indices and "last line buffered" trackers are rewound. */
        chrBufIndex=0;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    /* Remember where output starts so the return value below can report how
       many destination lines this call produced. */
    lastDstY= dstY;

    /* Main vertical loop: one destination line per iteration. Horizontally
       scaled source lines are cached in the lumPixBuf / chrPixBuf ring
       buffers; lastInLumBuf / lastInChrBuf track the last source line already
       stored there, lumBufIndex / chrBufIndex the current write slot. */
    for (;dstY < dstH; dstY++){
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

        //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
        // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
        /* The needed input window must fit inside the ring buffers. */
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        // Do we have enough lines in this slice to output the dstY line
        /* The chroma bound uses -((-a)>>b) so the shift rounds the slice end
           up instead of down (arithmetic right shift rounds toward -inf). */
        if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
        {
            //Do horizontal scaling
            /* Pull every luma source line up to lastLumSrcY through the
               horizontal scaler into the next ring-buffer slot. */
            while(lastInLumBuf < lastLumSrcY)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
                assert(lumBufIndex < 2*vLumBufSize);
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
                //printf("%d %d\n", lumBufIndex, vLumBufSize);
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
                lastInLumBuf++;
            }
            /* Same for the two chroma planes (scaled together into one
               interleaved chroma buffer line). */
            while(lastInChrBuf < lastChrSrcY)
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                assert(chrBufIndex < 2*vChrBufSize);
                assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
                //FIXME replace parameters through context struct (some at least)

                /* Chroma scaling is skipped entirely for gray formats. */
                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                    flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                    funnyUVCode, c->srcFormat, formatConvBuffer,
                                    c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        }
        else // not enough lines left in this slice -> load the rest in the buffer
        {
            /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
            firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
            lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
            vChrBufSize, vLumBufSize);*/

            //Do horizontal scaling
            /* Buffer every remaining line of this slice so the next call
               (next slice) can finish the destination line. */
            while(lastInLumBuf+1 < srcSliceY + srcSliceH)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                assert(lumBufIndex < 2*vLumBufSize);
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
                lastInLumBuf++;
            }
            while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                assert(chrBufIndex < 2*vChrBufSize);
                assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);

                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                            flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                            funnyUVCode, c->srcFormat, formatConvBuffer,
                            c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
            break; //we can't output a dstY line so let's try with the next slice
        }

#if HAVE_MMX
        /* Per-line dither constants consumed by the MMX output asm; 555
           formats get the coarser 8-level green dither, 565 the 4-level. */
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif
        /* Fast (possibly MMX) output path for all but the last two lines;
           see the matching "else" below for why the tail is handled in C. */
        if (dstY < dstH-2)
        {
            /* Window of vLumFilterSize/vChrFilterSize ring-buffer line
               pointers feeding the vertical filter for this output line. */
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#if HAVE_MMX
            int i;
        /* Pack line pointers + vertical coefficients into the layout the MMX
           asm reads. ACCURATE_RND packs two taps per APCK_SIZE entry
           (pointer pair + a 16.16 coefficient pair); the plain path stores
           one pointer (split into two 32-bit halves, 64-bit safe) and a
           duplicated 16-bit coefficient per 4-dword entry. */
        if (flags & SWS_ACCURATE_RND){
            int s= APCK_SIZE / 8;
            for (i=0; i<vLumFilterSize; i+=2){
                *(void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                *(void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                          lumMmxFilter[s*i+APCK_COEF/4  ]=
                          lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                    + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
            }
            for (i=0; i<vChrFilterSize; i+=2){
                *(void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                *(void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                          chrMmxFilter[s*i+APCK_COEF/4  ]=
                          chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                    + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
            }
        }else{
            for (i=0; i<vLumFilterSize; i++)
            {
                lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                lumMmxFilter[4*i+2]=
                lumMmxFilter[4*i+3]=
                    ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
            }
            for (i=0; i<vChrFilterSize; i++)
            {
                chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                chrMmxFilter[4*i+2]=
                chrMmxFilter[4*i+3]=
                    ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
            }
        }
#endif
            /* Dispatch on destination format: NV12/NV21, planar YUV/gray,
               then packed (RGB-like) with special cases for 1- and 2-tap
               vertical filters. */
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                RENAME(yuv2nv12X)(c,
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
                {
                    int16_t *lumBuf = lumPixBuf[0];
                    int16_t *chrBuf= chrPixBuf[0];
                    RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
                }
                else //General YV12
                {
                    RENAME(yuv2yuvX)(c,
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, uDest, vDest, dstW, chrDstW);
                }
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
                {
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                        RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                            dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                }
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
                {
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    /* Duplicate the 16-bit coefficient into both halves of a
                       32-bit word for the packed2 MMX code. */
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                        RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                            dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                }
                else //general RGB
                {
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c,
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                        RENAME(yuv2packedX)(c,
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }
                }
            }
        }
        else // hmm looks like we can't use MMX here without overwriting this array's tail
        {
            /* Last two lines: same dispatch, but plain-C (*inC) output
               functions, avoiding the MMX code's over-read past the buffer
               tail noted above. */
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                yuv2yuvXinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, vDest, dstW, chrDstW);
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT){
                    yuv2rgbXinC_full(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, dstW, dstY);
                }else{
                    yuv2packedXinC(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, dstW, dstY);
                }
            }
        }
    }

#if HAVE_MMX
    /* Flush non-temporal (MOVNTQ) stores and leave MMX state so the FPU is
       usable again by the caller. */
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    /* Number of destination lines actually written by this call. */
    return dstY - lastDstY;
}
3042