1/*
2 * AltiVec acceleration for colorspace conversion
3 *
4 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23/*
24Convert I420 YV12 to RGB in various formats,
25  it rejects images that are not in 420 formats,
26  it rejects images that don't have widths of multiples of 16,
27  it rejects images that don't have heights of multiples of 2.
28Reject defers to C simulation code.
29
30Lots of optimizations to be done here.
31
321. Need to fix saturation code. I just couldn't get it to fly with packs
33   and adds, so we currently use max/min to clip.
34
352. The inefficient use of chroma loading needs a bit of brushing up.
36
373. Analysis of pipeline stalls needs to be done. Use shark to identify
38   pipeline stalls.
39
40
41MODIFIED to calculate coeffs from currently selected color space.
42MODIFIED core to be a macro where you specify the output format.
ADDED UYVY conversion which is never called due to something in swscale.
CORRECTED algorithm selection to be strict on input formats.
45ADDED runtime detection of AltiVec.
46
47ADDED altivec_yuv2packedX vertical scl + RGB converter
48
49March 27,2004
50PERFORMANCE ANALYSIS
51
52The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
53used as test.
54The AltiVec version uses 10% of the processor or ~100Mips for D1 video
55same sequence.
56
57720 * 480 * 30  ~10MPS
58
59so we have roughly 10 clocks per pixel. This is too high, something has
60to be wrong.
61
62OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63need for vec_min.
64
65OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
66the input video frame, it was just decompressed so it probably resides in L1
67caches. However, we are creating the output video stream. This needs to use the
68DSTST instruction to optimize for the cache. We couple this with the fact that
69we are not going to be visiting the input buffer again so we mark it Least
70Recently Used. This shaves 25% of the processor cycles off.
71
72Now memcpy is the largest mips consumer in the system, probably due
73to the inefficient X11 stuff.
74
75GL libraries seem to be very slow on this machine 1.33Ghz PB running
76Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
77a versioning issue, however I have libGL.1.2.dylib for both
78machines. (We need to figure this out now.)
79
80GL2 libraries work now with patch for RGB32.
81
82NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
83
84Integrated luma prescaling adjustment for saturation/contrast/brightness
85adjustment.
86*/
87
88#include <stdio.h>
89#include <stdlib.h>
90#include <string.h>
91#include <inttypes.h>
92#include <assert.h>
93#include "config.h"
94#include "rgb2rgb.h"
95#include "swscale.h"
96#include "swscale_internal.h"
97
98#undef PROFILE_THE_BEAST
99#undef INC_SCALING
100
/* Short aliases for the 8-bit sample types used by the converters below. */
typedef signed char   sbyte;
typedef unsigned char ubyte;
103
104
105/* RGB interleaver, 16 planar pels 8-bit samples per channel in
106   homogeneous vector registers x0,x1,x2 are interleaved with the
107   following technique:
108
109      o0 = vec_mergeh (x0,x1);
110      o1 = vec_perm (o0, x2, perm_rgb_0);
111      o2 = vec_perm (o0, x2, perm_rgb_1);
112      o3 = vec_mergel (x0,x1);
113      o4 = vec_perm (o3,o2,perm_rgb_2);
114      o5 = vec_perm (o3,o2,perm_rgb_3);
115
116  perm_rgb_0:   o0(RG).h v1(B) --> o1*
117              0   1  2   3   4
118             rgbr|gbrg|brgb|rgbr
119             0010 0100 1001 0010
120             0102 3145 2673 894A
121
122  perm_rgb_1:   o0(RG).h v1(B) --> o2
123              0   1  2   3   4
124             gbrg|brgb|bbbb|bbbb
125             0100 1001 1111 1111
126             B5CD 6EF7 89AB CDEF
127
128  perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
129              0   1  2   3   4
130             gbrg|brgb|rgbr|gbrg
131             1111 1111 0010 0100
132             89AB CDEF 0182 3945
133
  perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
135              0   1  2   3   4
136             brgb|rgbr|gbrg|brgb
137             1001 0010 0100 1001
138             a67b 89cA BdCD eEFf
139
140*/
141static
142const vector unsigned char
143  perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
144                0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
145  perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
146                0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
147  perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
148                0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
149  perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
150                0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
151
/*
 * vec_merge3(x2, x1, x0, y0, y1, y2)
 *
 * Interleave three planar byte vectors into three packed output vectors.
 * x0 and x1 are pair-merged first (high half, then low half) and x2 is
 * woven in with the perm_rgb_* tables; the results are assigned to the
 * lvalues y0, y1 and y2 in that order.  The temporaries take the type of
 * the x0 argument.
 */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                         \
do {                                                          \
    __typeof__(x0) m3_pair_hi, m3_carry, m3_pair_lo;          \
                                                              \
    m3_pair_hi = vec_mergeh (x0, x1);                         \
    y0         = vec_perm (m3_pair_hi, x2, perm_rgb_0);       \
    m3_carry   = vec_perm (m3_pair_hi, x2, perm_rgb_1);       \
    m3_pair_lo = vec_mergel (x0, x1);                         \
    y1         = vec_perm (m3_pair_lo, m3_carry, perm_rgb_2); \
    y2         = vec_perm (m3_pair_lo, m3_carry, perm_rgb_3); \
} while (0)
162
/*
 * vec_mstbgr24(x0, x1, x2, ptr)
 *
 * Interleave three byte vectors with vec_merge3 and store the three
 * resulting vectors through ptr, post-incrementing ptr after each store
 * (ptr is advanced by three elements in total and must be a modifiable
 * lvalue).  The arguments are forwarded to vec_merge3 in the order
 * (x0, x1, x2), i.e. x2 becomes the first byte of each packed triple.
 *
 * The expansion is a plain do { } while (0) with no trailing semicolon so
 * the macro behaves like a single statement, including in if/else bodies.
 */
#define vec_mstbgr24(x0,x1,x2,ptr)      \
do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x0,x1,x2,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
} while (0)
171
/*
 * vec_mstrgb24(x0, x1, x2, ptr)
 *
 * Same as vec_mstbgr24 but with the channel order reversed: the arguments
 * are forwarded to vec_merge3 as (x2, x1, x0), so x0 becomes the first byte
 * of each packed triple.  Three vectors are stored through ptr, which is
 * post-incremented after each store and must be a modifiable lvalue.
 *
 * The expansion is a plain do { } while (0) with no trailing semicolon so
 * the macro behaves like a single statement, including in if/else bodies.
 */
#define vec_mstrgb24(x0,x1,x2,ptr)      \
do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
} while (0)
180
/*
 * vec_mstrgb32(T, x0, x1, x2, x3, ptr)
 *
 * Pack four planar byte vectors into 32-bit pixels and store them.
 * x0/x1 and x2/x3 are byte-merged, then the two results are merged again
 * as 16-bit units, so each output pixel is made of one byte from each of
 * x0, x1, x2, x3 in that order (e.g. "rgb0": first byte R, last byte 0).
 *
 *   T    vector type used for the temporaries and the stores
 *   ptr  modifiable pointer; four vectors are written at byte offsets
 *        0, 16, 32 and 48 from (T *)ptr, then ptr is advanced by 4
 *
 * x0..x3 are each evaluated twice (once for the high half, once for the
 * low half).  The expansion is a plain do { } while (0) with no trailing
 * semicolon so the macro behaves like a single statement.
 */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
do {                                                                          \
    T _0,_1,_2,_3;                                                            \
    _0 = vec_mergeh (x0,x1);                                                  \
    _1 = vec_mergeh (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)ptr);                                              \
    vec_st (_3, 1*16, (T *)ptr);                                              \
    _0 = vec_mergel (x0,x1);                                                  \
    _1 = vec_mergel (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)ptr);                                              \
    vec_st (_3, 3*16, (T *)ptr);                                              \
    ptr += 4;                                                                 \
} while (0)
202
203/*
204
205  | 1     0       1.4021   | | Y |
206  | 1    -0.3441 -0.7142   |x| Cb|
207  | 1     1.7718  0        | | Cr|
208
209
210  Y:      [-128 127]
211  Cb/Cr : [-128 127]
212
213  typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
214
215*/
216
217
218
219
/*
 * vec_unh(x): widen bytes 0..7 of the byte vector x to eight 16-bit
 * elements.  Each result element pairs byte 0 of an all-zero vector
 * (selector 0x10) with one byte of x (selectors 0x00..0x07), and the
 * result is cast to vector signed short.
 *
 * The whole expansion is parenthesized so it can be used safely inside
 * larger expressions.
 */
#define vec_unh(x) \
    ((vector signed short) \
        vec_perm(x,(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                         0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07})))
/*
 * vec_unl(x): widen bytes 8..15 of the byte vector x to eight 16-bit
 * elements.  Each result element pairs byte 0 of an all-zero vector
 * (selector 0x10) with one byte of x (selectors 0x08..0x0F), and the
 * result is cast to vector signed short.
 *
 * The whole expansion is parenthesized so it can be used safely inside
 * larger expressions.
 */
#define vec_unl(x) \
    ((vector signed short) \
        vec_perm(x,(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                         0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F})))
230
/* Eight copies of v as a vector signed short literal. */
#define VEC_S16_X8(v) \
    ((vector signed short){(v),(v),(v),(v),(v),(v),(v),(v)})

/* vec_clip_s16(x): clamp every 16-bit element of x to the range [16, 235]. */
#define vec_clip_s16(x) \
    vec_max (vec_min (x, VEC_S16_X8(235)), VEC_S16_X8(16))
234
/*
 * vec_packclp(x, y): clamp the signed 16-bit elements of x and y at zero
 * with vec_max, reinterpret them as unsigned shorts and narrow both into
 * one vector unsigned char with vec_packs (x supplies the first half,
 * y the second).
 *
 * The whole expansion is parenthesized so it can be used safely inside
 * larger expressions.
 */
#define vec_packclp(x,y) \
    ((vector unsigned char)vec_packs \
        ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
         (vector unsigned short)vec_max (y,((vector signed short) {0}))))
239
240//#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
241
242
243static inline void cvtyuvtoRGB (SwsContext *c,
244                                vector signed short Y, vector signed short U, vector signed short V,
245                                vector signed short *R, vector signed short *G, vector signed short *B)
246{
247    vector signed   short vx,ux,uvx;
248
249    Y = vec_mradds (Y, c->CY, c->OY);
250    U  = vec_sub (U,(vector signed short)
251                    vec_splat((vector signed short){128},0));
252    V  = vec_sub (V,(vector signed short)
253                    vec_splat((vector signed short){128},0));
254
255    //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
256    ux = vec_sl (U, c->CSHIFT);
257    *B = vec_mradds (ux, c->CBU, Y);
258
259    // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
260    vx = vec_sl (V, c->CSHIFT);
261    *R = vec_mradds (vx, c->CRV, Y);
262
263    // uvx = ((CGU*u) + (CGV*v))>>15;
264    uvx = vec_mradds (U, c->CGU, Y);
265    *G  = vec_mradds (V, c->CGV, uvx);
266}
267
268
269/*
270  ------------------------------------------------------------------------------
271  CS converters
272  ------------------------------------------------------------------------------
273*/
274
275
/*
 * DEFCSP420_CVT(name, out_pixels) defines
 *
 *     static int altivec_<name>(SwsContext *c,
 *                               unsigned char **in, int *instrides,
 *                               int srcSliceY, int srcSliceH,
 *                               unsigned char **oplanes, int *outstrides)
 *
 * which converts a slice of planar YUV with 2x2 subsampled chroma to packed
 * RGB.
 *
 *   in[0..2], instrides[0..2]  Y, U and V planes of the slice and their
 *                              strides in bytes
 *   srcSliceY                  first output row of the slice in oplanes[0]
 *   srcSliceH                  number of luma rows in the slice
 *   oplanes[0], outstrides[0]  destination image and its stride in bytes
 *   out_pixels(R,G,B,ptr)      stores 16 packed pixels at ptr and advances
 *                              ptr past them
 *
 * The generated function returns srcSliceH.
 *
 * Each outer iteration handles two luma rows that share one chroma row;
 * each inner iteration handles 16 pixels of both rows.  Only w/16 full
 * groups and h/2 row pairs are processed, so callers must reject widths
 * that are not a multiple of 16 and odd heights.
 *
 * Input rows are loaded through a vec_lvsl/vec_perm realignment, which
 * reads the two vectors ptr[0] and ptr[1] for every load.  Output stores
 * use vec_st inside out_pixels.
 * NOTE(review): presumably oplanes[0] and outstrides[0] are 16-byte
 * aligned -- confirm against the caller.
 *
 * The output row pointers are rebuilt from outstrides[0] for every row
 * pair, so destinations whose stride is larger than the packed row size
 * are written correctly.
 */
#define DEFCSP420_CVT(name,out_pixels)                                  \
static int altivec_##name (SwsContext *c,                               \
                           unsigned char **in, int *instrides,          \
                           int srcSliceY,        int srcSliceH,         \
                           unsigned char **oplanes, int *outstrides)    \
{                                                                       \
    int w = c->srcW;                                                    \
    int h = srcSliceH;                                                  \
    int i,j;                                                            \
    int instrides_scl[3];                                               \
    vector unsigned char y0,y1;                                         \
                                                                        \
    vector signed char  u,v;                                            \
                                                                        \
    vector signed short Y0,Y1,Y2,Y3;                                    \
    vector signed short U,V;                                            \
    vector signed short vx,ux,uvx;                                      \
    vector signed short vx0,ux0,uvx0;                                   \
    vector signed short vx1,ux1,uvx1;                                   \
    vector signed short R0,G0,B0;                                       \
    vector signed short R1,G1,B1;                                       \
    vector unsigned char R,G,B;                                         \
                                                                        \
    vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
    vector unsigned char align_perm;                                    \
    vector unsigned char *oute, *outo;                                  \
                                                                        \
    vector signed short                                                 \
        lCY  = c->CY,                                                   \
        lOY  = c->OY,                                                   \
        lCRV = c->CRV,                                                  \
        lCBU = c->CBU,                                                  \
        lCGU = c->CGU,                                                  \
        lCGV = c->CGV;                                                  \
                                                                        \
    vector unsigned short lCSHIFT = c->CSHIFT;                          \
                                                                        \
    ubyte *y1i   = in[0];                                               \
    ubyte *y2i   = in[0]+instrides[0];                                  \
    ubyte *ui    = in[1];                                               \
    ubyte *vi    = in[2];                                               \
                                                                        \
    instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
                                                                        \
                                                                        \
    for (i=0;i<h/2;i++) {                                               \
        /* even/odd output rows of this pair, derived from the stride */ \
        ubyte *dstrow = oplanes[0]+(srcSliceY+i*2)*outstrides[0];       \
                                                                        \
        oute = (vector unsigned char *) dstrow;                         \
        outo = (vector unsigned char *)(dstrow+outstrides[0]);          \
                                                                        \
        /* data-stream store hints for the two destination rows */      \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
                                                                        \
        for (j=0;j<w/16;j++) {                                          \
                                                                        \
            y1ivP = (vector unsigned char *)y1i;                        \
            y2ivP = (vector unsigned char *)y2i;                        \
            uivP  = (vector unsigned char *)ui;                         \
            vivP  = (vector unsigned char *)vi;                         \
                                                                        \
            /* realigning loads: 16 luma bytes per row, chroma below */ \
            align_perm = vec_lvsl (0, y1i);                             \
            y0 = (vector unsigned char)                                 \
                 vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, y2i);                             \
            y1 = (vector unsigned char)                                 \
                 vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, ui);                              \
            u = (vector signed char)                                    \
                vec_perm (uivP[0], uivP[1], align_perm);                \
                                                                        \
            align_perm = vec_lvsl (0, vi);                              \
            v = (vector signed char)                                    \
                vec_perm (vivP[0], vivP[1], align_perm);                \
                                                                        \
            /* remove the 128 chroma bias in the byte domain */         \
            u  = (vector signed char)                                   \
                 vec_sub (u,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
            v  = (vector signed char)                                   \
                 vec_sub (v,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
                                                                        \
            /* only the first 8 chroma samples are used per group */    \
            U  = vec_unpackh (u);                                       \
            V  = vec_unpackh (v);                                       \
                                                                        \
                                                                        \
            Y0 = vec_unh (y0);                                          \
            Y1 = vec_unl (y0);                                          \
            Y2 = vec_unh (y1);                                          \
            Y3 = vec_unl (y1);                                          \
                                                                        \
            Y0 = vec_mradds (Y0, lCY, lOY);                             \
            Y1 = vec_mradds (Y1, lCY, lOY);                             \
            Y2 = vec_mradds (Y2, lCY, lOY);                             \
            Y3 = vec_mradds (Y3, lCY, lOY);                             \
                                                                        \
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
            ux = vec_sl (U, lCSHIFT);                                   \
            ux = vec_mradds (ux, lCBU, (vector signed short){0});       \
            ux0  = vec_mergeh (ux,ux);                                  \
            ux1  = vec_mergel (ux,ux);                                  \
                                                                        \
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
            vx = vec_sl (V, lCSHIFT);                                   \
            vx = vec_mradds (vx, lCRV, (vector signed short){0});       \
            vx0  = vec_mergeh (vx,vx);                                  \
            vx1  = vec_mergel (vx,vx);                                  \
                                                                        \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
            uvx = vec_mradds (U, lCGU, (vector signed short){0});       \
            uvx = vec_mradds (V, lCGV, uvx);                            \
            uvx0 = vec_mergeh (uvx,uvx);                                \
            uvx1 = vec_mergel (uvx,uvx);                                \
                                                                        \
            /* first luma row of the pair */                            \
            R0 = vec_add (Y0,vx0);                                      \
            G0 = vec_add (Y0,uvx0);                                     \
            B0 = vec_add (Y0,ux0);                                      \
            R1 = vec_add (Y1,vx1);                                      \
            G1 = vec_add (Y1,uvx1);                                     \
            B1 = vec_add (Y1,ux1);                                      \
                                                                        \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
            out_pixels(R,G,B,oute);                                     \
                                                                        \
            /* second luma row, same chroma terms */                    \
            R0 = vec_add (Y2,vx0);                                      \
            G0 = vec_add (Y2,uvx0);                                     \
            B0 = vec_add (Y2,ux0);                                      \
            R1 = vec_add (Y3,vx1);                                      \
            G1 = vec_add (Y3,uvx1);                                     \
            B1 = vec_add (Y3,ux1);                                      \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
                                                                        \
            out_pixels(R,G,B,outo);                                     \
                                                                        \
            y1i  += 16;                                                 \
            y2i  += 16;                                                 \
            ui   += 8;                                                  \
            vi   += 8;                                                  \
                                                                        \
        }                                                               \
                                                                        \
        ui    += instrides_scl[1];                                      \
        vi    += instrides_scl[2];                                      \
        y1i   += instrides_scl[0];                                      \
        y2i   += instrides_scl[0];                                      \
    }                                                                   \
    return srcSliceH;                                                   \
}
439
440
/*
 * Pixel writers usable as the out_pixels argument of DEFCSP420_CVT.
 * Each takes three channel vectors (a, b, c) and an output pointer and
 * forwards to vec_mstrgb32 / vec_mstrgb24 / vec_mstbgr24 with the channels
 * in the byte order given by its name.
 *
 * out_alpha(a) is the fully opaque alpha channel: a vector of the same
 * type as a with every element set to 255.  The braced literal {255} only
 * initializes element 0, so it is broadcast with vec_splat; otherwise only
 * the first pixel of every group of 16 would get a non-zero alpha.
 */
#define out_alpha(a) ((__typeof__ (a))vec_splat((__typeof__ (a)){255},0))

#define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),out_alpha(a),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,out_alpha(a),ptr)
#define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,out_alpha(a),ptr)
#define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),out_alpha(a),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
447
448DEFCSP420_CVT (yuv2_abgr, out_abgr)
449#if 1
450DEFCSP420_CVT (yuv2_bgra, out_bgra)
451#else
452static int altivec_yuv2_bgra32 (SwsContext *c,
453                                unsigned char **in, int *instrides,
454                                int srcSliceY,        int srcSliceH,
455                                unsigned char **oplanes, int *outstrides)
456{
457    int w = c->srcW;
458    int h = srcSliceH;
459    int i,j;
460    int instrides_scl[3];
461    vector unsigned char y0,y1;
462
463    vector signed char  u,v;
464
465    vector signed short Y0,Y1,Y2,Y3;
466    vector signed short U,V;
467    vector signed short vx,ux,uvx;
468    vector signed short vx0,ux0,uvx0;
469    vector signed short vx1,ux1,uvx1;
470    vector signed short R0,G0,B0;
471    vector signed short R1,G1,B1;
472    vector unsigned char R,G,B;
473
474    vector unsigned char *uivP, *vivP;
475    vector unsigned char align_perm;
476
477    vector signed short
478        lCY  = c->CY,
479        lOY  = c->OY,
480        lCRV = c->CRV,
481        lCBU = c->CBU,
482        lCGU = c->CGU,
483        lCGV = c->CGV;
484
485    vector unsigned short lCSHIFT = c->CSHIFT;
486
487    ubyte *y1i   = in[0];
488    ubyte *y2i   = in[0]+w;
489    ubyte *ui    = in[1];
490    ubyte *vi    = in[2];
491
492    vector unsigned char *oute
493        = (vector unsigned char *)
494          (oplanes[0]+srcSliceY*outstrides[0]);
495    vector unsigned char *outo
496        = (vector unsigned char *)
497          (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
498
499
500    instrides_scl[0] = instrides[0];
501    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
502    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */
503
504
505    for (i=0;i<h/2;i++) {
506        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
507        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
508
509        for (j=0;j<w/16;j++) {
510
511            y0 = vec_ldl (0,y1i);
512            y1 = vec_ldl (0,y2i);
513            uivP = (vector unsigned char *)ui;
514            vivP = (vector unsigned char *)vi;
515
516            align_perm = vec_lvsl (0, ui);
517            u  = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
518
519            align_perm = vec_lvsl (0, vi);
520            v  = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
521            u  = (vector signed char)
522                 vec_sub (u,(vector signed char)
523                          vec_splat((vector signed char){128},0));
524
525            v  = (vector signed char)
526                 vec_sub (v, (vector signed char)
527                          vec_splat((vector signed char){128},0));
528
529            U  = vec_unpackh (u);
530            V  = vec_unpackh (v);
531
532
533            Y0 = vec_unh (y0);
534            Y1 = vec_unl (y0);
535            Y2 = vec_unh (y1);
536            Y3 = vec_unl (y1);
537
538            Y0 = vec_mradds (Y0, lCY, lOY);
539            Y1 = vec_mradds (Y1, lCY, lOY);
540            Y2 = vec_mradds (Y2, lCY, lOY);
541            Y3 = vec_mradds (Y3, lCY, lOY);
542
543            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
544            ux = vec_sl (U, lCSHIFT);
545            ux = vec_mradds (ux, lCBU, (vector signed short){0});
546            ux0  = vec_mergeh (ux,ux);
547            ux1  = vec_mergel (ux,ux);
548
549            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */
550            vx = vec_sl (V, lCSHIFT);
551            vx = vec_mradds (vx, lCRV, (vector signed short){0});
552            vx0  = vec_mergeh (vx,vx);
553            vx1  = vec_mergel (vx,vx);
554            /* uvx = ((CGU*u) + (CGV*v))>>15 */
555            uvx = vec_mradds (U, lCGU, (vector signed short){0});
556            uvx = vec_mradds (V, lCGV, uvx);
557            uvx0 = vec_mergeh (uvx,uvx);
558            uvx1 = vec_mergel (uvx,uvx);
559            R0 = vec_add (Y0,vx0);
560            G0 = vec_add (Y0,uvx0);
561            B0 = vec_add (Y0,ux0);
562            R1 = vec_add (Y1,vx1);
563            G1 = vec_add (Y1,uvx1);
564            B1 = vec_add (Y1,ux1);
565            R  = vec_packclp (R0,R1);
566            G  = vec_packclp (G0,G1);
567            B  = vec_packclp (B0,B1);
568
569            out_argb(R,G,B,oute);
570            R0 = vec_add (Y2,vx0);
571            G0 = vec_add (Y2,uvx0);
572            B0 = vec_add (Y2,ux0);
573            R1 = vec_add (Y3,vx1);
574            G1 = vec_add (Y3,uvx1);
575            B1 = vec_add (Y3,ux1);
576            R  = vec_packclp (R0,R1);
577            G  = vec_packclp (G0,G1);
578            B  = vec_packclp (B0,B1);
579
580            out_argb(R,G,B,outo);
581            y1i  += 16;
582            y2i  += 16;
583            ui   += 8;
584            vi   += 8;
585
586        }
587
588        outo  += (outstrides[0])>>4;
589        oute  += (outstrides[0])>>4;
590
591        ui    += instrides_scl[1];
592        vi    += instrides_scl[2];
593        y1i   += instrides_scl[0];
594        y2i   += instrides_scl[0];
595    }
596    return srcSliceH;
597}
598
599#endif
600
601
602DEFCSP420_CVT (yuv2_rgba, out_rgba)
603DEFCSP420_CVT (yuv2_argb, out_argb)
604DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
605DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
606
607
608// uyvy|uyvy|uyvy|uyvy
609// 0123 4567 89ab cdef
610static
611const vector unsigned char
612    demux_u = {0x10,0x00,0x10,0x00,
613               0x10,0x04,0x10,0x04,
614               0x10,0x08,0x10,0x08,
615               0x10,0x0c,0x10,0x0c},
616    demux_v = {0x10,0x02,0x10,0x02,
617               0x10,0x06,0x10,0x06,
618               0x10,0x0A,0x10,0x0A,
619               0x10,0x0E,0x10,0x0E},
620    demux_y = {0x10,0x01,0x10,0x03,
621               0x10,0x05,0x10,0x07,
622               0x10,0x09,0x10,0x0B,
623               0x10,0x0D,0x10,0x0F};
624
625/*
626  this is so I can play live CCIR raw video
627*/
628static int altivec_uyvy_rgb32 (SwsContext *c,
629                               unsigned char **in, int *instrides,
630                               int srcSliceY,        int srcSliceH,
631                               unsigned char **oplanes, int *outstrides)
632{
633    int w = c->srcW;
634    int h = srcSliceH;
635    int i,j;
636    vector unsigned char uyvy;
637    vector signed   short Y,U,V;
638    vector signed   short R0,G0,B0,R1,G1,B1;
639    vector unsigned char  R,G,B;
640    vector unsigned char *out;
641    ubyte *img;
642
643    img = in[0];
644    out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
645
646    for (i=0;i<h;i++) {
647        for (j=0;j<w/16;j++) {
648            uyvy = vec_ld (0, img);
649            U = (vector signed short)
650                vec_perm (uyvy, (vector unsigned char){0}, demux_u);
651
652            V = (vector signed short)
653                vec_perm (uyvy, (vector unsigned char){0}, demux_v);
654
655            Y = (vector signed short)
656                vec_perm (uyvy, (vector unsigned char){0}, demux_y);
657
658            cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
659
660            uyvy = vec_ld (16, img);
661            U = (vector signed short)
662                vec_perm (uyvy, (vector unsigned char){0}, demux_u);
663
664            V = (vector signed short)
665                vec_perm (uyvy, (vector unsigned char){0}, demux_v);
666
667            Y = (vector signed short)
668                vec_perm (uyvy, (vector unsigned char){0}, demux_y);
669
670            cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
671
672            R  = vec_packclp (R0,R1);
673            G  = vec_packclp (G0,G1);
674            B  = vec_packclp (B0,B1);
675
676            //      vec_mstbgr24 (R,G,B, out);
677            out_rgba (R,G,B,out);
678
679            img += 32;
680        }
681    }
682    return srcSliceH;
683}
684
685
686
687/* Ok currently the acceleration routine only supports
688   inputs of widths a multiple of 16
689   and heights a multiple 2
690
691   So we just fall back to the C codes for this.
692*/
693SwsFunc sws_yuv2rgb_init_altivec (SwsContext *c)
694{
695    if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
696        return NULL;
697
698    /*
699      and this seems not to matter too much I tried a bunch of
700      videos with abnormal widths and MPlayer crashes elsewhere.
701      mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
702      boom with X11 bad match.
703
704    */
705    if ((c->srcW & 0xf) != 0)    return NULL;
706
707    switch (c->srcFormat) {
708    case PIX_FMT_YUV410P:
709    case PIX_FMT_YUV420P:
710    /*case IMGFMT_CLPL:        ??? */
711    case PIX_FMT_GRAY8:
712    case PIX_FMT_NV12:
713    case PIX_FMT_NV21:
714        if ((c->srcH & 0x1) != 0)
715            return NULL;
716
717        switch(c->dstFormat){
718        case PIX_FMT_RGB24:
719            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
720            return altivec_yuv2_rgb24;
721        case PIX_FMT_BGR24:
722            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
723            return altivec_yuv2_bgr24;
724        case PIX_FMT_ARGB:
725            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
726            return altivec_yuv2_argb;
727        case PIX_FMT_ABGR:
728            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
729            return altivec_yuv2_abgr;
730        case PIX_FMT_RGBA:
731            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
732            return altivec_yuv2_rgba;
733        case PIX_FMT_BGRA:
734            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
735            return altivec_yuv2_bgra;
736        default: return NULL;
737        }
738        break;
739
740    case PIX_FMT_UYVY422:
741        switch(c->dstFormat){
742        case PIX_FMT_BGR32:
743            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
744            return altivec_uyvy_rgb32;
745        default: return NULL;
746        }
747        break;
748
749    }
750    return NULL;
751}
752
753void sws_yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
754{
755    union {
756        signed short tmp[8] __attribute__ ((aligned(16)));
757        vector signed short vec;
758    } buf;
759
760    buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
761    buf.tmp[1] =  -256*brightness;                                      //oy
762    buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
763    buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
764    buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
765    buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv
766
767
768    c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
769    c->CY   = vec_splat ((vector signed short)buf.vec, 0);
770    c->OY   = vec_splat ((vector signed short)buf.vec, 1);
771    c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
772    c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
773    c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
774    c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
775#if 0
776    {
777    int i;
778    char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
779    for (i=0; i<6; i++)
780        printf("%s %d ", v[i],buf.tmp[i] );
781        printf("\n");
782    }
783#endif
784    return;
785}
786
787
788void
789altivec_yuv2packedX (SwsContext *c,
790                     int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
791                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
792                     uint8_t *dest, int dstW, int dstY)
793{
794    int i,j;
795    vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
796    vector signed short R0,G0,B0,R1,G1,B1;
797
798    vector unsigned char R,G,B;
799    vector unsigned char *out,*nout;
800
801    vector signed short   RND = vec_splat_s16(1<<3);
802    vector unsigned short SCL = vec_splat_u16(4);
803    unsigned long scratch[16] __attribute__ ((aligned (16)));
804
805    vector signed short *YCoeffs, *CCoeffs;
806
807    YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
808    CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
809
810    out = (vector unsigned char *)dest;
811
812    for (i=0; i<dstW; i+=16){
813        Y0 = RND;
814        Y1 = RND;
815        /* extract 16 coeffs from lumSrc */
816        for (j=0; j<lumFilterSize; j++) {
817            X0 = vec_ld (0,  &lumSrc[j][i]);
818            X1 = vec_ld (16, &lumSrc[j][i]);
819            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
820            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
821        }
822
823        U = RND;
824        V = RND;
825        /* extract 8 coeffs from U,V */
826        for (j=0; j<chrFilterSize; j++) {
827            X  = vec_ld (0, &chrSrc[j][i/2]);
828            U  = vec_mradds (X, CCoeffs[j], U);
829            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
830            V  = vec_mradds (X, CCoeffs[j], V);
831        }
832
833        /* scale and clip signals */
834        Y0 = vec_sra (Y0, SCL);
835        Y1 = vec_sra (Y1, SCL);
836        U  = vec_sra (U,  SCL);
837        V  = vec_sra (V,  SCL);
838
839        Y0 = vec_clip_s16 (Y0);
840        Y1 = vec_clip_s16 (Y1);
841        U  = vec_clip_s16 (U);
842        V  = vec_clip_s16 (V);
843
844        /* now we have
845          Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
846          U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
847
848          Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
849          U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
850          V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
851        */
852
853        U0 = vec_mergeh (U,U);
854        V0 = vec_mergeh (V,V);
855
856        U1 = vec_mergel (U,U);
857        V1 = vec_mergel (V,V);
858
859        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
860        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
861
862        R  = vec_packclp (R0,R1);
863        G  = vec_packclp (G0,G1);
864        B  = vec_packclp (B0,B1);
865
866        switch(c->dstFormat) {
867            case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
868            case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
869            case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
870            case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
871            case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
872            case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
873            default:
874            {
875                /* If this is reached, the caller should have called yuv2packedXinC
876                   instead. */
877                static int printed_error_message;
878                if (!printed_error_message) {
879                    av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
880                           sws_format_name(c->dstFormat));
881                    printed_error_message=1;
882                }
883                return;
884            }
885        }
886    }
887
888    if (i < dstW) {
889        i -= 16;
890
891        Y0 = RND;
892        Y1 = RND;
893        /* extract 16 coeffs from lumSrc */
894        for (j=0; j<lumFilterSize; j++) {
895            X0 = vec_ld (0,  &lumSrc[j][i]);
896            X1 = vec_ld (16, &lumSrc[j][i]);
897            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
898            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
899        }
900
901        U = RND;
902        V = RND;
903        /* extract 8 coeffs from U,V */
904        for (j=0; j<chrFilterSize; j++) {
905            X  = vec_ld (0, &chrSrc[j][i/2]);
906            U  = vec_mradds (X, CCoeffs[j], U);
907            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
908            V  = vec_mradds (X, CCoeffs[j], V);
909        }
910
911        /* scale and clip signals */
912        Y0 = vec_sra (Y0, SCL);
913        Y1 = vec_sra (Y1, SCL);
914        U  = vec_sra (U,  SCL);
915        V  = vec_sra (V,  SCL);
916
917        Y0 = vec_clip_s16 (Y0);
918        Y1 = vec_clip_s16 (Y1);
919        U  = vec_clip_s16 (U);
920        V  = vec_clip_s16 (V);
921
922        /* now we have
923           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
924           U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7
925
926           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
927           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
928           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
929        */
930
931        U0 = vec_mergeh (U,U);
932        V0 = vec_mergeh (V,V);
933
934        U1 = vec_mergel (U,U);
935        V1 = vec_mergel (V,V);
936
937        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
938        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
939
940        R  = vec_packclp (R0,R1);
941        G  = vec_packclp (G0,G1);
942        B  = vec_packclp (B0,B1);
943
944        nout = (vector unsigned char *)scratch;
945        switch(c->dstFormat) {
946            case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
947            case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
948            case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
949            case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
950            case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
951            case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
952            default:
953                /* Unreachable, I think. */
954                av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
955                       sws_format_name(c->dstFormat));
956                return;
957        }
958
959        memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
960    }
961
962}
963