/*
 * AltiVec acceleration for colorspace conversion
 *
 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
Convert I420 YV12 to RGB in various formats,
  it rejects images that are not in 420 formats,
  it rejects images that don't have widths of multiples of 16,
  it rejects images that don't have heights of multiples of 2.
Reject defers to C simulation code.

Lots of optimizations to be done here.

1. Need to fix saturation code. I just couldn't get it to fly with packs
   and adds, so we currently use max/min to clip.

2. The inefficient use of chroma loading needs a bit of brushing up.

3. Analysis of pipeline stalls needs to be done. Use shark to identify
   pipeline stalls.


MODIFIED to calculate coeffs from currently selected color space.
MODIFIED core to be a macro where you specify the output format.
ADDED UYVY conversion which is never called due to some thing in swscale.
CORRECTED algorithm selection to be strict on input formats.
ADDED runtime detection of AltiVec.
46 47ADDED altivec_yuv2packedX vertical scl + RGB converter 48 49March 27,2004 50PERFORMANCE ANALYSIS 51 52The C version uses 25% of the processor or ~250Mips for D1 video rawvideo 53used as test. 54The AltiVec version uses 10% of the processor or ~100Mips for D1 video 55same sequence. 56 57720 * 480 * 30 ~10MPS 58 59so we have roughly 10 clocks per pixel. This is too high, something has 60to be wrong. 61 62OPTIMIZED clip codes to utilize vec_max and vec_packs removing the 63need for vec_min. 64 65OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have 66the input video frame, it was just decompressed so it probably resides in L1 67caches. However, we are creating the output video stream. This needs to use the 68DSTST instruction to optimize for the cache. We couple this with the fact that 69we are not going to be visiting the input buffer again so we mark it Least 70Recently Used. This shaves 25% of the processor cycles off. 71 72Now memcpy is the largest mips consumer in the system, probably due 73to the inefficient X11 stuff. 74 75GL libraries seem to be very slow on this machine 1.33Ghz PB running 76Jaguar, this is not the case for my 1Ghz PB. I thought it might be 77a versioning issue, however I have libGL.1.2.dylib for both 78machines. (We need to figure this out now.) 79 80GL2 libraries work now with patch for RGB32. 81 82NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor. 83 84Integrated luma prescaling adjustment for saturation/contrast/brightness 85adjustment. 
86*/ 87 88#include <stdio.h> 89#include <stdlib.h> 90#include <string.h> 91#include <inttypes.h> 92#include <assert.h> 93#include "config.h" 94#include "rgb2rgb.h" 95#include "swscale.h" 96#include "swscale_internal.h" 97 98#undef PROFILE_THE_BEAST 99#undef INC_SCALING 100 101typedef unsigned char ubyte; 102typedef signed char sbyte; 103 104 105/* RGB interleaver, 16 planar pels 8-bit samples per channel in 106 homogeneous vector registers x0,x1,x2 are interleaved with the 107 following technique: 108 109 o0 = vec_mergeh (x0,x1); 110 o1 = vec_perm (o0, x2, perm_rgb_0); 111 o2 = vec_perm (o0, x2, perm_rgb_1); 112 o3 = vec_mergel (x0,x1); 113 o4 = vec_perm (o3,o2,perm_rgb_2); 114 o5 = vec_perm (o3,o2,perm_rgb_3); 115 116 perm_rgb_0: o0(RG).h v1(B) --> o1* 117 0 1 2 3 4 118 rgbr|gbrg|brgb|rgbr 119 0010 0100 1001 0010 120 0102 3145 2673 894A 121 122 perm_rgb_1: o0(RG).h v1(B) --> o2 123 0 1 2 3 4 124 gbrg|brgb|bbbb|bbbb 125 0100 1001 1111 1111 126 B5CD 6EF7 89AB CDEF 127 128 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4* 129 0 1 2 3 4 130 gbrg|brgb|rgbr|gbrg 131 1111 1111 0010 0100 132 89AB CDEF 0182 3945 133 134 perm_rgb_2: o3(RG).l o2(rgbB.l) ---> o5* 135 0 1 2 3 4 136 brgb|rgbr|gbrg|brgb 137 1001 0010 0100 1001 138 a67b 89cA BdCD eEFf 139 140*/ 141static 142const vector unsigned char 143 perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05, 144 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a}, 145 perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17, 146 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f}, 147 perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, 148 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05}, 149 perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a, 150 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f}; 151 152#define vec_merge3(x2,x1,x0,y0,y1,y2) \ 153do { \ 154 __typeof__(x0) o0,o2,o3; \ 155 o0 = vec_mergeh (x0,x1); \ 156 y0 = vec_perm (o0, x2, perm_rgb_0); \ 157 o2 = vec_perm (o0, x2, perm_rgb_1); \ 158 o3 = vec_mergel (x0,x1); \ 159 y1 = vec_perm (o3,o2,perm_rgb_2); \ 160 y2 = 
vec_perm (o3,o2,perm_rgb_3); \ 161} while(0) 162 163#define vec_mstbgr24(x0,x1,x2,ptr) \ 164do { \ 165 __typeof__(x0) _0,_1,_2; \ 166 vec_merge3 (x0,x1,x2,_0,_1,_2); \ 167 vec_st (_0, 0, ptr++); \ 168 vec_st (_1, 0, ptr++); \ 169 vec_st (_2, 0, ptr++); \ 170} while (0); 171 172#define vec_mstrgb24(x0,x1,x2,ptr) \ 173do { \ 174 __typeof__(x0) _0,_1,_2; \ 175 vec_merge3 (x2,x1,x0,_0,_1,_2); \ 176 vec_st (_0, 0, ptr++); \ 177 vec_st (_1, 0, ptr++); \ 178 vec_st (_2, 0, ptr++); \ 179} while (0); 180 181/* pack the pixels in rgb0 format 182 msb R 183 lsb 0 184*/ 185#define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \ 186do { \ 187 T _0,_1,_2,_3; \ 188 _0 = vec_mergeh (x0,x1); \ 189 _1 = vec_mergeh (x2,x3); \ 190 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \ 191 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \ 192 vec_st (_2, 0*16, (T *)ptr); \ 193 vec_st (_3, 1*16, (T *)ptr); \ 194 _0 = vec_mergel (x0,x1); \ 195 _1 = vec_mergel (x2,x3); \ 196 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \ 197 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \ 198 vec_st (_2, 2*16, (T *)ptr); \ 199 vec_st (_3, 3*16, (T *)ptr); \ 200 ptr += 4; \ 201} while (0); 202 203/* 204 205 | 1 0 1.4021 | | Y | 206 | 1 -0.3441 -0.7142 |x| Cb| 207 | 1 1.7718 0 | | Cr| 208 209 210 Y: [-128 127] 211 Cb/Cr : [-128 127] 212 213 typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode. 
214 215*/ 216 217 218 219 220#define vec_unh(x) \ 221 (vector signed short) \ 222 vec_perm(x,(__typeof__(x)){0}, \ 223 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\ 224 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07})) 225#define vec_unl(x) \ 226 (vector signed short) \ 227 vec_perm(x,(__typeof__(x)){0}, \ 228 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\ 229 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F})) 230 231#define vec_clip_s16(x) \ 232 vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \ 233 ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16})) 234 235#define vec_packclp(x,y) \ 236 (vector unsigned char)vec_packs \ 237 ((vector unsigned short)vec_max (x,((vector signed short) {0})), \ 238 (vector unsigned short)vec_max (y,((vector signed short) {0}))) 239 240//#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr) 241 242 243static inline void cvtyuvtoRGB (SwsContext *c, 244 vector signed short Y, vector signed short U, vector signed short V, 245 vector signed short *R, vector signed short *G, vector signed short *B) 246{ 247 vector signed short vx,ux,uvx; 248 249 Y = vec_mradds (Y, c->CY, c->OY); 250 U = vec_sub (U,(vector signed short) 251 vec_splat((vector signed short){128},0)); 252 V = vec_sub (V,(vector signed short) 253 vec_splat((vector signed short){128},0)); 254 255 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15; 256 ux = vec_sl (U, c->CSHIFT); 257 *B = vec_mradds (ux, c->CBU, Y); 258 259 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15; 260 vx = vec_sl (V, c->CSHIFT); 261 *R = vec_mradds (vx, c->CRV, Y); 262 263 // uvx = ((CGU*u) + (CGV*v))>>15; 264 uvx = vec_mradds (U, c->CGU, Y); 265 *G = vec_mradds (V, c->CGV, uvx); 266} 267 268 269/* 270 ------------------------------------------------------------------------------ 271 CS converters 272 ------------------------------------------------------------------------------ 273*/ 274 275 276#define 
DEFCSP420_CVT(name,out_pixels) \ 277static int altivec_##name (SwsContext *c, \ 278 unsigned char **in, int *instrides, \ 279 int srcSliceY, int srcSliceH, \ 280 unsigned char **oplanes, int *outstrides) \ 281{ \ 282 int w = c->srcW; \ 283 int h = srcSliceH; \ 284 int i,j; \ 285 int instrides_scl[3]; \ 286 vector unsigned char y0,y1; \ 287 \ 288 vector signed char u,v; \ 289 \ 290 vector signed short Y0,Y1,Y2,Y3; \ 291 vector signed short U,V; \ 292 vector signed short vx,ux,uvx; \ 293 vector signed short vx0,ux0,uvx0; \ 294 vector signed short vx1,ux1,uvx1; \ 295 vector signed short R0,G0,B0; \ 296 vector signed short R1,G1,B1; \ 297 vector unsigned char R,G,B; \ 298 \ 299 vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \ 300 vector unsigned char align_perm; \ 301 \ 302 vector signed short \ 303 lCY = c->CY, \ 304 lOY = c->OY, \ 305 lCRV = c->CRV, \ 306 lCBU = c->CBU, \ 307 lCGU = c->CGU, \ 308 lCGV = c->CGV; \ 309 \ 310 vector unsigned short lCSHIFT = c->CSHIFT; \ 311 \ 312 ubyte *y1i = in[0]; \ 313 ubyte *y2i = in[0]+instrides[0]; \ 314 ubyte *ui = in[1]; \ 315 ubyte *vi = in[2]; \ 316 \ 317 vector unsigned char *oute \ 318 = (vector unsigned char *) \ 319 (oplanes[0]+srcSliceY*outstrides[0]); \ 320 vector unsigned char *outo \ 321 = (vector unsigned char *) \ 322 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \ 323 \ 324 \ 325 instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \ 326 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \ 327 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \ 328 \ 329 \ 330 for (i=0;i<h/2;i++) { \ 331 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \ 332 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \ 333 \ 334 for (j=0;j<w/16;j++) { \ 335 \ 336 y1ivP = (vector unsigned char *)y1i; \ 337 y2ivP = (vector unsigned char *)y2i; \ 338 uivP = (vector unsigned char *)ui; \ 339 vivP = (vector unsigned char *)vi; \ 340 \ 341 align_perm = vec_lvsl (0, y1i); \ 
342 y0 = (vector unsigned char) \ 343 vec_perm (y1ivP[0], y1ivP[1], align_perm); \ 344 \ 345 align_perm = vec_lvsl (0, y2i); \ 346 y1 = (vector unsigned char) \ 347 vec_perm (y2ivP[0], y2ivP[1], align_perm); \ 348 \ 349 align_perm = vec_lvsl (0, ui); \ 350 u = (vector signed char) \ 351 vec_perm (uivP[0], uivP[1], align_perm); \ 352 \ 353 align_perm = vec_lvsl (0, vi); \ 354 v = (vector signed char) \ 355 vec_perm (vivP[0], vivP[1], align_perm); \ 356 \ 357 u = (vector signed char) \ 358 vec_sub (u,(vector signed char) \ 359 vec_splat((vector signed char){128},0)); \ 360 v = (vector signed char) \ 361 vec_sub (v,(vector signed char) \ 362 vec_splat((vector signed char){128},0)); \ 363 \ 364 U = vec_unpackh (u); \ 365 V = vec_unpackh (v); \ 366 \ 367 \ 368 Y0 = vec_unh (y0); \ 369 Y1 = vec_unl (y0); \ 370 Y2 = vec_unh (y1); \ 371 Y3 = vec_unl (y1); \ 372 \ 373 Y0 = vec_mradds (Y0, lCY, lOY); \ 374 Y1 = vec_mradds (Y1, lCY, lOY); \ 375 Y2 = vec_mradds (Y2, lCY, lOY); \ 376 Y3 = vec_mradds (Y3, lCY, lOY); \ 377 \ 378 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \ 379 ux = vec_sl (U, lCSHIFT); \ 380 ux = vec_mradds (ux, lCBU, (vector signed short){0}); \ 381 ux0 = vec_mergeh (ux,ux); \ 382 ux1 = vec_mergel (ux,ux); \ 383 \ 384 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \ 385 vx = vec_sl (V, lCSHIFT); \ 386 vx = vec_mradds (vx, lCRV, (vector signed short){0}); \ 387 vx0 = vec_mergeh (vx,vx); \ 388 vx1 = vec_mergel (vx,vx); \ 389 \ 390 /* uvx = ((CGU*u) + (CGV*v))>>15 */ \ 391 uvx = vec_mradds (U, lCGU, (vector signed short){0}); \ 392 uvx = vec_mradds (V, lCGV, uvx); \ 393 uvx0 = vec_mergeh (uvx,uvx); \ 394 uvx1 = vec_mergel (uvx,uvx); \ 395 \ 396 R0 = vec_add (Y0,vx0); \ 397 G0 = vec_add (Y0,uvx0); \ 398 B0 = vec_add (Y0,ux0); \ 399 R1 = vec_add (Y1,vx1); \ 400 G1 = vec_add (Y1,uvx1); \ 401 B1 = vec_add (Y1,ux1); \ 402 \ 403 R = vec_packclp (R0,R1); \ 404 G = vec_packclp (G0,G1); \ 405 B = vec_packclp (B0,B1); \ 406 \ 407 out_pixels(R,G,B,oute); \ 408 \ 409 R0 = vec_add 
(Y2,vx0); \ 410 G0 = vec_add (Y2,uvx0); \ 411 B0 = vec_add (Y2,ux0); \ 412 R1 = vec_add (Y3,vx1); \ 413 G1 = vec_add (Y3,uvx1); \ 414 B1 = vec_add (Y3,ux1); \ 415 R = vec_packclp (R0,R1); \ 416 G = vec_packclp (G0,G1); \ 417 B = vec_packclp (B0,B1); \ 418 \ 419 \ 420 out_pixels(R,G,B,outo); \ 421 \ 422 y1i += 16; \ 423 y2i += 16; \ 424 ui += 8; \ 425 vi += 8; \ 426 \ 427 } \ 428 \ 429 outo += (outstrides[0])>>4; \ 430 oute += (outstrides[0])>>4; \ 431 \ 432 ui += instrides_scl[1]; \ 433 vi += instrides_scl[2]; \ 434 y1i += instrides_scl[0]; \ 435 y2i += instrides_scl[0]; \ 436 } \ 437 return srcSliceH; \ 438} 439 440 441#define out_abgr(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr) 442#define out_bgra(a,b,c,ptr) vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr) 443#define out_rgba(a,b,c,ptr) vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr) 444#define out_argb(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr) 445#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr) 446#define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr) 447 448DEFCSP420_CVT (yuv2_abgr, out_abgr) 449#if 1 450DEFCSP420_CVT (yuv2_bgra, out_bgra) 451#else 452static int altivec_yuv2_bgra32 (SwsContext *c, 453 unsigned char **in, int *instrides, 454 int srcSliceY, int srcSliceH, 455 unsigned char **oplanes, int *outstrides) 456{ 457 int w = c->srcW; 458 int h = srcSliceH; 459 int i,j; 460 int instrides_scl[3]; 461 vector unsigned char y0,y1; 462 463 vector signed char u,v; 464 465 vector signed short Y0,Y1,Y2,Y3; 466 vector signed short U,V; 467 vector signed short vx,ux,uvx; 468 vector signed short vx0,ux0,uvx0; 469 vector signed short vx1,ux1,uvx1; 470 vector signed short R0,G0,B0; 471 vector signed short R1,G1,B1; 472 vector unsigned char R,G,B; 473 474 vector unsigned char *uivP, *vivP; 475 vector unsigned char align_perm; 476 477 vector signed short 478 lCY = c->CY, 479 lOY = c->OY, 480 lCRV = c->CRV, 481 lCBU = c->CBU, 482 
lCGU = c->CGU, 483 lCGV = c->CGV; 484 485 vector unsigned short lCSHIFT = c->CSHIFT; 486 487 ubyte *y1i = in[0]; 488 ubyte *y2i = in[0]+w; 489 ubyte *ui = in[1]; 490 ubyte *vi = in[2]; 491 492 vector unsigned char *oute 493 = (vector unsigned char *) 494 (oplanes[0]+srcSliceY*outstrides[0]); 495 vector unsigned char *outo 496 = (vector unsigned char *) 497 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); 498 499 500 instrides_scl[0] = instrides[0]; 501 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ 502 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ 503 504 505 for (i=0;i<h/2;i++) { 506 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); 507 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); 508 509 for (j=0;j<w/16;j++) { 510 511 y0 = vec_ldl (0,y1i); 512 y1 = vec_ldl (0,y2i); 513 uivP = (vector unsigned char *)ui; 514 vivP = (vector unsigned char *)vi; 515 516 align_perm = vec_lvsl (0, ui); 517 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); 518 519 align_perm = vec_lvsl (0, vi); 520 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); 521 u = (vector signed char) 522 vec_sub (u,(vector signed char) 523 vec_splat((vector signed char){128},0)); 524 525 v = (vector signed char) 526 vec_sub (v, (vector signed char) 527 vec_splat((vector signed char){128},0)); 528 529 U = vec_unpackh (u); 530 V = vec_unpackh (v); 531 532 533 Y0 = vec_unh (y0); 534 Y1 = vec_unl (y0); 535 Y2 = vec_unh (y1); 536 Y3 = vec_unl (y1); 537 538 Y0 = vec_mradds (Y0, lCY, lOY); 539 Y1 = vec_mradds (Y1, lCY, lOY); 540 Y2 = vec_mradds (Y2, lCY, lOY); 541 Y3 = vec_mradds (Y3, lCY, lOY); 542 543 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ 544 ux = vec_sl (U, lCSHIFT); 545 ux = vec_mradds (ux, lCBU, (vector signed short){0}); 546 ux0 = vec_mergeh (ux,ux); 547 ux1 = vec_mergel (ux,ux); 548 549 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ 550 vx = vec_sl (V, lCSHIFT); 551 vx = vec_mradds (vx, lCRV, (vector signed short){0}); 552 
vx0 = vec_mergeh (vx,vx); 553 vx1 = vec_mergel (vx,vx); 554 /* uvx = ((CGU*u) + (CGV*v))>>15 */ 555 uvx = vec_mradds (U, lCGU, (vector signed short){0}); 556 uvx = vec_mradds (V, lCGV, uvx); 557 uvx0 = vec_mergeh (uvx,uvx); 558 uvx1 = vec_mergel (uvx,uvx); 559 R0 = vec_add (Y0,vx0); 560 G0 = vec_add (Y0,uvx0); 561 B0 = vec_add (Y0,ux0); 562 R1 = vec_add (Y1,vx1); 563 G1 = vec_add (Y1,uvx1); 564 B1 = vec_add (Y1,ux1); 565 R = vec_packclp (R0,R1); 566 G = vec_packclp (G0,G1); 567 B = vec_packclp (B0,B1); 568 569 out_argb(R,G,B,oute); 570 R0 = vec_add (Y2,vx0); 571 G0 = vec_add (Y2,uvx0); 572 B0 = vec_add (Y2,ux0); 573 R1 = vec_add (Y3,vx1); 574 G1 = vec_add (Y3,uvx1); 575 B1 = vec_add (Y3,ux1); 576 R = vec_packclp (R0,R1); 577 G = vec_packclp (G0,G1); 578 B = vec_packclp (B0,B1); 579 580 out_argb(R,G,B,outo); 581 y1i += 16; 582 y2i += 16; 583 ui += 8; 584 vi += 8; 585 586 } 587 588 outo += (outstrides[0])>>4; 589 oute += (outstrides[0])>>4; 590 591 ui += instrides_scl[1]; 592 vi += instrides_scl[2]; 593 y1i += instrides_scl[0]; 594 y2i += instrides_scl[0]; 595 } 596 return srcSliceH; 597} 598 599#endif 600 601 602DEFCSP420_CVT (yuv2_rgba, out_rgba) 603DEFCSP420_CVT (yuv2_argb, out_argb) 604DEFCSP420_CVT (yuv2_rgb24, out_rgb24) 605DEFCSP420_CVT (yuv2_bgr24, out_bgr24) 606 607 608// uyvy|uyvy|uyvy|uyvy 609// 0123 4567 89ab cdef 610static 611const vector unsigned char 612 demux_u = {0x10,0x00,0x10,0x00, 613 0x10,0x04,0x10,0x04, 614 0x10,0x08,0x10,0x08, 615 0x10,0x0c,0x10,0x0c}, 616 demux_v = {0x10,0x02,0x10,0x02, 617 0x10,0x06,0x10,0x06, 618 0x10,0x0A,0x10,0x0A, 619 0x10,0x0E,0x10,0x0E}, 620 demux_y = {0x10,0x01,0x10,0x03, 621 0x10,0x05,0x10,0x07, 622 0x10,0x09,0x10,0x0B, 623 0x10,0x0D,0x10,0x0F}; 624 625/* 626 this is so I can play live CCIR raw video 627*/ 628static int altivec_uyvy_rgb32 (SwsContext *c, 629 unsigned char **in, int *instrides, 630 int srcSliceY, int srcSliceH, 631 unsigned char **oplanes, int *outstrides) 632{ 633 int w = c->srcW; 634 int h = 
srcSliceH; 635 int i,j; 636 vector unsigned char uyvy; 637 vector signed short Y,U,V; 638 vector signed short R0,G0,B0,R1,G1,B1; 639 vector unsigned char R,G,B; 640 vector unsigned char *out; 641 ubyte *img; 642 643 img = in[0]; 644 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]); 645 646 for (i=0;i<h;i++) { 647 for (j=0;j<w/16;j++) { 648 uyvy = vec_ld (0, img); 649 U = (vector signed short) 650 vec_perm (uyvy, (vector unsigned char){0}, demux_u); 651 652 V = (vector signed short) 653 vec_perm (uyvy, (vector unsigned char){0}, demux_v); 654 655 Y = (vector signed short) 656 vec_perm (uyvy, (vector unsigned char){0}, demux_y); 657 658 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0); 659 660 uyvy = vec_ld (16, img); 661 U = (vector signed short) 662 vec_perm (uyvy, (vector unsigned char){0}, demux_u); 663 664 V = (vector signed short) 665 vec_perm (uyvy, (vector unsigned char){0}, demux_v); 666 667 Y = (vector signed short) 668 vec_perm (uyvy, (vector unsigned char){0}, demux_y); 669 670 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1); 671 672 R = vec_packclp (R0,R1); 673 G = vec_packclp (G0,G1); 674 B = vec_packclp (B0,B1); 675 676 // vec_mstbgr24 (R,G,B, out); 677 out_rgba (R,G,B,out); 678 679 img += 32; 680 } 681 } 682 return srcSliceH; 683} 684 685 686 687/* Ok currently the acceleration routine only supports 688 inputs of widths a multiple of 16 689 and heights a multiple 2 690 691 So we just fall back to the C codes for this. 692*/ 693SwsFunc sws_yuv2rgb_init_altivec (SwsContext *c) 694{ 695 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC)) 696 return NULL; 697 698 /* 699 and this seems not to matter too much I tried a bunch of 700 videos with abnormal widths and MPlayer crashes elsewhere. 701 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv 702 boom with X11 bad match. 703 704 */ 705 if ((c->srcW & 0xf) != 0) return NULL; 706 707 switch (c->srcFormat) { 708 case PIX_FMT_YUV410P: 709 case PIX_FMT_YUV420P: 710 /*case IMGFMT_CLPL: ??? 
*/ 711 case PIX_FMT_GRAY8: 712 case PIX_FMT_NV12: 713 case PIX_FMT_NV21: 714 if ((c->srcH & 0x1) != 0) 715 return NULL; 716 717 switch(c->dstFormat){ 718 case PIX_FMT_RGB24: 719 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n"); 720 return altivec_yuv2_rgb24; 721 case PIX_FMT_BGR24: 722 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n"); 723 return altivec_yuv2_bgr24; 724 case PIX_FMT_ARGB: 725 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n"); 726 return altivec_yuv2_argb; 727 case PIX_FMT_ABGR: 728 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n"); 729 return altivec_yuv2_abgr; 730 case PIX_FMT_RGBA: 731 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n"); 732 return altivec_yuv2_rgba; 733 case PIX_FMT_BGRA: 734 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n"); 735 return altivec_yuv2_bgra; 736 default: return NULL; 737 } 738 break; 739 740 case PIX_FMT_UYVY422: 741 switch(c->dstFormat){ 742 case PIX_FMT_BGR32: 743 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n"); 744 return altivec_uyvy_rgb32; 745 default: return NULL; 746 } 747 break; 748 749 } 750 return NULL; 751} 752 753void sws_yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation) 754{ 755 union { 756 signed short tmp[8] __attribute__ ((aligned(16))); 757 vector signed short vec; 758 } buf; 759 760 buf.tmp[0] = ((0xffffLL) * contrast>>8)>>9; //cy 761 buf.tmp[1] = -256*brightness; //oy 762 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv 763 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu 764 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu 765 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv 766 767 768 c->CSHIFT = (vector unsigned short)vec_splat_u16(2); 769 c->CY = vec_splat ((vector signed short)buf.vec, 0); 770 c->OY = vec_splat ((vector signed short)buf.vec, 1); 771 c->CRV = vec_splat ((vector 
signed short)buf.vec, 2); 772 c->CBU = vec_splat ((vector signed short)buf.vec, 3); 773 c->CGU = vec_splat ((vector signed short)buf.vec, 4); 774 c->CGV = vec_splat ((vector signed short)buf.vec, 5); 775#if 0 776 { 777 int i; 778 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"}; 779 for (i=0; i<6; i++) 780 printf("%s %d ", v[i],buf.tmp[i] ); 781 printf("\n"); 782 } 783#endif 784 return; 785} 786 787 788void 789altivec_yuv2packedX (SwsContext *c, 790 int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, 791 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, 792 uint8_t *dest, int dstW, int dstY) 793{ 794 int i,j; 795 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V; 796 vector signed short R0,G0,B0,R1,G1,B1; 797 798 vector unsigned char R,G,B; 799 vector unsigned char *out,*nout; 800 801 vector signed short RND = vec_splat_s16(1<<3); 802 vector unsigned short SCL = vec_splat_u16(4); 803 unsigned long scratch[16] __attribute__ ((aligned (16))); 804 805 vector signed short *YCoeffs, *CCoeffs; 806 807 YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize; 808 CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize; 809 810 out = (vector unsigned char *)dest; 811 812 for (i=0; i<dstW; i+=16){ 813 Y0 = RND; 814 Y1 = RND; 815 /* extract 16 coeffs from lumSrc */ 816 for (j=0; j<lumFilterSize; j++) { 817 X0 = vec_ld (0, &lumSrc[j][i]); 818 X1 = vec_ld (16, &lumSrc[j][i]); 819 Y0 = vec_mradds (X0, YCoeffs[j], Y0); 820 Y1 = vec_mradds (X1, YCoeffs[j], Y1); 821 } 822 823 U = RND; 824 V = RND; 825 /* extract 8 coeffs from U,V */ 826 for (j=0; j<chrFilterSize; j++) { 827 X = vec_ld (0, &chrSrc[j][i/2]); 828 U = vec_mradds (X, CCoeffs[j], U); 829 X = vec_ld (0, &chrSrc[j][i/2+2048]); 830 V = vec_mradds (X, CCoeffs[j], V); 831 } 832 833 /* scale and clip signals */ 834 Y0 = vec_sra (Y0, SCL); 835 Y1 = vec_sra (Y1, SCL); 836 U = vec_sra (U, SCL); 837 V = vec_sra (V, SCL); 838 839 Y0 = vec_clip_s16 (Y0); 840 Y1 = vec_clip_s16 (Y1); 841 U = vec_clip_s16 (U); 842 V = vec_clip_s16 (V); 843 844 /* 
now we have 845 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 846 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7 847 848 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 849 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 850 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 851 */ 852 853 U0 = vec_mergeh (U,U); 854 V0 = vec_mergeh (V,V); 855 856 U1 = vec_mergel (U,U); 857 V1 = vec_mergel (V,V); 858 859 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); 860 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); 861 862 R = vec_packclp (R0,R1); 863 G = vec_packclp (G0,G1); 864 B = vec_packclp (B0,B1); 865 866 switch(c->dstFormat) { 867 case PIX_FMT_ABGR: out_abgr (R,G,B,out); break; 868 case PIX_FMT_BGRA: out_bgra (R,G,B,out); break; 869 case PIX_FMT_RGBA: out_rgba (R,G,B,out); break; 870 case PIX_FMT_ARGB: out_argb (R,G,B,out); break; 871 case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break; 872 case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break; 873 default: 874 { 875 /* If this is reached, the caller should have called yuv2packedXinC 876 instead. 
*/ 877 static int printed_error_message; 878 if (!printed_error_message) { 879 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n", 880 sws_format_name(c->dstFormat)); 881 printed_error_message=1; 882 } 883 return; 884 } 885 } 886 } 887 888 if (i < dstW) { 889 i -= 16; 890 891 Y0 = RND; 892 Y1 = RND; 893 /* extract 16 coeffs from lumSrc */ 894 for (j=0; j<lumFilterSize; j++) { 895 X0 = vec_ld (0, &lumSrc[j][i]); 896 X1 = vec_ld (16, &lumSrc[j][i]); 897 Y0 = vec_mradds (X0, YCoeffs[j], Y0); 898 Y1 = vec_mradds (X1, YCoeffs[j], Y1); 899 } 900 901 U = RND; 902 V = RND; 903 /* extract 8 coeffs from U,V */ 904 for (j=0; j<chrFilterSize; j++) { 905 X = vec_ld (0, &chrSrc[j][i/2]); 906 U = vec_mradds (X, CCoeffs[j], U); 907 X = vec_ld (0, &chrSrc[j][i/2+2048]); 908 V = vec_mradds (X, CCoeffs[j], V); 909 } 910 911 /* scale and clip signals */ 912 Y0 = vec_sra (Y0, SCL); 913 Y1 = vec_sra (Y1, SCL); 914 U = vec_sra (U, SCL); 915 V = vec_sra (V, SCL); 916 917 Y0 = vec_clip_s16 (Y0); 918 Y1 = vec_clip_s16 (Y1); 919 U = vec_clip_s16 (U); 920 V = vec_clip_s16 (V); 921 922 /* now we have 923 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 924 U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7 925 926 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 927 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 928 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 929 */ 930 931 U0 = vec_mergeh (U,U); 932 V0 = vec_mergeh (V,V); 933 934 U1 = vec_mergel (U,U); 935 V1 = vec_mergel (V,V); 936 937 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); 938 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); 939 940 R = vec_packclp (R0,R1); 941 G = vec_packclp (G0,G1); 942 B = vec_packclp (B0,B1); 943 944 nout = (vector unsigned char *)scratch; 945 switch(c->dstFormat) { 946 case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break; 947 case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break; 948 case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break; 949 case PIX_FMT_ARGB: out_argb 
(R,G,B,nout); break; 950 case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break; 951 case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break; 952 default: 953 /* Unreachable, I think. */ 954 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n", 955 sws_format_name(c->dstFormat)); 956 return; 957 } 958 959 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4); 960 } 961 962} 963