/*
 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
 * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * H.264 / AVC / MPEG4 part10 prediction functions.
 * @author Michael Niedermayer <michaelni@gmx.at>
 */

#include "libavutil/intreadwrite.h"

#include "mathops.h"

#include "bit_depth_template.c"

/*
 * Conventions used by the predictors below.
 *
 *  - `_src` is cast to `pixel *` and addresses the top-left sample of the
 *    block that is written.
 *  - The incoming stride is shifted right by sizeof(pixel)-1 before it is
 *    used as an index into `src`: unchanged for a 1-byte pixel, halved for a
 *    2-byte pixel. NOTE(review): this presumably converts a byte stride into
 *    a stride in pixels - confirm against the callers.
 *  - Neighbours are read in place from the picture: src[x - stride] is the
 *    row above, src[-1 + y*stride] the column to the left and
 *    src[-1 - stride] the top-left corner.
 *  - pixel, pixel4, FUNC/FUNCC, AV_RN4PA/AV_WN4PA, PIXEL_SPLAT_X4, BIT_DEPTH,
 *    INIT_CLIP/CLIP and av_unused are not defined in this file; they are
 *    expected to come from the headers included above.
 */

/**
 * 4x4 vertical prediction: reads the four samples above the block as one
 * pixel4 and stores that value into each of the four rows.
 * `topright` is not used.
 */
static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright,
                                    ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const pixel4 a= AV_RN4PA(src-stride);

    AV_WN4PA(src+0*stride, a);
    AV_WN4PA(src+1*stride, a);
    AV_WN4PA(src+2*stride, a);
    AV_WN4PA(src+3*stride, a);
}

/**
 * 4x4 horizontal prediction: every row is filled with the sample directly to
 * its left (src[-1 + y*stride]), replicated four times.
 * `topright` is not used.
 */
static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright,
                                      ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4(src[-1+0*stride]));
    AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4(src[-1+1*stride]));
    AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4(src[-1+2*stride]));
    AV_WN4PA(src+3*stride, PIXEL_SPLAT_X4(src[-1+3*stride]));
}

/**
 * 4x4 DC prediction: fills the block with the rounded mean, (sum + 4) >> 3,
 * of the four samples above and the four samples to the left.
 * `topright` is not used.
 */
static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright,
                              ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
    const pixel4 a = PIXEL_SPLAT_X4(dc);

    AV_WN4PA(src+0*stride, a);
    AV_WN4PA(src+1*stride, a);
    AV_WN4PA(src+2*stride, a);
    AV_WN4PA(src+3*stride, a);
}

/**
 * 4x4 DC prediction from the left column only: fills the block with
 * (sum of the four left samples + 2) >> 2. `topright` is not used.
 */
static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright,
                                   ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
    const pixel4 a = PIXEL_SPLAT_X4(dc);

    AV_WN4PA(src+0*stride, a);
    AV_WN4PA(src+1*stride, a);
    AV_WN4PA(src+2*stride, a);
    AV_WN4PA(src+3*stride, a);
}

/**
 * 4x4 DC prediction from the top row only: fills the block with
 * (sum of the four samples above + 2) >> 2. `topright` is not used.
 */
static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright,
                                  ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
    const pixel4 a = PIXEL_SPLAT_X4(dc);

    AV_WN4PA(src+0*stride, a);
    AV_WN4PA(src+1*stride, a);
    AV_WN4PA(src+2*stride, a);
    AV_WN4PA(src+3*stride, a);
}

/**
 * 4x4 constant prediction: fills the block with 1 << (BIT_DEPTH-1).
 * No neighbouring samples are read; `topright` is not used.
 */
static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright,
                                  ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const pixel4 a = PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));

    AV_WN4PA(src+0*stride, a);
    AV_WN4PA(src+1*stride, a);
    AV_WN4PA(src+2*stride, a);
    AV_WN4PA(src+3*stride, a);
}

/**
 * 4x4 constant prediction: fills the block with (1 << (BIT_DEPTH-1)) - 1.
 * No neighbouring samples are read; `topright` is not used.
 */
static void FUNCC(pred4x4_127_dc)(uint8_t *_src, const uint8_t *topright,
                                  ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);

    AV_WN4PA(src+0*stride, a);
    AV_WN4PA(src+1*stride, a);
    AV_WN4PA(src+2*stride, a);
    AV_WN4PA(src+3*stride, a);
}

/**
 * 4x4 constant prediction: fills the block with (1 << (BIT_DEPTH-1)) + 1.
 * No neighbouring samples are read; `topright` is not used.
 */
static void FUNCC(pred4x4_129_dc)(uint8_t *_src, const uint8_t *topright,
                                  ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);

    AV_WN4PA(src+0*stride, a);
    AV_WN4PA(src+1*stride, a);
    AV_WN4PA(src+2*stride, a);
    AV_WN4PA(src+3*stride, a);
}


/*
 * Edge loaders for the directional 4x4 predictors. Each one declares four
 * `const unsigned` locals (tagged av_unused, so a predictor need not use all
 * of them) and expects `src`, `stride` and, for the top-right edge, a
 * `const pixel *topright` to be in scope. The trailing backslash on the last
 * line of each macro continues onto the blank line that follows it; keep
 * that blank line.
 */

/* t4..t7: the four samples pointed to by `topright`. */
#define LOAD_TOP_RIGHT_EDGE\
    const unsigned av_unused t4 = topright[0];\
    const unsigned av_unused t5 = topright[1];\
    const unsigned av_unused t6 = topright[2];\
    const unsigned av_unused t7 = topright[3];\

/* l4..l7: the four left-column samples of rows 4..7 (below the 4x4 block). */
#define LOAD_DOWN_LEFT_EDGE\
    const unsigned av_unused l4 = src[-1+4*stride];\
    const unsigned av_unused l5 = src[-1+5*stride];\
    const unsigned av_unused l6 = src[-1+6*stride];\
    const unsigned av_unused l7 = src[-1+7*stride];\

/* l0..l3: the four samples immediately left of rows 0..3. */
#define LOAD_LEFT_EDGE\
    const unsigned av_unused l0 = src[-1+0*stride];\
    const unsigned av_unused l1 = src[-1+1*stride];\
    const unsigned av_unused l2 = src[-1+2*stride];\
    const unsigned av_unused l3 = src[-1+3*stride];\

/* t0..t3: the four samples immediately above columns 0..3. */
#define LOAD_TOP_EDGE\
    const unsigned av_unused t0 = src[ 0-1*stride];\
    const unsigned av_unused t1 = src[ 1-1*stride];\
    const unsigned av_unused t2 = src[ 2-1*stride];\
    const unsigned av_unused t3 = src[ 3-1*stride];\

/**
 * 4x4 diagonal down-right prediction. Uses the left column, the top-left
 * corner (lt) and the top row; every sample is a rounded (a + 2*b + c + 2) >> 2
 * of three consecutive edge samples, and samples on the same down-right
 * diagonal (equal x - y) share one value. `topright` is not used.
 */
static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright,
                                      ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const int lt= src[-1-1*stride];
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE

    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
    src[0+2*stride]=
    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
    src[0+1*stride]=
    src[1+2*stride]=
    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
    src[0+0*stride]=
    src[1+1*stride]=
    src[2+2*stride]=
    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
    src[1+0*stride]=
    src[2+1*stride]=
    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[2+0*stride]=
    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
}

/**
 * 4x4 diagonal down-left prediction. Uses the four samples above the block
 * (t0..t3) and the four samples read through `_topright` (t4..t7); samples on
 * the same anti-diagonal (equal x + y) share one value. The last sample
 * weights t7 three times because there is no sample beyond it.
 */
static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright,
                                     ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    const pixel *topright = (const pixel*)_topright;
    int stride = _stride>>(sizeof(pixel)-1);
    LOAD_TOP_EDGE
    LOAD_TOP_RIGHT_EDGE
    /* The left edge is not loaded: no expression below reads it. */

    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
    src[1+0*stride]=
    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
    src[2+0*stride]=
    src[1+1*stride]=
    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
    src[3+0*stride]=
    src[2+1*stride]=
    src[1+2*stride]=
    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
    src[3+1*stride]=
    src[2+2*stride]=
    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
    src[3+2*stride]=
    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
    src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
}

/**
 * 4x4 vertical-right prediction. Uses the top row, the top-left corner and
 * the left column (l3 is loaded but never read). Rows 0 and 2 use two-sample
 * averages (a + b + 1) >> 1, rows 1 and 3 use three-sample averages
 * (a + 2*b + c + 2) >> 2, apart from column 0 of rows 2 and 3, which takes
 * three-sample averages of the left edge. `topright` is not used.
 */
static void FUNCC(pred4x4_vertical_right)(uint8_t *_src,
                                          const uint8_t *topright,
                                          ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const int lt= src[-1-1*stride];
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE

    src[0+0*stride]=
    src[1+2*stride]=(lt + t0 + 1)>>1;
    src[1+0*stride]=
    src[2+2*stride]=(t0 + t1 + 1)>>1;
    src[2+0*stride]=
    src[3+2*stride]=(t1 + t2 + 1)>>1;
    src[3+0*stride]=(t2 + t3 + 1)>>1;
    src[0+1*stride]=
    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
    src[1+1*stride]=
    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[2+1*stride]=
    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
}

/**
 * 4x4 vertical-left prediction. Uses the top row (t0..t3) and the samples
 * read through `_topright` (t4..t6 are read; t7 is loaded but unused). Rows 0
 * and 2 hold two-sample averages, rows 1 and 3 three-sample averages.
 */
static void FUNCC(pred4x4_vertical_left)(uint8_t *_src,
                                         const uint8_t *_topright,
                                         ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    const pixel *topright = (const pixel*)_topright;
    int stride = _stride>>(sizeof(pixel)-1);
    LOAD_TOP_EDGE
    LOAD_TOP_RIGHT_EDGE

    src[0+0*stride]=(t0 + t1 + 1)>>1;
    src[1+0*stride]=
    src[0+2*stride]=(t1 + t2 + 1)>>1;
    src[2+0*stride]=
    src[1+2*stride]=(t2 + t3 + 1)>>1;
    src[3+0*stride]=
    src[2+2*stride]=(t3 + t4+ 1)>>1;
    src[3+2*stride]=(t4 + t5+ 1)>>1;
    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[1+1*stride]=
    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
    src[2+1*stride]=
    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
    src[3+1*stride]=
    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
}

/**
 * 4x4 horizontal-up prediction. Uses only the left column l0..l3. Values
 * alternate between two- and three-sample averages while moving down the
 * left edge; once the edge runs out the remaining samples are set to l3
 * (the last averaged value repeats l3 in place of a sample below it).
 * `topright` is not used.
 */
static void FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright,
                                         ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    LOAD_LEFT_EDGE

    src[0+0*stride]=(l0 + l1 + 1)>>1;
    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
    src[2+0*stride]=
    src[0+1*stride]=(l1 + l2 + 1)>>1;
    src[3+0*stride]=
    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
    src[2+1*stride]=
    src[0+2*stride]=(l2 + l3 + 1)>>1;
    src[3+1*stride]=
    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
    src[3+2*stride]=
    src[1+3*stride]=
    src[0+3*stride]=
    src[2+2*stride]=
    src[2+3*stride]=
    src[3+3*stride]=l3;
}

/**
 * 4x4 horizontal-down prediction. Uses the left column, the top-left corner
 * and the top row (t3 is loaded but never read). Column 0 holds two-sample
 * averages of the left edge and column 1 three-sample averages; columns 2
 * and 3 repeat those values one row further down, with row 0 of columns 2
 * and 3 taken from three-sample averages of the top edge.
 * `topright` is not used.
 */
static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src,
                                           const uint8_t *topright,
                                           ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const int lt= src[-1-1*stride];
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE

    src[0+0*stride]=
    src[2+1*stride]=(lt + l0 + 1)>>1;
    src[1+0*stride]=
    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[0+1*stride]=
    src[2+2*stride]=(l0 + l1 + 1)>>1;
    src[1+1*stride]=
    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
    src[0+2*stride]=
    src[2+3*stride]=(l1 + l2+ 1)>>1;
    src[1+2*stride]=
    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
    src[0+3*stride]=(l2 + l3 + 1)>>1;
    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
}

/**
 * 16x16 vertical prediction: reads the 16 samples above the block as four
 * pixel4 values and stores them into each of the 16 rows.
 */
static void FUNCC(pred16x16_vertical)(uint8_t *_src, ptrdiff_t _stride)
{
    int i;
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const pixel4 a = AV_RN4PA(((pixel4*)(src-stride))+0);
    const pixel4 b = AV_RN4PA(((pixel4*)(src-stride))+1);
    const pixel4 c = AV_RN4PA(((pixel4*)(src-stride))+2);
    const pixel4 d = AV_RN4PA(((pixel4*)(src-stride))+3);

    for(i=0; i<16; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
        AV_WN4PA(((pixel4*)(src+i*stride))+2, c);
        AV_WN4PA(((pixel4*)(src+i*stride))+3, d);
    }
}

/**
 * 16x16 horizontal prediction: each of the 16 rows is filled with the sample
 * directly to its left.
 */
static void FUNCC(pred16x16_horizontal)(uint8_t *_src, ptrdiff_t stride)
{
    int i;
    pixel *src = (pixel*)_src;
    stride >>= sizeof(pixel)-1;

    for(i=0; i<16; i++){
        const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);

        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
        AV_WN4PA(((pixel4*)(src+i*stride))+2, a);
        AV_WN4PA(((pixel4*)(src+i*stride))+3, a);
    }
}

/*
 * Fills 16 rows of 16 samples with the pixel4 value `v`. Requires `i`, `src`
 * and `stride` in scope and advances `src` by 16 rows as a side effect.
 * `v` is expanded four times per row, so callers should pass a value that is
 * cheap and side-effect free.
 */
#define PREDICT_16x16_DC(v)\
    for(i=0; i<16; i++){\
        AV_WN4PA(src+ 0, v);\
        AV_WN4PA(src+ 4, v);\
        AV_WN4PA(src+ 8, v);\
        AV_WN4PA(src+12, v);\
        src += stride;\
    }

/**
 * 16x16 DC prediction: fills the block with (sum + 16) >> 5, where sum is
 * the total of the 16 left samples and the 16 samples above.
 */
static void FUNCC(pred16x16_dc)(uint8_t *_src, ptrdiff_t stride)
{
    int i, dc=0;
    pixel *src = (pixel*)_src;
    pixel4 dcsplat;
    stride >>= sizeof(pixel)-1;

    for(i=0;i<16; i++){
        dc+= src[-1+i*stride];
    }

    for(i=0;i<16; i++){
        dc+= src[i-stride];
    }

    dcsplat = PIXEL_SPLAT_X4((dc+16)>>5);
    PREDICT_16x16_DC(dcsplat);
}

/**
 * 16x16 DC prediction from the left column only: fills the block with
 * (sum of the 16 left samples + 8) >> 4.
 */
static void FUNCC(pred16x16_left_dc)(uint8_t *_src, ptrdiff_t stride)
{
    int i, dc=0;
    pixel *src = (pixel*)_src;
    pixel4 dcsplat;
    stride >>= sizeof(pixel)-1;

    for(i=0;i<16; i++){
        dc+= src[-1+i*stride];
    }

    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
    PREDICT_16x16_DC(dcsplat);
}

/**
 * 16x16 DC prediction from the top row only: fills the block with
 * (sum of the 16 samples above + 8) >> 4.
 */
static void FUNCC(pred16x16_top_dc)(uint8_t *_src, ptrdiff_t stride)
{
    int i, dc=0;
    pixel *src = (pixel*)_src;
    pixel4 dcsplat;
    stride >>= sizeof(pixel)-1;

    for(i=0;i<16; i++){
        dc+= src[i-stride];
    }

    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
    PREDICT_16x16_DC(dcsplat);
}

/*
 * Defines pred16x16_<n>_dc, which fills the 16x16 block with the constant
 * `v` without reading any neighbouring sample.
 */
#define PRED16x16_X(n, v) \
static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
{\
    int i;\
    pixel *src = (pixel*)_src;\
    stride >>= sizeof(pixel)-1;\
    PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
}

PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1)
PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0)
PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1)

/**
 * 16x16 plane prediction with three selectable gradient scalings.
 *
 * H accumulates k * (top[7+k] - top[7-k]) for k = 1..8 along the row above
 * the block (top[-1] being the top-left corner), V accumulates the same
 * weighted differences along the left column. The gradients are then scaled:
 *  - svq3 != 0: 5*(x/4)/16 using truncating division, then H and V are
 *    swapped;
 *  - else rv40 != 0: (x + (x>>2)) >> 4;
 *  - otherwise: (5*x + 32) >> 6.
 * The sample at column x, row y is CLIP((a + x*H + y*V) >> 5), with `a`
 * derived from the bottom-left and top-right neighbours.
 *
 * @param svq3 selects the first scaling variant when non-zero
 * @param rv40 selects the second scaling variant when svq3 is zero
 */
static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src,
                                                 ptrdiff_t _stride,
                                                 const int svq3,
                                                 const int rv40)
{
  int i, j, k;
  int a;
  INIT_CLIP
  pixel *src = (pixel*)_src;
  int stride = _stride>>(sizeof(pixel)-1);
  const pixel * const src0 = src +7-stride;
  const pixel *       src1 = src +8*stride-1;
  const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
  int H = src0[1] - src0[-1];
  int V = src1[0] - src2[ 0];
  for(k=2; k<=8; ++k) {
    src1 += stride; src2 -= stride;
    H += k*(src0[k] - src0[-k]);
    V += k*(src1[0] - src2[ 0]);
  }
  if(svq3){
    H = ( 5*(H/4) ) / 16;
    V = ( 5*(V/4) ) / 16;

    /* required for 100% accuracy */
    i = H; H = V; V = i;
  }else if(rv40){
    H = ( H + (H>>2) ) >> 4;
    V = ( V + (V>>2) ) >> 4;
  }else{
    H = ( 5*H+32 ) >> 6;
    V = ( 5*V+32 ) >> 6;
  }

  /* After the loop src1 is the left neighbour of row 15 and src2 the
   * top-left corner, so src2[16] is the sample above column 15. */
  a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
  for(j=16; j>0; --j) {
    int b = a;
    a += V;
    for(i=-16; i<0; i+=4) {
      src[16+i] = CLIP((b    ) >> 5);
      src[17+i] = CLIP((b+  H) >> 5);
      src[18+i] = CLIP((b+2*H) >> 5);
      src[19+i] = CLIP((b+3*H) >> 5);
      b += 4*H;
    }
    src += stride;
  }
}

/**
 * 16x16 plane prediction: pred16x16_plane_compat with both svq3 and rv40
 * set to 0.
 */
static void FUNCC(pred16x16_plane)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred16x16_plane_compat)(src, stride, 0, 0);
}

/**
 * 8x8 vertical prediction: copies the 8 samples above the block into each of
 * the 8 rows.
 */
static void FUNCC(pred8x8_vertical)(uint8_t *_src, ptrdiff_t _stride)
{
    int i;
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
    const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);

    for(i=0; i<8; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
    }
}

/**
 * 8x16 (8 wide, 16 high) vertical prediction: copies the 8 samples above the
 * block into each of the 16 rows.
 */
static void FUNCC(pred8x16_vertical)(uint8_t *_src, ptrdiff_t _stride)
{
    int i;
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
    const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);

    for(i=0; i<16; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
    }
}

/**
 * 8x8 horizontal prediction: each of the 8 rows is filled with the sample
 * directly to its left.
 */
static void FUNCC(pred8x8_horizontal)(uint8_t *_src, ptrdiff_t stride)
{
    int i;
    pixel *src = (pixel*)_src;
    stride >>= sizeof(pixel)-1;

    for(i=0; i<8; i++){
        const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
    }
}

/**
 * 8x16 horizontal prediction: each of the 16 rows is filled with the sample
 * directly to its left.
 */
static void FUNCC(pred8x16_horizontal)(uint8_t *_src, ptrdiff_t stride)
{
    int i;
    pixel *src = (pixel*)_src;
    stride >>= sizeof(pixel)-1;
    for(i=0; i<16; i++){
        const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
    }
}

/*
 * Defines pred8x8_<n>_dc, which fills the 8x8 block with the constant `v`
 * without reading any neighbouring sample.
 */
#define PRED8x8_X(n, v)\
static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
{\
    int i;\
    const pixel4 a = PIXEL_SPLAT_X4(v);\
    pixel *src = (pixel*)_src;\
    stride >>= sizeof(pixel)-1;\
    for(i=0; i<8; i++){\
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);\
        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);\
    }\
}

PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1)
PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0)
PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1)

/**
 * 8x16 constant prediction: runs pred8x8_128_dc on the upper and the lower
 * 8x8 half. The offset 8*stride is applied to the uint8_t pointer with the
 * unshifted stride.
 */
static void FUNCC(pred8x16_128_dc)(uint8_t *_src, ptrdiff_t stride)
{
    FUNCC(pred8x8_128_dc)(_src, stride);
    FUNCC(pred8x8_128_dc)(_src+8*stride, stride);
}

/**
 * 8x8 DC prediction from the left column: rows 0..3 are filled with the
 * rounded mean of left samples 0..3 and rows 4..7 with the rounded mean of
 * left samples 4..7 (each (sum + 2) >> 2).
 */
static void FUNCC(pred8x8_left_dc)(uint8_t *_src, ptrdiff_t stride)
{
    int i;
    int dc0, dc2;
    pixel4 dc0splat, dc2splat;
    pixel *src = (pixel*)_src;
    stride >>= sizeof(pixel)-1;

    dc0=dc2=0;
    for(i=0;i<4; i++){
        dc0+= src[-1+i*stride];
        dc2+= src[-1+(i+4)*stride];
    }
    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);

    for(i=0; i<4; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc0splat);
    }
    for(i=4; i<8; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc2splat);
    }
}

/**
 * 8x16 DC prediction from the left column: runs pred8x8_left_dc on the upper
 * and the lower 8x8 half, giving four horizontal bands of four rows.
 */
static void FUNCC(pred8x16_left_dc)(uint8_t *_src, ptrdiff_t stride)
{
    FUNCC(pred8x8_left_dc)(_src, stride);
    FUNCC(pred8x8_left_dc)(_src+8*stride, stride);
}

/**
 * 8x8 DC prediction from the top row: columns 0..3 are filled with the
 * rounded mean of top samples 0..3 and columns 4..7 with the rounded mean of
 * top samples 4..7. Both row loops store the same pair of values.
 */
static void FUNCC(pred8x8_top_dc)(uint8_t *_src, ptrdiff_t stride)
{
    int i;
    int dc0, dc1;
    pixel4 dc0splat, dc1splat;
    pixel *src = (pixel*)_src;
    stride >>= sizeof(pixel)-1;

    dc0=dc1=0;
    for(i=0;i<4; i++){
        dc0+= src[i-stride];
        dc1+= src[4+i-stride];
    }
    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);

    for(i=0; i<4; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
    }
    for(i=4; i<8; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
    }
}

/**
 * 8x16 DC prediction from the top row: like pred8x8_top_dc, but the two
 * column means are written to all 16 rows.
 */
static void FUNCC(pred8x16_top_dc)(uint8_t *_src, ptrdiff_t stride)
{
    int i;
    int dc0, dc1;
    pixel4 dc0splat, dc1splat;
    pixel *src = (pixel*)_src;
    stride >>= sizeof(pixel)-1;

    dc0=dc1=0;
    for(i=0;i<4; i++){
        dc0+= src[i-stride];
        dc1+= src[4+i-stride];
    }
    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);

    for(i=0; i<16; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
    }
}

/**
 * 8x8 DC prediction computed per 4x4 quadrant:
 *  - top-left:     (left 0..3 + top 0..3 + 4) >> 3
 *  - top-right:    (top 4..7 + 2) >> 2
 *  - bottom-left:  (left 4..7 + 2) >> 2
 *  - bottom-right: (top 4..7 + left 4..7 + 4) >> 3
 */
static void FUNCC(pred8x8_dc)(uint8_t *_src, ptrdiff_t stride)
{
    int i;
    int dc0, dc1, dc2;
    pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
    pixel *src = (pixel*)_src;
    stride >>= sizeof(pixel)-1;

    dc0=dc1=dc2=0;
    for(i=0;i<4; i++){
        dc0+= src[-1+i*stride] + src[i-stride];
        dc1+= src[4+i-stride];
        dc2+= src[-1+(i+4)*stride];
    }
    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
    dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);

    for(i=0; i<4; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
    }
    for(i=4; i<8; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat);
    }
}

/**
 * 8x16 DC prediction computed per 4x4 block (two columns, four bands of four
 * rows). The first band matches pred8x8_dc. In each lower band the left
 * block uses the mean of that band's four left samples, and the right block
 * combines top samples 4..7 (dc1) with that band's left samples.
 */
static void FUNCC(pred8x16_dc)(uint8_t *_src, ptrdiff_t stride)
{
    int i;
    int dc0, dc1, dc2, dc3, dc4;
    pixel4 dc0splat, dc1splat, dc2splat, dc3splat, dc4splat, dc5splat, dc6splat, dc7splat;
    pixel *src = (pixel*)_src;
    stride >>= sizeof(pixel)-1;

    dc0=dc1=dc2=dc3=dc4=0;
    for(i=0;i<4; i++){
        dc0+= src[-1+i*stride] + src[i-stride];
        dc1+= src[4+i-stride];
        dc2+= src[-1+(i+4)*stride];
        dc3+= src[-1+(i+8)*stride];
        dc4+= src[-1+(i+12)*stride];
    }
    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
    dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
    dc4splat = PIXEL_SPLAT_X4((dc3 + 2)>>2);
    dc5splat = PIXEL_SPLAT_X4((dc1 + dc3 + 4)>>3);
    dc6splat = PIXEL_SPLAT_X4((dc4 + 2)>>2);
    dc7splat = PIXEL_SPLAT_X4((dc1 + dc4 + 4)>>3);

    for(i=0; i<4; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
    }
    for(i=4; i<8; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat);
    }
    for(i=8; i<12; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc4splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc5splat);
    }
    for(i=12; i<16; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc6splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc7splat);
    }
}

/*
 * The following 8 functions should not be optimized!
 *
 * Each one runs a whole-block DC predictor and then overwrites one or two
 * 4x4 sub-blocks with a different 4x4 predictor. The 4x4 calls pass NULL for
 * `topright`, which those predictors never read. Offsets added to `src` here
 * are on the uint8_t pointer with the unshifted stride, hence the
 * sizeof(pixel) factor on horizontal offsets.
 */

/** 8x8: pred8x8_top_dc, then the top-left 4x4 is redone with pred4x4_dc. */
static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x8_top_dc)(src, stride);
    FUNCC(pred4x4_dc)(src, NULL, stride);
}

/** 8x16: pred8x16_top_dc, then the top-left 4x4 is redone with pred4x4_dc. */
static void FUNC(pred8x16_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x16_top_dc)(src, stride);
    FUNCC(pred4x4_dc)(src, NULL, stride);
}

/** 8x8: pred8x8_dc, then the top-left 4x4 is redone with pred4x4_top_dc. */
static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x8_dc)(src, stride);
    FUNCC(pred4x4_top_dc)(src, NULL, stride);
}

/** 8x16: pred8x16_dc, then the top-left 4x4 is redone with pred4x4_top_dc. */
static void FUNC(pred8x16_mad_cow_dc_0lt)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x16_dc)(src, stride);
    FUNCC(pred4x4_top_dc)(src, NULL, stride);
}

/**
 * 8x8: pred8x8_left_dc, then both 4x4 blocks of rows 4..7 are overwritten
 * with pred4x4_128_dc.
 */
static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x8_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
}

/**
 * 8x16: pred8x16_left_dc, then both 4x4 blocks of rows 4..7 are overwritten
 * with pred4x4_128_dc (rows 8..15 keep the left-DC values).
 */
static void FUNC(pred8x16_mad_cow_dc_l00)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x16_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
}

/**
 * 8x8: pred8x8_left_dc, then both 4x4 blocks of rows 0..3 are overwritten
 * with pred4x4_128_dc.
 */
static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x8_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
}

/**
 * 8x16: pred8x16_left_dc, then both 4x4 blocks of rows 0..3 are overwritten
 * with pred4x4_128_dc.
 */
static void FUNC(pred8x16_mad_cow_dc_0l0)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x16_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
}

/**
 * 8x8 plane prediction. H and V accumulate k * (difference of the samples k
 * positions either side of the edge centre) for k = 1..4 along the row above
 * and the left column, and are scaled with (17*x + 16) >> 5. The sample at
 * column x, row y is CLIP((a + x*H + y*V) >> 5).
 */
static void FUNCC(pred8x8_plane)(uint8_t *_src, ptrdiff_t _stride)
{
  int j, k;
  int a;
  INIT_CLIP
  pixel *src = (pixel*)_src;
  int stride = _stride>>(sizeof(pixel)-1);
  const pixel * const src0 = src +3-stride;
  const pixel *       src1 = src +4*stride-1;
  const pixel *       src2 = src1-2*stride;    // == src+2*stride-1;
  int H = src0[1] - src0[-1];
  int V = src1[0] - src2[ 0];
  for(k=2; k<=4; ++k) {
    src1 += stride; src2 -= stride;
    H += k*(src0[k] - src0[-k]);
    V += k*(src1[0] - src2[ 0]);
  }
  H = ( 17*H+16 ) >> 5;
  V = ( 17*V+16 ) >> 5;

  /* src1 is now the left neighbour of row 7, src2 the top-left corner, so
   * src2[8] is the sample above column 7. */
  a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
  for(j=8; j>0; --j) {
    int b = a;
    a += V;
    src[0] = CLIP((b    ) >> 5);
    src[1] = CLIP((b+  H) >> 5);
    src[2] = CLIP((b+2*H) >> 5);
    src[3] = CLIP((b+3*H) >> 5);
    src[4] = CLIP((b+4*H) >> 5);
    src[5] = CLIP((b+5*H) >> 5);
    src[6] = CLIP((b+6*H) >> 5);
    src[7] = CLIP((b+7*H) >> 5);
    src += stride;
  }
}

/**
 * 8x16 plane prediction. H is accumulated for k = 1..4 along the row above
 * (as in the 8x8 case) and scaled with (17*H + 16) >> 5; V is accumulated for
 * k = 1..8 along the 16-sample left column and scaled with (5*V + 32) >> 6.
 * The sample at column x, row y is CLIP((a + x*H + y*V) >> 5).
 */
static void FUNCC(pred8x16_plane)(uint8_t *_src, ptrdiff_t _stride)
{
  int j, k;
  int a;
  INIT_CLIP
  pixel *src = (pixel*)_src;
  int stride = _stride>>(sizeof(pixel)-1);
  const pixel * const src0 = src +3-stride;
  const pixel *       src1 = src +8*stride-1;
  const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
  int H = src0[1] - src0[-1];
  int V = src1[0] - src2[ 0];

  for (k = 2; k <= 4; ++k) {
      src1 += stride; src2 -= stride;
      H += k*(src0[k] - src0[-k]);
      V += k*(src1[0] - src2[ 0]);
  }
  /* Only the vertical gradient has terms beyond k = 4. */
  for (; k <= 8; ++k) {
      src1 += stride; src2 -= stride;
      V += k*(src1[0] - src2[0]);
  }

  H = (17*H+16) >> 5;
  V = (5*V+32) >> 6;

  a = 16*(src1[0] + src2[8] + 1) - 7*V - 3*H;
  for(j=16; j>0; --j) {
    int b = a;
    a += V;
    src[0] = CLIP((b    ) >> 5);
    src[1] = CLIP((b+  H) >> 5);
    src[2] = CLIP((b+2*H) >> 5);
    src[3] = CLIP((b+3*H) >> 5);
    src[4] = CLIP((b+4*H) >> 5);
    src[5] = CLIP((b+5*H) >> 5);
    src[6] = CLIP((b+6*H) >> 5);
    src[7] = CLIP((b+7*H) >> 5);
    src += stride;
  }
}

/*
 * Helpers for the pred8x8l_* predictors. They expect `src`, `stride`,
 * `has_topleft` and `has_topright` to be in scope.
 *
 * SRC(x,y) addresses the sample at column x, row y relative to the block's
 * top-left sample; x == -1 is the left neighbour column and y == -1 the row
 * above.
 */
#define SRC(x,y) src[(x)+(y)*stride]
/* l<y>: left neighbour of row y smoothed with a rounded [1 2 1]/4 filter. */
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
/*
 * Declares l0..l7, the filtered left column. l0 uses the top-left corner
 * only when has_topleft is set (otherwise SRC(-1,0) stands in for it), and
 * l7 weights SRC(-1,7) three times because no sample below it is read.
 */
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

/* t<x>: sample above column x smoothed with a rounded [1 2 1]/4 filter. */
#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/*
 * Declares t0..t7, the filtered top row. t0 uses the top-left corner only
 * when has_topleft is set, and t7 uses SRC(8,-1) only when has_topright is
 * set; otherwise the nearest in-row sample is repeated.
 */
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

/* Assigns (does not declare) t<x> using the same [1 2 1]/4 filter as PT. */
#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/*
 * Declares t8..t15, the filtered samples above-right of the block. When
 * has_topright is zero they are all set to the unfiltered SRC(7,-1) and no
 * sample beyond column 7 is read.
 */
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

/*
 * Declares lt, the filtered top-left corner. It reads SRC(-1,-1)
 * unconditionally, whatever the value of has_topleft.
 */
#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/*
 * Declares `int y` and fills 8 rows of 8 samples with the pixel4 value `v`,
 * advancing `src` by 8 rows as a side effect.
 */
#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        AV_WN4PA(((pixel4*)src)+0, v); \
        AV_WN4PA(((pixel4*)src)+1, v); \
        src += stride; \
    }

/**
 * 8x8l constant prediction: fills the block with 1 << (BIT_DEPTH-1).
 * has_topleft and has_topright are not used.
 */
static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft,
                                   int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);

    PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));
}
/**
 * 8x8l DC prediction from the filtered left column: fills the block with
 * (l0 + ... + l7 + 4) >> 3.
 */
static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft,
                                    int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);

    PREDICT_8x8_LOAD_LEFT;
    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);
    PREDICT_8x8_DC(dc);
}
/**
 * 8x8l DC prediction from the filtered top row: fills the block with
 * (t0 + ... + t7 + 4) >> 3.
 */
static void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft,
                                   int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);

    PREDICT_8x8_LOAD_TOP;
    const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);
    PREDICT_8x8_DC(dc);
}
/**
 * 8x8l DC prediction: fills the block with the rounded mean,
 * (sum + 8) >> 4, of the eight filtered left and eight filtered top samples.
 */
static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft,
                               int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);

    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOP;
    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7
                                     +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4);
    PREDICT_8x8_DC(dc);
}
/**
 * 8x8l horizontal prediction: row y is filled with l<y>, the left neighbour
 * of that row after the [1 2 1]/4 smoothing done by PREDICT_8x8_LOAD_LEFT.
 * The stride is shifted right by sizeof(pixel)-1 before being used as an
 * index into `src`. has_topright is not used.
 */
static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft,
                                       int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    pixel4 a;

    PREDICT_8x8_LOAD_LEFT;
    /* ROW(y): splat l<y> and store it to both 4-sample halves of row y. */
#define ROW(y) a = PIXEL_SPLAT_X4(l##y); \
               AV_WN4PA(src+y*stride, a); \
               AV_WN4PA(src+y*stride+4, a);
    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
#undef ROW
}
/**
 * 8x8l vertical prediction: writes the filtered top samples t0..t7 into
 * row 0 of the block, reads that row back as two pixel4 values and copies
 * them into rows 1..7.
 */
static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft,
                                     int has_topright, ptrdiff_t _stride)
{
    int y;
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    pixel4 a, b;

    PREDICT_8x8_LOAD_TOP;
    src[0] = t0;
    src[1] = t1;
    src[2] = t2;
    src[3] = t3;
    src[4] = t4;
    src[5] = t5;
    src[6] = t6;
    src[7] = t7;
    a = AV_RN4PA(((pixel4*)src)+0);
    b = AV_RN4PA(((pixel4*)src)+1);
    for( y = 1; y < 8; y++ ) {
        AV_WN4PA(((pixel4*)(src+y*stride))+0, a);
        AV_WN4PA(((pixel4*)(src+y*stride))+1, b);
    }
}
/**
 * 8x8l diagonal down-left prediction. Uses the filtered top row t0..t7 and
 * the filtered top-right samples t8..t15; samples on the same anti-diagonal
 * (equal x + y) share one three-sample average. The final sample weights t15
 * three times.
 */
static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft,
                                      int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
    SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
}
/**
 * 8x8l diagonal down-right prediction. Uses the filtered left column, the
 * filtered top-left corner (lt) and the filtered top row; samples on the
 * same down-right diagonal (equal x - y) share one three-sample average.
 */
static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft,
                                       int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
}
/**
 * 8x8l vertical-right prediction. Uses the filtered top row, top-left corner
 * and left column (l7 is loaded but never read). A value is shared by
 * samples one column right and two rows down from each other; two-sample
 * averages ((a + b + 1) >> 1) and three-sample averages
 * ((a + 2*b + c + 2) >> 2) alternate between rows.
 */
static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft,
                                           int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(7,0)= (t6 + t7 + 1) >> 1;
}
/**
 * 8x8l horizontal-down prediction. Uses the filtered left column, top-left
 * corner and top row (t7 is loaded but never read). A value is shared by
 * samples two columns right and one row down from each other; even columns
 * fed by the left edge hold two-sample averages and odd columns three-sample
 * averages, while row 0 from column 2 onwards uses three-sample averages of
 * the top edge.
 */
static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft,
                                            int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,7)= (l6 + l7 + 1) >> 1;
    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
}
/**
 * 8x8l vertical-left prediction. Uses the filtered top row t0..t7 and the
 * filtered top-right samples (t8..t12 are read; t13..t15 are computed but
 * unused). A value is shared by samples one column right and two rows up
 * from each other; even rows hold two-sample averages and odd rows
 * three-sample averages.
 */
static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft,
                                          int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    SRC(0,0)= (t0 + t1 + 1) >> 1;
    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
    SRC(7,6)= (t10 + t11 + 1) >> 1;
    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
}
/**
 * 8x8l horizontal-up prediction. Uses only the filtered left column l0..l7.
 * A value is shared by samples two columns right and one row up from each
 * other; even columns hold two-sample averages and odd columns three-sample
 * averages. Once the left edge is exhausted the remaining samples are set
 * to l7. has_topright is not used.
 */
static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft,
                                          int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_LEFT;
    SRC(0,0)= (l0 + l1 + 1) >> 1;
    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
}

static void FUNCC(pred8x8l_vertical_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
                                     int has_topright, ptrdiff_t _stride)
{
    int i;
    pixel *src = (pixel*)_src;
    const dctcoef *block = (const dctcoef*)_block;
    pixel pix[8];
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;

    pix[0] = t0;
    pix[1] = t1;
1140 pix[2] = t2; 1141 pix[3] = t3; 1142 pix[4] = t4; 1143 pix[5] = t5; 1144 pix[6] = t6; 1145 pix[7] = t7; 1146 1147 for(i=0; i<8; i++){ 1148 pixel v = pix[i]; 1149 src[0*stride]= v += block[0]; 1150 src[1*stride]= v += block[8]; 1151 src[2*stride]= v += block[16]; 1152 src[3*stride]= v += block[24]; 1153 src[4*stride]= v += block[32]; 1154 src[5*stride]= v += block[40]; 1155 src[6*stride]= v += block[48]; 1156 src[7*stride]= v + block[56]; 1157 src++; 1158 block++; 1159 } 1160 1161 memset(_block, 0, sizeof(dctcoef) * 64); 1162} 1163 1164static void FUNCC(pred8x8l_horizontal_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft, 1165 int has_topright, ptrdiff_t _stride) 1166{ 1167 int i; 1168 pixel *src = (pixel*)_src; 1169 const dctcoef *block = (const dctcoef*)_block; 1170 pixel pix[8]; 1171 int stride = _stride>>(sizeof(pixel)-1); 1172 PREDICT_8x8_LOAD_LEFT; 1173 1174 pix[0] = l0; 1175 pix[1] = l1; 1176 pix[2] = l2; 1177 pix[3] = l3; 1178 pix[4] = l4; 1179 pix[5] = l5; 1180 pix[6] = l6; 1181 pix[7] = l7; 1182 1183 for(i=0; i<8; i++){ 1184 pixel v = pix[i]; 1185 src[0]= v += block[0]; 1186 src[1]= v += block[1]; 1187 src[2]= v += block[2]; 1188 src[3]= v += block[3]; 1189 src[4]= v += block[4]; 1190 src[5]= v += block[5]; 1191 src[6]= v += block[6]; 1192 src[7]= v + block[7]; 1193 src+= stride; 1194 block+= 8; 1195 } 1196 1197 memset(_block, 0, sizeof(dctcoef) * 64); 1198} 1199 1200#undef PREDICT_8x8_LOAD_LEFT 1201#undef PREDICT_8x8_LOAD_TOP 1202#undef PREDICT_8x8_LOAD_TOPLEFT 1203#undef PREDICT_8x8_LOAD_TOPRIGHT 1204#undef PREDICT_8x8_DC 1205#undef PTR 1206#undef PT 1207#undef PL 1208#undef SRC 1209 1210static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, int16_t *_block, 1211 ptrdiff_t stride) 1212{ 1213 int i; 1214 pixel *pix = (pixel*)_pix; 1215 const dctcoef *block = (const dctcoef*)_block; 1216 stride >>= sizeof(pixel)-1; 1217 pix -= stride; 1218 for(i=0; i<4; i++){ 1219 pixel v = pix[0]; 1220 pix[1*stride]= v += block[0]; 1221 pix[2*stride]= v += 
block[4]; 1222 pix[3*stride]= v += block[8]; 1223 pix[4*stride]= v + block[12]; 1224 pix++; 1225 block++; 1226 } 1227 1228 memset(_block, 0, sizeof(dctcoef) * 16); 1229} 1230 1231static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, int16_t *_block, 1232 ptrdiff_t stride) 1233{ 1234 int i; 1235 pixel *pix = (pixel*)_pix; 1236 const dctcoef *block = (const dctcoef*)_block; 1237 stride >>= sizeof(pixel)-1; 1238 for(i=0; i<4; i++){ 1239 pixel v = pix[-1]; 1240 pix[0]= v += block[0]; 1241 pix[1]= v += block[1]; 1242 pix[2]= v += block[2]; 1243 pix[3]= v + block[3]; 1244 pix+= stride; 1245 block+= 4; 1246 } 1247 1248 memset(_block, 0, sizeof(dctcoef) * 16); 1249} 1250 1251static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, int16_t *_block, 1252 ptrdiff_t stride) 1253{ 1254 int i; 1255 pixel *pix = (pixel*)_pix; 1256 const dctcoef *block = (const dctcoef*)_block; 1257 stride >>= sizeof(pixel)-1; 1258 pix -= stride; 1259 for(i=0; i<8; i++){ 1260 pixel v = pix[0]; 1261 pix[1*stride]= v += block[0]; 1262 pix[2*stride]= v += block[8]; 1263 pix[3*stride]= v += block[16]; 1264 pix[4*stride]= v += block[24]; 1265 pix[5*stride]= v += block[32]; 1266 pix[6*stride]= v += block[40]; 1267 pix[7*stride]= v += block[48]; 1268 pix[8*stride]= v + block[56]; 1269 pix++; 1270 block++; 1271 } 1272 1273 memset(_block, 0, sizeof(dctcoef) * 64); 1274} 1275 1276static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, int16_t *_block, 1277 ptrdiff_t stride) 1278{ 1279 int i; 1280 pixel *pix = (pixel*)_pix; 1281 const dctcoef *block = (const dctcoef*)_block; 1282 stride >>= sizeof(pixel)-1; 1283 for(i=0; i<8; i++){ 1284 pixel v = pix[-1]; 1285 pix[0]= v += block[0]; 1286 pix[1]= v += block[1]; 1287 pix[2]= v += block[2]; 1288 pix[3]= v += block[3]; 1289 pix[4]= v += block[4]; 1290 pix[5]= v += block[5]; 1291 pix[6]= v += block[6]; 1292 pix[7]= v + block[7]; 1293 pix+= stride; 1294 block+= 8; 1295 } 1296 1297 memset(_block, 0, sizeof(dctcoef) * 64); 1298} 1299 1300static void 
FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, 1301 int16_t *block, 1302 ptrdiff_t stride) 1303{ 1304 int i; 1305 for(i=0; i<16; i++) 1306 FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); 1307} 1308 1309static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, 1310 const int *block_offset, 1311 int16_t *block, 1312 ptrdiff_t stride) 1313{ 1314 int i; 1315 for(i=0; i<16; i++) 1316 FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); 1317} 1318 1319static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, 1320 int16_t *block, ptrdiff_t stride) 1321{ 1322 int i; 1323 for(i=0; i<4; i++) 1324 FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); 1325} 1326 1327static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset, 1328 int16_t *block, ptrdiff_t stride) 1329{ 1330 int i; 1331 for(i=0; i<4; i++) 1332 FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); 1333 for(i=4; i<8; i++) 1334 FUNCC(pred4x4_vertical_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride); 1335} 1336 1337static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, 1338 int16_t *block, 1339 ptrdiff_t stride) 1340{ 1341 int i; 1342 for(i=0; i<4; i++) 1343 FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); 1344} 1345 1346static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix, 1347 const int *block_offset, 1348 int16_t *block, ptrdiff_t stride) 1349{ 1350 int i; 1351 for(i=0; i<4; i++) 1352 FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); 1353 for(i=4; i<8; i++) 1354 FUNCC(pred4x4_horizontal_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride); 1355} 1356