1/* 2 * DSP utils 3 * Copyright (c) 2000, 2001 Fabrice Bellard 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> 5 * 6 * This file is part of FFmpeg. 7 * 8 * FFmpeg is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU Lesser General Public 10 * License as published by the Free Software Foundation; either 11 * version 2.1 of the License, or (at your option) any later version. 12 * 13 * FFmpeg is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * Lesser General Public License for more details. 17 * 18 * You should have received a copy of the GNU Lesser General Public 19 * License along with FFmpeg; if not, write to the Free Software 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21 */ 22 23/** 24 * @file 25 * DSP utils 26 */ 27 28#include "libavutil/attributes.h" 29#include "libavutil/internal.h" 30#include "avcodec.h" 31#include "copy_block.h" 32#include "dsputil.h" 33#include "simple_idct.h" 34#include "mpegvideo.h" 35#include "config.h" 36 37uint32_t ff_square_tab[512] = { 0, }; 38 39static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, 40 int line_size, int h) 41{ 42 int s = 0, i; 43 uint32_t *sq = ff_square_tab + 256; 44 45 for (i = 0; i < h; i++) { 46 s += sq[pix1[0] - pix2[0]]; 47 s += sq[pix1[1] - pix2[1]]; 48 s += sq[pix1[2] - pix2[2]]; 49 s += sq[pix1[3] - pix2[3]]; 50 pix1 += line_size; 51 pix2 += line_size; 52 } 53 return s; 54} 55 56static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, 57 int line_size, int h) 58{ 59 int s = 0, i; 60 uint32_t *sq = ff_square_tab + 256; 61 62 for (i = 0; i < h; i++) { 63 s += sq[pix1[0] - pix2[0]]; 64 s += sq[pix1[1] - pix2[1]]; 65 s += sq[pix1[2] - pix2[2]]; 66 s += sq[pix1[3] - pix2[3]]; 67 s += sq[pix1[4] - pix2[4]]; 68 s += sq[pix1[5] - pix2[5]]; 69 s += sq[pix1[6] - pix2[6]]; 70 s += sq[pix1[7] - pix2[7]]; 71 pix1 += line_size; 72 pix2 += line_size; 73 } 74 return s; 75} 76 77static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, 78 int line_size, int h) 79{ 80 int s = 0, i; 81 uint32_t *sq = ff_square_tab + 256; 82 83 for (i = 0; i < h; i++) { 84 s += sq[pix1[0] - pix2[0]]; 85 s += sq[pix1[1] - pix2[1]]; 86 s += sq[pix1[2] - pix2[2]]; 87 s += sq[pix1[3] - pix2[3]]; 88 s += sq[pix1[4] - pix2[4]]; 89 s += sq[pix1[5] - pix2[5]]; 90 s += sq[pix1[6] - pix2[6]]; 91 s += sq[pix1[7] - pix2[7]]; 92 s += sq[pix1[8] - pix2[8]]; 93 s += sq[pix1[9] - pix2[9]]; 94 s += sq[pix1[10] - pix2[10]]; 95 s += sq[pix1[11] - pix2[11]]; 96 s += sq[pix1[12] - pix2[12]]; 97 s += sq[pix1[13] - pix2[13]]; 98 s += sq[pix1[14] - pix2[14]]; 99 s += sq[pix1[15] - pix2[15]]; 100 101 pix1 += line_size; 102 pix2 += line_size; 103 } 104 return s; 105} 106 107static int sum_abs_dctelem_c(int16_t *block) 108{ 109 int sum = 0, i; 110 111 for (i = 0; i < 64; i++) 112 sum += FFABS(block[i]); 113 return sum; 114} 115 116#define avg2(a, b) ((a + b + 1) >> 1) 117#define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2) 118 119static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, 120 int line_size, int h) 121{ 122 int s = 0, i; 123 124 for (i = 0; i < h; i++) { 125 s += abs(pix1[0] - pix2[0]); 126 s += abs(pix1[1] - pix2[1]); 127 s += abs(pix1[2] - pix2[2]); 128 s += abs(pix1[3] - pix2[3]); 129 s += abs(pix1[4] - pix2[4]); 130 s += abs(pix1[5] - pix2[5]); 131 s += abs(pix1[6] - pix2[6]); 132 s += abs(pix1[7] - pix2[7]); 133 s += abs(pix1[8] - pix2[8]); 134 s += abs(pix1[9] - pix2[9]); 135 s += abs(pix1[10] - pix2[10]); 136 s += abs(pix1[11] - pix2[11]); 137 s += abs(pix1[12] - pix2[12]); 138 s += abs(pix1[13] - pix2[13]); 139 s += abs(pix1[14] - pix2[14]); 140 s += abs(pix1[15] - pix2[15]); 141 pix1 += line_size; 142 pix2 += line_size; 143 } 144 return s; 145} 146 147static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, 148 int line_size, int h) 149{ 150 int s = 0, i; 151 152 for (i = 0; i < h; i++) { 153 s += abs(pix1[0] - avg2(pix2[0], pix2[1])); 154 s += abs(pix1[1] - avg2(pix2[1], pix2[2])); 155 s += abs(pix1[2] - avg2(pix2[2], pix2[3])); 156 s += abs(pix1[3] - avg2(pix2[3], pix2[4])); 157 s += abs(pix1[4] - avg2(pix2[4], pix2[5])); 158 s += abs(pix1[5] - avg2(pix2[5], pix2[6])); 159 s += abs(pix1[6] - avg2(pix2[6], pix2[7])); 160 s += abs(pix1[7] - avg2(pix2[7], pix2[8])); 161 s += abs(pix1[8] - avg2(pix2[8], pix2[9])); 162 s += abs(pix1[9] - avg2(pix2[9], pix2[10])); 163 s += abs(pix1[10] - avg2(pix2[10], pix2[11])); 164 s += abs(pix1[11] - avg2(pix2[11], pix2[12])); 165 s += abs(pix1[12] - avg2(pix2[12], pix2[13])); 166 s += abs(pix1[13] - avg2(pix2[13], pix2[14])); 167 s += abs(pix1[14] - avg2(pix2[14], pix2[15])); 168 s += abs(pix1[15] - avg2(pix2[15], pix2[16])); 169 pix1 += line_size; 170 pix2 += line_size; 171 } 172 return s; 173} 174 175static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, 176 int line_size, int h) 177{ 178 int s = 0, i; 179 uint8_t *pix3 = pix2 + line_size; 180 181 for (i = 0; i < h; i++) { 182 s += abs(pix1[0] - avg2(pix2[0], pix3[0])); 183 s += abs(pix1[1] - avg2(pix2[1], pix3[1])); 184 s += abs(pix1[2] - avg2(pix2[2], pix3[2])); 185 s += abs(pix1[3] - avg2(pix2[3], pix3[3])); 186 s += abs(pix1[4] - avg2(pix2[4], pix3[4])); 187 s += abs(pix1[5] - avg2(pix2[5], pix3[5])); 188 s += abs(pix1[6] - avg2(pix2[6], pix3[6])); 189 s += abs(pix1[7] - avg2(pix2[7], pix3[7])); 190 s += abs(pix1[8] - avg2(pix2[8], pix3[8])); 191 s += abs(pix1[9] - avg2(pix2[9], pix3[9])); 192 s += abs(pix1[10] - avg2(pix2[10], pix3[10])); 193 s += abs(pix1[11] - avg2(pix2[11], pix3[11])); 194 s += abs(pix1[12] - avg2(pix2[12], pix3[12])); 195 s += abs(pix1[13] - avg2(pix2[13], pix3[13])); 196 s += abs(pix1[14] - avg2(pix2[14], pix3[14])); 197 s += abs(pix1[15] - avg2(pix2[15], pix3[15])); 198 pix1 += line_size; 199 pix2 += line_size; 200 pix3 += line_size; 201 } 202 return s; 203} 204 205static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, 206 int line_size, int h) 207{ 208 int s = 0, i; 209 uint8_t *pix3 = pix2 + line_size; 210 211 for (i = 0; i < h; i++) { 212 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1])); 213 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2])); 214 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3])); 215 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4])); 216 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5])); 217 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6])); 218 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7])); 219 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8])); 220 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9])); 221 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10])); 222 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11])); 223 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12])); 224 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13])); 225 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14])); 226 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15])); 227 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16])); 228 pix1 += line_size; 229 pix2 += line_size; 230 pix3 += line_size; 231 } 232 return s; 233} 234 235static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, 236 int line_size, int h) 237{ 238 int s = 0, i; 239 240 for (i = 0; i < h; i++) { 241 s += abs(pix1[0] - pix2[0]); 242 s += abs(pix1[1] - pix2[1]); 243 s += abs(pix1[2] - pix2[2]); 244 s += abs(pix1[3] - pix2[3]); 245 s += abs(pix1[4] - pix2[4]); 246 s += abs(pix1[5] - pix2[5]); 247 s += abs(pix1[6] - pix2[6]); 248 s += abs(pix1[7] - pix2[7]); 249 pix1 += line_size; 250 pix2 += line_size; 251 } 252 return s; 253} 254 255static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, 256 int line_size, int h) 257{ 258 int s = 0, i; 259 260 for (i = 0; i < h; i++) { 261 s += abs(pix1[0] - avg2(pix2[0], pix2[1])); 262 s += abs(pix1[1] - avg2(pix2[1], pix2[2])); 263 s += abs(pix1[2] - avg2(pix2[2], pix2[3])); 264 s += abs(pix1[3] - avg2(pix2[3], pix2[4])); 265 s += abs(pix1[4] - avg2(pix2[4], pix2[5])); 266 s += abs(pix1[5] - avg2(pix2[5], pix2[6])); 267 s += abs(pix1[6] - avg2(pix2[6], pix2[7])); 268 s += abs(pix1[7] - avg2(pix2[7], pix2[8])); 269 pix1 += line_size; 270 pix2 += line_size; 271 } 272 return s; 273} 274 275static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, 276 int line_size, int h) 277{ 278 int s = 0, i; 279 uint8_t *pix3 = pix2 + line_size; 280 281 for (i = 0; i < h; i++) { 282 s += abs(pix1[0] - avg2(pix2[0], pix3[0])); 283 s += abs(pix1[1] - avg2(pix2[1], pix3[1])); 284 s += abs(pix1[2] - avg2(pix2[2], pix3[2])); 285 s += abs(pix1[3] - avg2(pix2[3], pix3[3])); 286 s += abs(pix1[4] - avg2(pix2[4], pix3[4])); 287 s += abs(pix1[5] - avg2(pix2[5], pix3[5])); 288 s += abs(pix1[6] - avg2(pix2[6], pix3[6])); 289 s += abs(pix1[7] - avg2(pix2[7], pix3[7])); 290 pix1 += line_size; 291 pix2 += line_size; 292 pix3 += line_size; 293 } 294 return s; 295} 296 297static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, 298 int line_size, int h) 299{ 300 int s = 0, i; 301 uint8_t *pix3 = pix2 + line_size; 302 303 for (i = 0; i < h; i++) { 304 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1])); 305 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2])); 306 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3])); 307 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4])); 308 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5])); 309 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6])); 310 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7])); 311 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8])); 312 pix1 += line_size; 313 pix2 += line_size; 314 pix3 += line_size; 315 } 316 return s; 317} 318 319static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h) 320{ 321 int score1 = 0, score2 = 0, x, y; 322 323 for (y = 0; y < h; y++) { 324 for (x = 0; x < 16; x++) 325 score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]); 326 if (y + 1 < h) { 327 for (x = 0; x < 15; x++) 328 score2 += FFABS(s1[x] - s1[x + stride] - 329 s1[x + 1] + s1[x + stride + 1]) - 330 FFABS(s2[x] - s2[x + stride] - 331 s2[x + 1] + s2[x + stride + 1]); 332 } 333 s1 += stride; 334 s2 += stride; 335 } 336 337 if (c) 338 return score1 + FFABS(score2) * c->avctx->nsse_weight; 339 else 340 return score1 + FFABS(score2) * 8; 341} 342 343static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h) 344{ 345 int score1 = 0, score2 = 0, x, y; 346 347 for (y = 0; y < h; y++) { 348 for (x = 0; x < 8; x++) 349 score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]); 350 if (y + 1 < h) { 351 for (x = 0; x < 7; x++) 352 score2 += FFABS(s1[x] - s1[x + stride] - 353 s1[x + 1] + s1[x + stride + 1]) - 354 FFABS(s2[x] - s2[x + stride] - 355 s2[x + 1] + s2[x + stride + 1]); 356 } 357 s1 += stride; 358 s2 += stride; 359 } 360 361 if (c) 362 return score1 + FFABS(score2) * c->avctx->nsse_weight; 363 else 364 return score1 + FFABS(score2) * 8; 365} 366 367static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b, 368 int stride, int h) 369{ 370 return 0; 371} 372 373void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type) 374{ 375 int i; 376 377 memset(cmp, 0, sizeof(void *) * 6); 378 379 for (i = 0; i < 6; i++) { 380 switch (type & 0xFF) { 381 case FF_CMP_SAD: 382 cmp[i] = c->sad[i]; 383 break; 384 case FF_CMP_SATD: 385 cmp[i] = c->hadamard8_diff[i]; 386 break; 387 case FF_CMP_SSE: 388 cmp[i] = c->sse[i]; 389 break; 390 case FF_CMP_DCT: 391 cmp[i] = c->dct_sad[i]; 392 break; 393 case FF_CMP_DCT264: 394 cmp[i] = c->dct264_sad[i]; 395 break; 396 case FF_CMP_DCTMAX: 397 cmp[i] = c->dct_max[i]; 398 break; 399 case FF_CMP_PSNR: 400 cmp[i] = c->quant_psnr[i]; 401 break; 402 case FF_CMP_BIT: 403 cmp[i] = c->bit[i]; 404 break; 405 case FF_CMP_RD: 406 cmp[i] = c->rd[i]; 407 break; 408 case FF_CMP_VSAD: 409 cmp[i] = c->vsad[i]; 410 break; 411 case FF_CMP_VSSE: 412 cmp[i] = c->vsse[i]; 413 break; 414 case FF_CMP_ZERO: 415 cmp[i] = zero_cmp; 416 break; 417 case FF_CMP_NSSE: 418 cmp[i] = c->nsse[i]; 419 break; 420#if CONFIG_DWT 421 case FF_CMP_W53: 422 cmp[i]= c->w53[i]; 423 break; 424 case FF_CMP_W97: 425 cmp[i]= c->w97[i]; 426 break; 427#endif 428 default: 429 av_log(NULL, AV_LOG_ERROR, 430 "internal error in cmp function selection\n"); 431 } 432 } 433} 434 435#define BUTTERFLY2(o1, o2, i1, i2) \ 436 o1 = (i1) + (i2); \ 437 o2 = (i1) - (i2); 438 439#define BUTTERFLY1(x, y) \ 440 { \ 441 int a, b; \ 442 a = x; \ 443 b = y; \ 444 x = a + b; \ 445 y = a - b; \ 446 } 447 448#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y))) 449 450static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst, 451 uint8_t *src, int stride, int h) 452{ 453 int i, temp[64], sum = 0; 454 455 av_assert2(h == 8); 456 457 for (i = 0; i < 8; i++) { 458 // FIXME: try pointer walks 459 BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1], 460 src[stride * i + 0] - dst[stride * i + 0], 461 src[stride * i + 1] - dst[stride * i + 1]); 462 BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3], 463 src[stride * i + 2] - dst[stride * i + 2], 464 src[stride * i + 3] - dst[stride * i + 3]); 465 BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5], 466 src[stride * i + 4] - dst[stride * i + 4], 467 src[stride * i + 5] - dst[stride * i + 5]); 468 BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7], 469 src[stride * i + 6] - dst[stride * i + 6], 470 src[stride * i + 7] - dst[stride * i + 7]); 471 472 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]); 473 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]); 474 BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]); 475 BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]); 476 477 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]); 478 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]); 479 BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]); 480 BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]); 481 } 482 483 for (i = 0; i < 8; i++) { 484 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]); 485 BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]); 486 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]); 487 BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]); 488 489 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]); 490 BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]); 491 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]); 492 BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]); 493 494 sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) + 495 BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) + 496 BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) + 497 BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]); 498 } 499 return sum; 500} 501 502static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src, 503 uint8_t *dummy, int stride, int h) 504{ 505 int i, temp[64], sum = 0; 506 507 av_assert2(h == 8); 508 509 for (i = 0; i < 8; i++) { 510 // FIXME: try pointer walks 511 BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1], 512 src[stride * i + 0], src[stride * i + 1]); 513 BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3], 514 src[stride * i + 2], src[stride * i + 3]); 515 BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5], 516 src[stride * i + 4], src[stride * i + 5]); 517 BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7], 518 src[stride * i + 6], src[stride * i + 7]); 519 520 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]); 521 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]); 522 BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]); 523 BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]); 524 525 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]); 526 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]); 527 BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]); 528 BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]); 529 } 530 531 for (i = 0; i < 8; i++) { 532 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]); 533 BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]); 534 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]); 535 BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]); 536 537 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]); 538 BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]); 539 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]); 540 BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]); 541 542 sum += 543 BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) 544 + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) 545 + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) 546 + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]); 547 } 548 549 sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean 550 551 return sum; 552} 553 554static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1, 555 uint8_t *src2, int stride, int h) 556{ 557 LOCAL_ALIGNED_16(int16_t, temp, [64]); 558 559 av_assert2(h == 8); 560 561 s->pdsp.diff_pixels(temp, src1, src2, stride); 562 s->fdsp.fdct(temp); 563 return s->dsp.sum_abs_dctelem(temp); 564} 565 566#if CONFIG_GPL 567#define DCT8_1D \ 568 { \ 569 const int s07 = SRC(0) + SRC(7); \ 570 const int s16 = SRC(1) + SRC(6); \ 571 const int s25 = SRC(2) + SRC(5); \ 572 const int s34 = SRC(3) + SRC(4); \ 573 const int a0 = s07 + s34; \ 574 const int a1 = s16 + s25; \ 575 const int a2 = s07 - s34; \ 576 const int a3 = s16 - s25; \ 577 const int d07 = SRC(0) - SRC(7); \ 578 const int d16 = SRC(1) - SRC(6); \ 579 const int d25 = SRC(2) - SRC(5); \ 580 const int d34 = SRC(3) - SRC(4); \ 581 const int a4 = d16 + d25 + (d07 + (d07 >> 1)); \ 582 const int a5 = d07 - d34 - (d25 + (d25 >> 1)); \ 583 const int a6 = d07 + d34 - (d16 + (d16 >> 1)); \ 584 const int a7 = d16 - d25 + (d34 + (d34 >> 1)); \ 585 DST(0, a0 + a1); \ 586 DST(1, a4 + (a7 >> 2)); \ 587 DST(2, a2 + (a3 >> 1)); \ 588 DST(3, a5 + (a6 >> 2)); \ 589 DST(4, a0 - a1); \ 590 DST(5, a6 - (a5 >> 2)); \ 591 DST(6, (a2 >> 1) - a3); \ 592 DST(7, (a4 >> 2) - a7); \ 593 } 594 595static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1, 596 uint8_t *src2, int stride, int h) 597{ 598 int16_t dct[8][8]; 599 int i, sum = 0; 600 601 s->pdsp.diff_pixels(dct[0], src1, src2, stride); 602 603#define SRC(x) dct[i][x] 604#define DST(x, v) dct[i][x] = v 605 for (i = 0; i < 8; i++) 606 DCT8_1D 607#undef SRC 608#undef DST 609 610#define SRC(x) dct[x][i] 611#define DST(x, v) sum += FFABS(v) 612 for (i = 0; i < 8; i++) 613 DCT8_1D 614#undef SRC 615#undef DST 616 return sum; 617} 618#endif 619 620static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1, 621 uint8_t *src2, int stride, int h) 622{ 623 LOCAL_ALIGNED_16(int16_t, temp, [64]); 624 int sum = 0, i; 625 626 av_assert2(h == 8); 627 628 s->pdsp.diff_pixels(temp, src1, src2, stride); 629 s->fdsp.fdct(temp); 630 631 for (i = 0; i < 64; i++) 632 sum = FFMAX(sum, FFABS(temp[i])); 633 634 return sum; 635} 636 637static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1, 638 uint8_t *src2, int stride, int h) 639{ 640 LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]); 641 int16_t *const bak = temp + 64; 642 int sum = 0, i; 643 644 av_assert2(h == 8); 645 s->mb_intra = 0; 646 647 s->pdsp.diff_pixels(temp, src1, src2, stride); 648 649 memcpy(bak, temp, 64 * sizeof(int16_t)); 650 651 s->block_last_index[0 /* FIXME */] = 652 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); 653 s->dct_unquantize_inter(s, temp, 0, s->qscale); 654 ff_simple_idct_8(temp); // FIXME 655 656 for (i = 0; i < 64; i++) 657 sum += (temp[i] - bak[i]) * (temp[i] - bak[i]); 658 659 return sum; 660} 661 662static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2, 663 int stride, int h) 664{ 665 const uint8_t *scantable = s->intra_scantable.permutated; 666 LOCAL_ALIGNED_16(int16_t, temp, [64]); 667 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]); 668 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]); 669 int i, last, run, bits, level, distortion, start_i; 670 const int esc_length = s->ac_esc_length; 671 uint8_t *length, *last_length; 672 673 av_assert2(h == 8); 674 675 copy_block8(lsrc1, src1, 8, stride, 8); 676 copy_block8(lsrc2, src2, 8, stride, 8); 677 678 s->pdsp.diff_pixels(temp, lsrc1, lsrc2, 8); 679 680 s->block_last_index[0 /* FIXME */] = 681 last = 682 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); 683 684 bits = 0; 685 686 if (s->mb_intra) { 687 start_i = 1; 688 length = s->intra_ac_vlc_length; 689 last_length = s->intra_ac_vlc_last_length; 690 bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma 691 } else { 692 start_i = 0; 693 length = s->inter_ac_vlc_length; 694 last_length = s->inter_ac_vlc_last_length; 695 } 696 697 if (last >= start_i) { 698 run = 0; 699 for (i = start_i; i < last; i++) { 700 int j = scantable[i]; 701 level = temp[j]; 702 703 if (level) { 704 level += 64; 705 if ((level & (~127)) == 0) 706 bits += length[UNI_AC_ENC_INDEX(run, level)]; 707 else 708 bits += esc_length; 709 run = 0; 710 } else 711 run++; 712 } 713 i = scantable[last]; 714 715 level = temp[i] + 64; 716 717 av_assert2(level - 64); 718 719 if ((level & (~127)) == 0) { 720 bits += last_length[UNI_AC_ENC_INDEX(run, level)]; 721 } else 722 bits += esc_length; 723 } 724 725 if (last >= 0) { 726 if (s->mb_intra) 727 s->dct_unquantize_intra(s, temp, 0, s->qscale); 728 else 729 s->dct_unquantize_inter(s, temp, 0, s->qscale); 730 } 731 732 s->idsp.idct_add(lsrc2, 8, temp); 733 734 distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8); 735 736 return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7); 737} 738 739static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2, 740 int stride, int h) 741{ 742 const uint8_t *scantable = s->intra_scantable.permutated; 743 LOCAL_ALIGNED_16(int16_t, temp, [64]); 744 int i, last, run, bits, level, start_i; 745 const int esc_length = s->ac_esc_length; 746 uint8_t *length, *last_length; 747 748 av_assert2(h == 8); 749 750 s->pdsp.diff_pixels(temp, src1, src2, stride); 751 752 s->block_last_index[0 /* FIXME */] = 753 last = 754 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); 755 756 bits = 0; 757 758 if (s->mb_intra) { 759 start_i = 1; 760 length = s->intra_ac_vlc_length; 761 last_length = s->intra_ac_vlc_last_length; 762 bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma 763 } else { 764 start_i = 0; 765 length = s->inter_ac_vlc_length; 766 last_length = s->inter_ac_vlc_last_length; 767 } 768 769 if (last >= start_i) { 770 run = 0; 771 for (i = start_i; i < last; i++) { 772 int j = scantable[i]; 773 level = temp[j]; 774 775 if (level) { 776 level += 64; 777 if ((level & (~127)) == 0) 778 bits += length[UNI_AC_ENC_INDEX(run, level)]; 779 else 780 bits += esc_length; 781 run = 0; 782 } else 783 run++; 784 } 785 i = scantable[last]; 786 787 level = temp[i] + 64; 788 789 av_assert2(level - 64); 790 791 if ((level & (~127)) == 0) 792 bits += last_length[UNI_AC_ENC_INDEX(run, level)]; 793 else 794 bits += esc_length; 795 } 796 797 return bits; 798} 799 800#define VSAD_INTRA(size) \ 801static int vsad_intra ## size ## _c(MpegEncContext *c, \ 802 uint8_t *s, uint8_t *dummy, \ 803 int stride, int h) \ 804{ \ 805 int score = 0, x, y; \ 806 \ 807 for (y = 1; y < h; y++) { \ 808 for (x = 0; x < size; x += 4) { \ 809 score += FFABS(s[x] - s[x + stride]) + \ 810 FFABS(s[x + 1] - s[x + stride + 1]) + \ 811 FFABS(s[x + 2] - s[x + 2 + stride]) + \ 812 FFABS(s[x + 3] - s[x + 3 + stride]); \ 813 } \ 814 s += stride; \ 815 } \ 816 \ 817 return score; \ 818} 819VSAD_INTRA(8) 820VSAD_INTRA(16) 821 822#define VSAD(size) \ 823static int vsad ## size ## _c(MpegEncContext *c, \ 824 uint8_t *s1, uint8_t *s2, \ 825 int stride, int h) \ 826{ \ 827 int score = 0, x, y; \ 828 \ 829 for (y = 1; y < h; y++) { \ 830 for (x = 0; x < size; x++) \ 831 score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]); \ 832 s1 += stride; \ 833 s2 += stride; \ 834 } \ 835 \ 836 return score; \ 837} 838VSAD(8) 839VSAD(16) 840 841#define SQ(a) ((a) * (a)) 842#define VSSE_INTRA(size) \ 843static int vsse_intra ## size ## _c(MpegEncContext *c, \ 844 uint8_t *s, uint8_t *dummy, \ 845 int stride, int h) \ 846{ \ 847 int score = 0, x, y; \ 848 \ 849 for (y = 1; y < h; y++) { \ 850 for (x = 0; x < size; x += 4) { \ 851 score += SQ(s[x] - s[x + stride]) + \ 852 SQ(s[x + 1] - s[x + stride + 1]) + \ 853 SQ(s[x + 2] - s[x + stride + 2]) + \ 854 SQ(s[x + 3] - s[x + stride + 3]); \ 855 } \ 856 s += stride; \ 857 } \ 858 \ 859 return score; \ 860} 861VSSE_INTRA(8) 862VSSE_INTRA(16) 863 864#define VSSE(size) \ 865static int vsse ## size ## _c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, \ 866 int stride, int h) \ 867{ \ 868 int score = 0, x, y; \ 869 \ 870 for (y = 1; y < h; y++) { \ 871 for (x = 0; x < size; x++) \ 872 score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]); \ 873 s1 += stride; \ 874 s2 += stride; \ 875 } \ 876 \ 877 return score; \ 878} 879VSSE(8) 880VSSE(16) 881 882#define WRAPPER8_16_SQ(name8, name16) \ 883static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src, \ 884 int stride, int h) \ 885{ \ 886 int score = 0; \ 887 \ 888 score += name8(s, dst, src, stride, 8); \ 889 score += name8(s, dst + 8, src + 8, stride, 8); \ 890 if (h == 16) { \ 891 dst += 8 * stride; \ 892 src += 8 * stride; \ 893 score += name8(s, dst, src, stride, 8); \ 894 score += name8(s, dst + 8, src + 8, stride, 8); \ 895 } \ 896 return score; \ 897} 898 899WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c) 900WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c) 901WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c) 902#if CONFIG_GPL 903WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c) 904#endif 905WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c) 906WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c) 907WRAPPER8_16_SQ(rd8x8_c, rd16_c) 908WRAPPER8_16_SQ(bit8x8_c, bit16_c) 909 910/* init static data */ 911av_cold void ff_dsputil_static_init(void) 912{ 913 int i; 914 915 for (i = 0; i < 512; i++) 916 ff_square_tab[i] = (i - 256) * (i - 256); 917} 918 919int ff_check_alignment(void) 920{ 921 static int did_fail = 0; 922 LOCAL_ALIGNED_16(int, aligned, [4]); 923 924 if ((intptr_t)aligned & 15) { 925 if (!did_fail) { 926#if HAVE_MMX || HAVE_ALTIVEC 927 av_log(NULL, AV_LOG_ERROR, 928 "Compiler did not align stack variables. Libavcodec has been miscompiled\n" 929 "and may be very slow or crash. This is not a bug in libavcodec,\n" 930 "but in the compiler. You may try recompiling using gcc >= 4.2.\n" 931 "Do not report crashes to FFmpeg developers.\n"); 932#endif 933 did_fail=1; 934 } 935 return -1; 936 } 937 return 0; 938} 939 940av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx) 941{ 942 ff_check_alignment(); 943 944 c->sum_abs_dctelem = sum_abs_dctelem_c; 945 946 /* TODO [0] 16 [1] 8 */ 947 c->pix_abs[0][0] = pix_abs16_c; 948 c->pix_abs[0][1] = pix_abs16_x2_c; 949 c->pix_abs[0][2] = pix_abs16_y2_c; 950 c->pix_abs[0][3] = pix_abs16_xy2_c; 951 c->pix_abs[1][0] = pix_abs8_c; 952 c->pix_abs[1][1] = pix_abs8_x2_c; 953 c->pix_abs[1][2] = pix_abs8_y2_c; 954 c->pix_abs[1][3] = pix_abs8_xy2_c; 955 956#define SET_CMP_FUNC(name) \ 957 c->name[0] = name ## 16_c; \ 958 c->name[1] = name ## 8x8_c; 959 960 SET_CMP_FUNC(hadamard8_diff) 961 c->hadamard8_diff[4] = hadamard8_intra16_c; 962 c->hadamard8_diff[5] = hadamard8_intra8x8_c; 963 SET_CMP_FUNC(dct_sad) 964 SET_CMP_FUNC(dct_max) 965#if CONFIG_GPL 966 SET_CMP_FUNC(dct264_sad) 967#endif 968 c->sad[0] = pix_abs16_c; 969 c->sad[1] = pix_abs8_c; 970 c->sse[0] = sse16_c; 971 c->sse[1] = sse8_c; 972 c->sse[2] = sse4_c; 973 SET_CMP_FUNC(quant_psnr) 974 SET_CMP_FUNC(rd) 975 SET_CMP_FUNC(bit) 976 c->vsad[0] = vsad16_c; 977 c->vsad[1] = vsad8_c; 978 c->vsad[4] = vsad_intra16_c; 979 c->vsad[5] = vsad_intra8_c; 980 c->vsse[0] = vsse16_c; 981 c->vsse[1] = vsse8_c; 982 c->vsse[4] = vsse_intra16_c; 983 c->vsse[5] = vsse_intra8_c; 984 c->nsse[0] = nsse16_c; 985 c->nsse[1] = nsse8_c; 986#if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER 987 ff_dsputil_init_dwt(c); 988#endif 989 990 if (ARCH_ALPHA) 991 ff_dsputil_init_alpha(c, avctx); 992 if (ARCH_ARM) 993 ff_dsputil_init_arm(c, avctx); 994 if (ARCH_PPC) 995 ff_dsputil_init_ppc(c, avctx); 996 if (ARCH_X86) 997 ff_dsputil_init_x86(c, avctx); 998} 999 1000av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) 1001{ 1002 ff_dsputil_init(c, avctx); 1003} 1004 1005av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx) 1006{ 1007 ff_dsputil_init(c, avctx); 1008} 1009