1/* 2 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> 3 * 4 * This file is part of FFmpeg. 5 * 6 * FFmpeg is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * FFmpeg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with FFmpeg; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21//#define DEBUG_ALIGNMENT 22#ifdef DEBUG_ALIGNMENT 23#define ASSERT_ALIGNED(ptr) assert(((unsigned long)ptr&0x0000000F)); 24#else 25#define ASSERT_ALIGNED(ptr) ; 26#endif 27 28/* this code assume that stride % 16 == 0 */ 29 30#define CHROMA_MC8_ALTIVEC_CORE \ 31 vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\ 32 vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\ 33\ 34 psum = vec_mladd(vA, vsrc0ssH, v32ss);\ 35 psum = vec_mladd(vB, vsrc1ssH, psum);\ 36 psum = vec_mladd(vC, vsrc2ssH, psum);\ 37 psum = vec_mladd(vD, vsrc3ssH, psum);\ 38 psum = vec_sr(psum, v6us);\ 39\ 40 vdst = vec_ld(0, dst);\ 41 ppsum = (vec_u8)vec_pack(psum, psum);\ 42 vfdst = vec_perm(vdst, ppsum, fperm);\ 43\ 44 OP_U8_ALTIVEC(fsum, vfdst, vdst);\ 45\ 46 vec_st(fsum, 0, dst);\ 47\ 48 vsrc0ssH = vsrc2ssH;\ 49 vsrc1ssH = vsrc3ssH;\ 50\ 51 dst += stride;\ 52 src += stride; 53 54#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \ 55\ 56 vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\ 57 vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\ 58\ 59 psum = vec_mladd(vA, vsrc0ssH, v32ss);\ 60 psum = vec_mladd(vE, vsrc1ssH, psum);\ 61 psum = vec_sr(psum, v6us);\ 62\ 63 vdst = vec_ld(0, dst);\ 64 ppsum = (vec_u8)vec_pack(psum, psum);\ 65 vfdst = vec_perm(vdst, ppsum, fperm);\ 66\ 67 OP_U8_ALTIVEC(fsum, vfdst, vdst);\ 68\ 69 vec_st(fsum, 0, dst);\ 70\ 71 dst += stride;\ 72 src += stride; 73 74void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, 75 int stride, int h, int x, int y) { 76 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); 77 DECLARE_ALIGNED_16(signed int, ABCD[4]) = 78 {((8 - x) * (8 - y)), 79 (( x) * (8 - y)), 80 ((8 - x) * ( y)), 81 (( x) * ( y))}; 82 register int i; 83 vec_u8 fperm; 84 const vec_s32 vABCD = vec_ld(0, ABCD); 85 const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); 86 const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); 87 const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); 88 const vec_s16 vD = vec_splat((vec_s16)vABCD, 7); 89 LOAD_ZERO; 90 const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); 91 const vec_u16 v6us = vec_splat_u16(6); 92 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; 93 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; 94 95 vec_u8 vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; 96 vec_u8 vsrc0uc, vsrc1uc; 97 vec_s16 vsrc0ssH, vsrc1ssH; 98 vec_u8 vsrcCuc, vsrc2uc, vsrc3uc; 99 vec_s16 vsrc2ssH, vsrc3ssH, psum; 100 vec_u8 vdst, ppsum, vfdst, fsum; 101 102 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); 103 104 if (((unsigned long)dst) % 16 == 0) { 105 fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, 106 0x14, 0x15, 0x16, 0x17, 107 0x08, 0x09, 0x0A, 0x0B, 108 0x0C, 0x0D, 0x0E, 0x0F}; 109 } else { 110 fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, 111 0x04, 0x05, 0x06, 0x07, 112 0x18, 0x19, 0x1A, 0x1B, 113 0x1C, 0x1D, 0x1E, 0x1F}; 114 } 115 116 vsrcAuc = vec_ld(0, src); 117 118 if (loadSecond) 119 vsrcBuc = vec_ld(16, src); 120 vsrcperm0 = vec_lvsl(0, src); 121 vsrcperm1 = vec_lvsl(1, src); 122 123 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); 124 if (reallyBadAlign) 125 vsrc1uc = vsrcBuc; 126 else 127 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); 128 129 vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc); 130 vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc); 131 132 if (ABCD[3]) { 133 if (!loadSecond) {// -> !reallyBadAlign 134 for (i = 0 ; i < h ; i++) { 135 vsrcCuc = vec_ld(stride + 0, src); 136 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); 137 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); 138 139 CHROMA_MC8_ALTIVEC_CORE 140 } 141 } else { 142 vec_u8 vsrcDuc; 143 for (i = 0 ; i < h ; i++) { 144 vsrcCuc = vec_ld(stride + 0, src); 145 vsrcDuc = vec_ld(stride + 16, src); 146 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); 147 if (reallyBadAlign) 148 vsrc3uc = vsrcDuc; 149 else 150 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); 151 152 CHROMA_MC8_ALTIVEC_CORE 153 } 154 } 155 } else { 156 const vec_s16 vE = vec_add(vB, vC); 157 if (ABCD[2]) { // x == 0 B == 0 158 if (!loadSecond) {// -> !reallyBadAlign 159 for (i = 0 ; i < h ; i++) { 160 vsrcCuc = vec_ld(stride + 0, src); 161 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); 162 CHROMA_MC8_ALTIVEC_CORE_SIMPLE 163 164 vsrc0uc = vsrc1uc; 165 } 166 } else { 167 vec_u8 vsrcDuc; 168 for (i = 0 ; i < h ; i++) { 169 vsrcCuc = vec_ld(stride + 0, src); 170 vsrcDuc = vec_ld(stride + 15, src); 171 vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); 172 CHROMA_MC8_ALTIVEC_CORE_SIMPLE 173 174 vsrc0uc = vsrc1uc; 175 } 176 } 177 } else { // y == 0 C == 0 178 if (!loadSecond) {// -> !reallyBadAlign 179 for (i = 0 ; i < h ; i++) { 180 vsrcCuc = vec_ld(0, src); 181 vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); 182 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); 183 184 CHROMA_MC8_ALTIVEC_CORE_SIMPLE 185 } 186 } else { 187 vec_u8 vsrcDuc; 188 for (i = 0 ; i < h ; i++) { 189 vsrcCuc = vec_ld(0, src); 190 vsrcDuc = vec_ld(15, src); 191 vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); 192 if (reallyBadAlign) 193 vsrc1uc = vsrcDuc; 194 else 195 vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); 196 197 CHROMA_MC8_ALTIVEC_CORE_SIMPLE 198 } 199 } 200 } 201 } 202 POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1); 203} 204 205#undef CHROMA_MC8_ALTIVEC_CORE 206 207/* this code assume stride % 16 == 0 */ 208static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { 209 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); 210 register int i; 211 212 LOAD_ZERO; 213 const vec_u8 permM2 = vec_lvsl(-2, src); 214 const vec_u8 permM1 = vec_lvsl(-1, src); 215 const vec_u8 permP0 = vec_lvsl(+0, src); 216 const vec_u8 permP1 = vec_lvsl(+1, src); 217 const vec_u8 permP2 = vec_lvsl(+2, src); 218 const vec_u8 permP3 = vec_lvsl(+3, src); 219 const vec_s16 v5ss = vec_splat_s16(5); 220 const vec_u16 v5us = vec_splat_u16(5); 221 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 222 const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); 223 224 vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 225 226 register int align = ((((unsigned long)src) - 2) % 16); 227 228 vec_s16 srcP0A, srcP0B, srcP1A, srcP1B, 229 srcP2A, srcP2B, srcP3A, srcP3B, 230 srcM1A, srcM1B, srcM2A, srcM2B, 231 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, 232 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, 233 psumA, psumB, sumA, sumB; 234 235 vec_u8 sum, vdst, fsum; 236 237 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); 238 239 for (i = 0 ; i < 16 ; i ++) { 240 vec_u8 srcR1 = vec_ld(-2, src); 241 vec_u8 srcR2 = vec_ld(14, src); 242 243 switch (align) { 244 default: { 245 srcM2 = vec_perm(srcR1, srcR2, permM2); 246 srcM1 = vec_perm(srcR1, srcR2, permM1); 247 srcP0 = vec_perm(srcR1, srcR2, permP0); 248 srcP1 = vec_perm(srcR1, srcR2, permP1); 249 srcP2 = vec_perm(srcR1, srcR2, permP2); 250 srcP3 = vec_perm(srcR1, srcR2, permP3); 251 } break; 252 case 11: { 253 srcM2 = vec_perm(srcR1, srcR2, permM2); 254 srcM1 = vec_perm(srcR1, srcR2, permM1); 255 srcP0 = vec_perm(srcR1, srcR2, permP0); 256 srcP1 = vec_perm(srcR1, srcR2, permP1); 257 srcP2 = vec_perm(srcR1, srcR2, permP2); 258 srcP3 = srcR2; 259 } break; 260 case 12: { 261 vec_u8 srcR3 = vec_ld(30, src); 262 srcM2 = vec_perm(srcR1, srcR2, permM2); 263 srcM1 = vec_perm(srcR1, srcR2, permM1); 264 srcP0 = vec_perm(srcR1, srcR2, permP0); 265 srcP1 = vec_perm(srcR1, srcR2, permP1); 266 srcP2 = srcR2; 267 srcP3 = vec_perm(srcR2, srcR3, permP3); 268 } break; 269 case 13: { 270 vec_u8 srcR3 = vec_ld(30, src); 271 srcM2 = vec_perm(srcR1, srcR2, permM2); 272 srcM1 = vec_perm(srcR1, srcR2, permM1); 273 srcP0 = vec_perm(srcR1, srcR2, permP0); 274 srcP1 = srcR2; 275 srcP2 = vec_perm(srcR2, srcR3, permP2); 276 srcP3 = vec_perm(srcR2, srcR3, permP3); 277 } break; 278 case 14: { 279 vec_u8 srcR3 = vec_ld(30, src); 280 srcM2 = vec_perm(srcR1, srcR2, permM2); 281 srcM1 = vec_perm(srcR1, srcR2, permM1); 282 srcP0 = srcR2; 283 srcP1 = vec_perm(srcR2, srcR3, permP1); 284 srcP2 = vec_perm(srcR2, srcR3, permP2); 285 srcP3 = vec_perm(srcR2, srcR3, permP3); 286 } break; 287 case 15: { 288 vec_u8 srcR3 = vec_ld(30, src); 289 srcM2 = vec_perm(srcR1, srcR2, permM2); 290 srcM1 = srcR2; 291 srcP0 = vec_perm(srcR2, srcR3, permP0); 292 srcP1 = vec_perm(srcR2, srcR3, permP1); 293 srcP2 = vec_perm(srcR2, srcR3, permP2); 294 srcP3 = vec_perm(srcR2, srcR3, permP3); 295 } break; 296 } 297 298 srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); 299 srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0); 300 srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1); 301 srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1); 302 303 srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2); 304 srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2); 305 srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3); 306 srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3); 307 308 srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1); 309 srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1); 310 srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2); 311 srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2); 312 313 sum1A = vec_adds(srcP0A, srcP1A); 314 sum1B = vec_adds(srcP0B, srcP1B); 315 sum2A = vec_adds(srcM1A, srcP2A); 316 sum2B = vec_adds(srcM1B, srcP2B); 317 sum3A = vec_adds(srcM2A, srcP3A); 318 sum3B = vec_adds(srcM2B, srcP3B); 319 320 pp1A = vec_mladd(sum1A, v20ss, v16ss); 321 pp1B = vec_mladd(sum1B, v20ss, v16ss); 322 323 pp2A = vec_mladd(sum2A, v5ss, zero_s16v); 324 pp2B = vec_mladd(sum2B, v5ss, zero_s16v); 325 326 pp3A = vec_add(sum3A, pp1A); 327 pp3B = vec_add(sum3B, pp1B); 328 329 psumA = vec_sub(pp3A, pp2A); 330 psumB = vec_sub(pp3B, pp2B); 331 332 sumA = vec_sra(psumA, v5us); 333 sumB = vec_sra(psumB, v5us); 334 335 sum = vec_packsu(sumA, sumB); 336 337 ASSERT_ALIGNED(dst); 338 vdst = vec_ld(0, dst); 339 340 OP_U8_ALTIVEC(fsum, sum, vdst); 341 342 vec_st(fsum, 0, dst); 343 344 src += srcStride; 345 dst += dstStride; 346 } 347 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); 348} 349 350/* this code assume stride % 16 == 0 */ 351static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { 352 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); 353 354 register int i; 355 356 LOAD_ZERO; 357 const vec_u8 perm = vec_lvsl(0, src); 358 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 359 const vec_u16 v5us = vec_splat_u16(5); 360 const vec_s16 v5ss = vec_splat_s16(5); 361 const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); 362 363 uint8_t *srcbis = src - (srcStride * 2); 364 365 const vec_u8 srcM2a = vec_ld(0, srcbis); 366 const vec_u8 srcM2b = vec_ld(16, srcbis); 367 const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm); 368 //srcbis += srcStride; 369 const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride); 370 const vec_u8 srcM1b = vec_ld(16, srcbis); 371 const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm); 372 //srcbis += srcStride; 373 const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride); 374 const vec_u8 srcP0b = vec_ld(16, srcbis); 375 const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm); 376 //srcbis += srcStride; 377 const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride); 378 const vec_u8 srcP1b = vec_ld(16, srcbis); 379 const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm); 380 //srcbis += srcStride; 381 const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride); 382 const vec_u8 srcP2b = vec_ld(16, srcbis); 383 const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm); 384 //srcbis += srcStride; 385 386 vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2); 387 vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2); 388 vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1); 389 vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1); 390 vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0); 391 vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0); 392 vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1); 393 vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1); 394 vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2); 395 vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2); 396 397 vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, 398 psumA, psumB, sumA, sumB, 399 srcP3ssA, srcP3ssB, 400 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; 401 402 vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3; 403 404 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); 405 406 for (i = 0 ; i < 16 ; i++) { 407 srcP3a = vec_ld(0, srcbis += srcStride); 408 srcP3b = vec_ld(16, srcbis); 409 srcP3 = vec_perm(srcP3a, srcP3b, perm); 410 srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3); 411 srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3); 412 //srcbis += srcStride; 413 414 sum1A = vec_adds(srcP0ssA, srcP1ssA); 415 sum1B = vec_adds(srcP0ssB, srcP1ssB); 416 sum2A = vec_adds(srcM1ssA, srcP2ssA); 417 sum2B = vec_adds(srcM1ssB, srcP2ssB); 418 sum3A = vec_adds(srcM2ssA, srcP3ssA); 419 sum3B = vec_adds(srcM2ssB, srcP3ssB); 420 421 srcM2ssA = srcM1ssA; 422 srcM2ssB = srcM1ssB; 423 srcM1ssA = srcP0ssA; 424 srcM1ssB = srcP0ssB; 425 srcP0ssA = srcP1ssA; 426 srcP0ssB = srcP1ssB; 427 srcP1ssA = srcP2ssA; 428 srcP1ssB = srcP2ssB; 429 srcP2ssA = srcP3ssA; 430 srcP2ssB = srcP3ssB; 431 432 pp1A = vec_mladd(sum1A, v20ss, v16ss); 433 pp1B = vec_mladd(sum1B, v20ss, v16ss); 434 435 pp2A = vec_mladd(sum2A, v5ss, zero_s16v); 436 pp2B = vec_mladd(sum2B, v5ss, zero_s16v); 437 438 pp3A = vec_add(sum3A, pp1A); 439 pp3B = vec_add(sum3B, pp1B); 440 441 psumA = vec_sub(pp3A, pp2A); 442 psumB = vec_sub(pp3B, pp2B); 443 444 sumA = vec_sra(psumA, v5us); 445 sumB = vec_sra(psumB, v5us); 446 447 sum = vec_packsu(sumA, sumB); 448 449 ASSERT_ALIGNED(dst); 450 vdst = vec_ld(0, dst); 451 452 OP_U8_ALTIVEC(fsum, sum, vdst); 453 454 vec_st(fsum, 0, dst); 455 456 dst += dstStride; 457 } 458 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); 459} 460 461/* this code assume stride % 16 == 0 *and* tmp is properly aligned */ 462static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { 463 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); 464 register int i; 465 LOAD_ZERO; 466 const vec_u8 permM2 = vec_lvsl(-2, src); 467 const vec_u8 permM1 = vec_lvsl(-1, src); 468 const vec_u8 permP0 = vec_lvsl(+0, src); 469 const vec_u8 permP1 = vec_lvsl(+1, src); 470 const vec_u8 permP2 = vec_lvsl(+2, src); 471 const vec_u8 permP3 = vec_lvsl(+3, src); 472 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 473 const vec_u32 v10ui = vec_splat_u32(10); 474 const vec_s16 v5ss = vec_splat_s16(5); 475 const vec_s16 v1ss = vec_splat_s16(1); 476 const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); 477 const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); 478 479 register int align = ((((unsigned long)src) - 2) % 16); 480 481 vec_s16 srcP0A, srcP0B, srcP1A, srcP1B, 482 srcP2A, srcP2B, srcP3A, srcP3B, 483 srcM1A, srcM1B, srcM2A, srcM2B, 484 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, 485 pp1A, pp1B, pp2A, pp2B, psumA, psumB; 486 487 const vec_u8 mperm = (const vec_u8) 488 {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, 489 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F}; 490 int16_t *tmpbis = tmp; 491 492 vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, 493 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, 494 tmpP2ssA, tmpP2ssB; 495 496 vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, 497 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, 498 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, 499 ssumAe, ssumAo, ssumBe, ssumBo; 500 vec_u8 fsum, sumv, sum, vdst; 501 vec_s16 ssume, ssumo; 502 503 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); 504 src -= (2 * srcStride); 505 for (i = 0 ; i < 21 ; i ++) { 506 vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 507 vec_u8 srcR1 = vec_ld(-2, src); 508 vec_u8 srcR2 = vec_ld(14, src); 509 510 switch (align) { 511 default: { 512 srcM2 = vec_perm(srcR1, srcR2, permM2); 513 srcM1 = vec_perm(srcR1, srcR2, permM1); 514 srcP0 = vec_perm(srcR1, srcR2, permP0); 515 srcP1 = vec_perm(srcR1, srcR2, permP1); 516 srcP2 = vec_perm(srcR1, srcR2, permP2); 517 srcP3 = vec_perm(srcR1, srcR2, permP3); 518 } break; 519 case 11: { 520 srcM2 = vec_perm(srcR1, srcR2, permM2); 521 srcM1 = vec_perm(srcR1, srcR2, permM1); 522 srcP0 = vec_perm(srcR1, srcR2, permP0); 523 srcP1 = vec_perm(srcR1, srcR2, permP1); 524 srcP2 = vec_perm(srcR1, srcR2, permP2); 525 srcP3 = srcR2; 526 } break; 527 case 12: { 528 vec_u8 srcR3 = vec_ld(30, src); 529 srcM2 = vec_perm(srcR1, srcR2, permM2); 530 srcM1 = vec_perm(srcR1, srcR2, permM1); 531 srcP0 = vec_perm(srcR1, srcR2, permP0); 532 srcP1 = vec_perm(srcR1, srcR2, permP1); 533 srcP2 = srcR2; 534 srcP3 = vec_perm(srcR2, srcR3, permP3); 535 } break; 536 case 13: { 537 vec_u8 srcR3 = vec_ld(30, src); 538 srcM2 = vec_perm(srcR1, srcR2, permM2); 539 srcM1 = vec_perm(srcR1, srcR2, permM1); 540 srcP0 = vec_perm(srcR1, srcR2, permP0); 541 srcP1 = srcR2; 542 srcP2 = vec_perm(srcR2, srcR3, permP2); 543 srcP3 = vec_perm(srcR2, srcR3, permP3); 544 } break; 545 case 14: { 546 vec_u8 srcR3 = vec_ld(30, src); 547 srcM2 = vec_perm(srcR1, srcR2, permM2); 548 srcM1 = vec_perm(srcR1, srcR2, permM1); 549 srcP0 = srcR2; 550 srcP1 = vec_perm(srcR2, srcR3, permP1); 551 srcP2 = vec_perm(srcR2, srcR3, permP2); 552 srcP3 = vec_perm(srcR2, srcR3, permP3); 553 } break; 554 case 15: { 555 vec_u8 srcR3 = vec_ld(30, src); 556 srcM2 = vec_perm(srcR1, srcR2, permM2); 557 srcM1 = srcR2; 558 srcP0 = vec_perm(srcR2, srcR3, permP0); 559 srcP1 = vec_perm(srcR2, srcR3, permP1); 560 srcP2 = vec_perm(srcR2, srcR3, permP2); 561 srcP3 = vec_perm(srcR2, srcR3, permP3); 562 } break; 563 } 564 565 srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); 566 srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0); 567 srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1); 568 srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1); 569 570 srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2); 571 srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2); 572 srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3); 573 srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3); 574 575 srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1); 576 srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1); 577 srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2); 578 srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2); 579 580 sum1A = vec_adds(srcP0A, srcP1A); 581 sum1B = vec_adds(srcP0B, srcP1B); 582 sum2A = vec_adds(srcM1A, srcP2A); 583 sum2B = vec_adds(srcM1B, srcP2B); 584 sum3A = vec_adds(srcM2A, srcP3A); 585 sum3B = vec_adds(srcM2B, srcP3B); 586 587 pp1A = vec_mladd(sum1A, v20ss, sum3A); 588 pp1B = vec_mladd(sum1B, v20ss, sum3B); 589 590 pp2A = vec_mladd(sum2A, v5ss, zero_s16v); 591 pp2B = vec_mladd(sum2B, v5ss, zero_s16v); 592 593 psumA = vec_sub(pp1A, pp2A); 594 psumB = vec_sub(pp1B, pp2B); 595 596 vec_st(psumA, 0, tmp); 597 vec_st(psumB, 16, tmp); 598 599 src += srcStride; 600 tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ 601 } 602 603 tmpM2ssA = vec_ld(0, tmpbis); 604 tmpM2ssB = vec_ld(16, tmpbis); 605 tmpbis += tmpStride; 606 tmpM1ssA = vec_ld(0, tmpbis); 607 tmpM1ssB = vec_ld(16, tmpbis); 608 tmpbis += tmpStride; 609 tmpP0ssA = vec_ld(0, tmpbis); 610 tmpP0ssB = vec_ld(16, tmpbis); 611 tmpbis += tmpStride; 612 tmpP1ssA = vec_ld(0, tmpbis); 613 tmpP1ssB = vec_ld(16, tmpbis); 614 tmpbis += tmpStride; 615 tmpP2ssA = vec_ld(0, tmpbis); 616 tmpP2ssB = vec_ld(16, tmpbis); 617 tmpbis += tmpStride; 618 619 for (i = 0 ; i < 16 ; i++) { 620 const vec_s16 tmpP3ssA = vec_ld(0, tmpbis); 621 const vec_s16 tmpP3ssB = vec_ld(16, tmpbis); 622 623 const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA); 624 const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB); 625 const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA); 626 const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB); 627 const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA); 628 const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB); 629 630 tmpbis += tmpStride; 631 632 tmpM2ssA = tmpM1ssA; 633 tmpM2ssB = tmpM1ssB; 634 tmpM1ssA = tmpP0ssA; 635 tmpM1ssB = tmpP0ssB; 636 tmpP0ssA = tmpP1ssA; 637 tmpP0ssB = tmpP1ssB; 638 tmpP1ssA = tmpP2ssA; 639 tmpP1ssB = tmpP2ssB; 640 tmpP2ssA = tmpP3ssA; 641 tmpP2ssB = tmpP3ssB; 642 643 pp1Ae = vec_mule(sum1A, v20ss); 644 pp1Ao = vec_mulo(sum1A, v20ss); 645 pp1Be = vec_mule(sum1B, v20ss); 646 pp1Bo = vec_mulo(sum1B, v20ss); 647 648 pp2Ae = vec_mule(sum2A, v5ss); 649 pp2Ao = vec_mulo(sum2A, v5ss); 650 pp2Be = vec_mule(sum2B, v5ss); 651 pp2Bo = vec_mulo(sum2B, v5ss); 652 653 pp3Ae = vec_sra((vec_s32)sum3A, v16ui); 654 pp3Ao = vec_mulo(sum3A, v1ss); 655 pp3Be = vec_sra((vec_s32)sum3B, v16ui); 656 pp3Bo = vec_mulo(sum3B, v1ss); 657 658 pp1cAe = vec_add(pp1Ae, v512si); 659 pp1cAo = vec_add(pp1Ao, v512si); 660 pp1cBe = vec_add(pp1Be, v512si); 661 pp1cBo = vec_add(pp1Bo, v512si); 662 663 pp32Ae = vec_sub(pp3Ae, pp2Ae); 664 pp32Ao = vec_sub(pp3Ao, pp2Ao); 665 pp32Be = vec_sub(pp3Be, pp2Be); 666 pp32Bo = vec_sub(pp3Bo, pp2Bo); 667 668 sumAe = vec_add(pp1cAe, pp32Ae); 669 sumAo = vec_add(pp1cAo, pp32Ao); 670 sumBe = vec_add(pp1cBe, pp32Be); 671 sumBo = vec_add(pp1cBo, pp32Bo); 672 673 ssumAe = vec_sra(sumAe, v10ui); 674 ssumAo = vec_sra(sumAo, v10ui); 675 ssumBe = vec_sra(sumBe, v10ui); 676 ssumBo = vec_sra(sumBo, v10ui); 677 678 ssume = vec_packs(ssumAe, ssumBe); 679 ssumo = vec_packs(ssumAo, ssumBo); 680 681 sumv = vec_packsu(ssume, ssumo); 682 sum = vec_perm(sumv, sumv, mperm); 683 684 ASSERT_ALIGNED(dst); 685 vdst = vec_ld(0, dst); 686 687 OP_U8_ALTIVEC(fsum, sum, vdst); 688 689 vec_st(fsum, 0, dst); 690 691 dst += dstStride; 692 } 693 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); 694} 695