/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifdef DEBUG
#define ASSERT_ALIGNED(ptr) assert(((unsigned long)ptr&0x0000000F) == 0);
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes that stride % 16 == 0 */

#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

#define noop(a) a
#define add28(a) vec_add(v28ss, a)
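/*
 * CHROMA_MC8_ALTIVEC_CORE emits one row of the bilinear chroma
 * interpolation
 *
 *     dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1]
 *               + bias) >> 6
 *
 * with the weights A..D taken from ABCD[] in the functions below.  BIAS1 and
 * BIAS2 place the rounding constant either inside the multiply-accumulate
 * chain (+32 for H.264) or after it (+28 for the VC-1 no-rounding variant).
 * The eight result bytes are merged back into the destination vector through
 * fperm so that only the addressed 8-byte block is overwritten, and
 * OP_U8_ALTIVEC is the store policy (put or avg) supplied by the file that
 * includes this template.
 */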
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}
#endif

/* this code assumes that stride % 16 == 0 */
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
#endif

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE

/* this code assumes stride % 16 == 0 */
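/*
 * The qpel lowpass functions below implement the H.264 6-tap half-pel
 * filter with taps (1, -5, 20, 20, -5, 1):
 *
 *     sum = 20*(P0 + P1) - 5*(M1 + P2) + (M2 + P3)
 *     dst = clip_uint8((sum + 16) >> 5)
 *
 * The horizontal version runs the filter along each row; the vertical
 * version applies the same taps down each column.  Misaligned sources are
 * handled by permuting two aligned vec_ld loads with vec_lvsl, plus a third
 * load for the worst alignments (the case 12..15 branches of the switch).
 */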
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
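    /*
     * Two-pass 2D half-pel filter: the first loop applies the horizontal
     * 6-tap filter to the 21 source rows needed for a 16x16 block and stores
     * the unrounded 16-bit intermediates in tmp; the second loop further
     * down applies the same 6-tap filter vertically to tmp.
     */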
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

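    /*
     * Vertical pass over the 16-bit intermediates.  Multiplying them by 20
     * can overflow 16 bits, so the products are computed as 32-bit even/odd
     * halves with vec_mule/vec_mulo, the combined rounding constant 512 is
     * added, and the total is shifted right by 10 before being saturated
     * back to unsigned bytes and re-interleaved with mperm.
     */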
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif
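/*
 * Usage sketch: the functions above are compiled by defining OP_U8_ALTIVEC
 * and the matching PREFIX_* macro(s) and then including this template once
 * per variant.  The wrapper macro and file names below are illustrative
 * assumptions, shown only to make the convention concrete:
 *
 *   #define PUT_OP_U8_ALTIVEC(d, s, dst)    d = s
 *   #define AVG_OP_U8_ALTIVEC(d, s, dst)    d = vec_avg(s, dst)
 *
 *   #define OP_U8_ALTIVEC                   PUT_OP_U8_ALTIVEC
 *   #define PREFIX_h264_chroma_mc8_altivec  put_h264_chroma_mc8_altivec
 *   #include "h264_template_altivec.c"
 *   #undef OP_U8_ALTIVEC
 *   #undef PREFIX_h264_chroma_mc8_altivec
 *
 *   #define OP_U8_ALTIVEC                   AVG_OP_U8_ALTIVEC
 *   #define PREFIX_h264_chroma_mc8_altivec  avg_h264_chroma_mc8_altivec
 *   #include "h264_template_altivec.c"
 *   #undef OP_U8_ALTIVEC
 *   #undef PREFIX_h264_chroma_mc8_altivec
 */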