/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <assert.h> /* for ASSERT_ALIGNED() in debug builds */

#include "libavutil/mem.h"

#ifdef DEBUG
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F)); /* assert 16-byte alignment */
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif
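/* For reference only: a scalar sketch (not used by the code) of what each
 * output sample of the horizontal lowpass above and the vertical lowpass
 * below computes, i.e. the H.264 six-tap half-sample filter with taps
 * (1, -5, 20, 20, -5, 1), rounding by 16 and a shift by 5:
 *
 *     sum = srcM2 - 5*srcM1 + 20*srcP0 + 20*srcP1 - 5*srcP2 + srcP3;
 *     dst = clip_uint8((sum + 16) >> 5);   // clip_uint8() is an illustrative name
 *
 * In the vector code, v20ss/v5ss hold the taps, v16ss the rounding term,
 * v5us the shift amount, and vec_packsu() performs the final clipping.
 */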
/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif
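/* The 2D (hv) case below works in two passes, following the reference
 * H.264 interpolation for the centre half-sample position: the first pass
 * runs the horizontal six-tap filter over 16+5 rows and stores the raw
 * (unrounded, unshifted) 16-bit sums in tmp; the second pass applies the
 * same taps vertically to those intermediates in 32-bit precision, then
 * adds 512 and shifts right by 10 before packing back to bytes.
 */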
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }
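    /* Second pass: prime a six-row sliding window of intermediates from tmp.
     * tmpM2..tmpP2 are loaded once here; tmpP3 is loaded inside the loop and
     * the window is shifted down by one row per iteration, so each row of
     * tmp is read only once. */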
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
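    /* The intermediate sums no longer fit in 16 bits, so the vertical filter
     * is evaluated in 32-bit lanes: vec_mule()/vec_mulo() produce the even
     * and odd products, and the shift-by-16 / multiply-by-one pair below
     * sign-extends sum3 the same way. mperm re-interleaves the even and odd
     * results after the final pack. */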
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif