/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/hpeldsp.h"
#include "hpeldsp_altivec.h"

#if HAVE_ALTIVEC
/* Copy an h-row block of 16 pixels per row from 'pixels' to 'block'.
 * Unaligned source rows are handled with the classic vec_lvsl/vec_perm
 * idiom; 'block' stores go through vec_st, so the destination is assumed
 * 16-byte aligned (next one assumes that ((line_size % 16) == 0)). */
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    /* Alignment permute vector is loop-invariant because 'pixels' only
     * advances by multiples of line_size (assumed 16-byte multiple). */
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register ptrdiff_t line_size_2 = line_size << 1;
    register ptrdiff_t line_size_3 = line_size + line_size_2;
    register ptrdiff_t line_size_4 = line_size << 2;

// hand-unrolling the loop by 4 gains about 15%
// minimum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        /* Each row needs two 16-byte loads (offset 0 and 15) so that
         * vec_perm can extract the 16 possibly-unaligned source bytes. */
        pixelsv1  = vec_ld( 0, pixels);
        pixelsv2  = vec_ld(15, pixels);
        pixelsv1B = vec_ld(line_size, pixels);
        pixelsv2B = vec_ld(15 + line_size, pixels);
        pixelsv1C = vec_ld(line_size_2, pixels);
        pixelsv2C = vec_ld(15 + line_size_2, pixels);
        pixelsv1D = vec_ld(line_size_3, pixels);
        pixelsv2D = vec_ld(15 + line_size_3, pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels+=line_size_4;
        block +=line_size_4;
    }
}

/* next one assumes that ((line_size % 16) == 0) */
/* NOTE(review): op_avg is unused in this file — likely leftover from the
 * scalar template this code was derived from. */
#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
/* Average an h-row block of 16 pixels per row from 'pixels' into 'block'
 * (block = rounded average of block and pixels, via vec_avg). */
void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16,pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        /* vec_avg rounds up: (a + b + 1) >> 1 per byte. */
        blockv = vec_avg(blockv,pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }
}

/* Average an h-row block of 8 pixels per row from 'pixels' into 'block'.
 * next one assumes that ((line_size % 8) == 0) */
static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    for (i = 0; i < h; i++) {
        /* block is 8 bytes-aligned, so we're either in the
           left block (16 bytes-aligned) or in the right block (not) */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        /* Merge the 8 new pixels into the half of blockv that the
         * destination occupies, leaving the other half untouched so the
         * full 16-byte store below doesn't clobber neighbouring data. */
        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        } else {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }
}

/* Half-pel (x+1/2, y+1/2) interpolation with normal rounding for an
 * 8-wide block: dst = (p[x,y] + p[x+1,y] + p[x,y+1] + p[x+1,y+1] + 2) >> 2.
 * The horizontal pair-sum of the previous row is kept in pixelssum1 so
 * each row only computes one new pair-sum.
 * next one assumes that ((line_size % 8) == 0) */
static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime the pipeline with row 0: pixelsv1 = p[x], pixelsv2 = p[x+1]. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        /* pixels+1 is 16-byte aligned: the shifted row is exactly temp2,
         * and vec_perm with lvsl(1,...) must be avoided here. */
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    /* Widen bytes to unsigned shorts (high halves) so sums can't overflow. */
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    /* Fold the +2 rounding bias into the carried sum. */
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        /* Carry this row's pair-sum (+2 bias) into the next iteration. */
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* 8-byte destination: splice the result into the correct half of
         * the 16-byte block vector before storing. */
        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}

/* Same as put_pixels8_xy2_altivec but with "no rounding": the bias added
 * each row is 1 (vcone) instead of 2, so dst = (sum + 1) >> 2.
 * next one assumes that ((line_size % 8) == 0) */
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        /* vec_sra by 2 = the >> 2 of the 4-tap average. */
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}

/* 16-wide variant of put_pixels8_xy2_altivec: the widened row is split
 * into low (mergeh -> pixelssum1/2) and high (mergel -> pixelssum3/4)
 * 8-pixel halves, processed in parallel and repacked with vec_packsu.
 * next one assumes that ((line_size % 16) == 0) */
static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        /* Repack low half (temp3) and high half (temp4) into 16 bytes. */
        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}

/* 16-wide no-rounding variant: identical to put_pixels16_xy2_altivec
 * except the per-row bias is vcone (1) instead of vctwo (2).
 * next one assumes that ((line_size % 16) == 0) */
static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}

/* Averaging version of put_pixels8_xy2_altivec: the half-pel interpolated
 * result is first computed, then vec_avg'ed with the existing 'block'
 * contents before the store.
 * next one assumes that ((line_size % 8) == 0) */
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;

    register const vector unsigned char vczero = (const vector unsigned char)
                                        vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
                                        vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* Place the interpolated 8 bytes into the destination half, then
         * average with the untouched block contents. */
        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
#endif /* HAVE_ALTIVEC */

/* Install the AltiVec hpeldsp implementations into 'c' when the build has
 * AltiVec support and the runtime CPU reports the AltiVec flag.
 * Table indices: [0]=16-pixel wide, [1]=8-pixel wide; [0]=copy, [3]=xy2
 * (half-pel in both directions). 'flags' is currently unused here. */
av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_altivec;
    c->avg_pixels_tab[1][0]        = avg_pixels8_altivec;
    c->avg_pixels_tab[1][3]        = avg_pixels8_xy2_altivec;

    c->put_pixels_tab[0][0]        = ff_put_pixels16_altivec;
    c->put_pixels_tab[1][3]        = put_pixels8_xy2_altivec;
    c->put_pixels_tab[0][3]        = put_pixels16_xy2_altivec;

    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
#endif /* HAVE_ALTIVEC */
}