#ifndef AVCODEC_PPC_FFT_VSX_H
#define AVCODEC_PPC_FFT_VSX_H
/*
 * FFT transform, optimized with VSX built-in functions
 * Copyright (c) 2014 Rong Yan
 * Copyright (c) 2009 Loren Merritt
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
#include "libavcodec/fft-internal.h"

#if HAVE_VSX

void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z);

#define byte_2complex  (2*sizeof(FFTComplex))
#define byte_4complex  (4*sizeof(FFTComplex))
#define byte_6complex  (6*sizeof(FFTComplex))
#define byte_8complex  (8*sizeof(FFTComplex))
#define byte_10complex (10*sizeof(FFTComplex))
#define byte_12complex (12*sizeof(FFTComplex))
#define byte_14complex (14*sizeof(FFTComplex))

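/* Generic split-radix combine pass for the interleaved (r,i,r,i,...)
 * layout: it merges the quarter-size sub-transforms at z[o2] and z[o3]
 * into the transforms at z[0] and z[o1], rotating them by twiddle factors
 * read forwards from wre and backwards from wim (the two halves of the
 * shared cosine table).  The first group of butterflies, whose leading
 * twiddle is 1 and so needs no multiply, is peeled off ahead of the
 * do/while loop, which is why n is pre-decremented by 2.
 */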
inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n)
{
    int o1 = n<<1;
    int o2 = n<<2;
    int o3 = o1+o2;
    int i1, i2, i3;
    FFTSample *out = (FFTSample *)z;
    const FFTSample *wim = wre+o1;
    vec_f vz0, vzo1, vzo2, vzo3;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
    vec_f y0, y1, y2, y3;
    vec_f y4, y5, y8, y9;
    vec_f y10, y13, y14, y15;
    vec_f y16, y17, y18, y19;
    vec_f y20, y21, y22, y23;
    vec_f wr1, wi1, wr0, wi0;
    vec_f wr2, wi2, wr3, wi3;
    vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;

    n = n-2;
    i1 = o1*sizeof(FFTComplex);
    i2 = o2*sizeof(FFTComplex);
    i3 = o3*sizeof(FFTComplex);
    vzo2 = vec_ld(i2, &(out[0]));       // zo2.r zo2.i z(o2+1).r z(o2+1).i
    vzo2plus1 = vec_ld(i2+16, &(out[0]));
    vzo3 = vec_ld(i3, &(out[0]));       // zo3.r zo3.i z(o3+1).r z(o3+1).i
    vzo3plus1 = vec_ld(i3+16, &(out[0]));
    vz0 = vec_ld(0, &(out[0]));         // z0.r z0.i z1.r z1.i
    vz0plus1 = vec_ld(16, &(out[0]));
    vzo1 = vec_ld(i1, &(out[0]));       // zo1.r zo1.i z(o1+1).r z(o1+1).i
    vzo1plus1 = vec_ld(i1+16, &(out[0]));

    x0 = vec_add(vzo2, vzo3);
    x1 = vec_sub(vzo2, vzo3);
    y0 = vec_add(vzo2plus1, vzo3plus1);
    y1 = vec_sub(vzo2plus1, vzo3plus1);

    wr1 = vec_splats(wre[1]);
    wi1 = vec_splats(wim[-1]);
    wi2 = vec_splats(wim[-2]);
    wi3 = vec_splats(wim[-3]);
    wr2 = vec_splats(wre[2]);
    wr3 = vec_splats(wre[3]);

    x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
    x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2));

    y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
    y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
    y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
    y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));

    ymulwi2 = vec_mul(y4, wi2);
    ymulwi3 = vec_mul(y5, wi3);
    x4 = vec_mul(x2, wr1);
    x5 = vec_mul(x3, wi1);
    y8 = vec_madd(y2, wr2, ymulwi2);
    y9 = vec_msub(y2, wr2, ymulwi2);
    x6 = vec_add(x4, x5);
    x7 = vec_sub(x4, x5);
    y13 = vec_madd(y3, wr3, ymulwi3);
    y14 = vec_msub(y3, wr3, ymulwi3);

    x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3));
    y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
    y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));

    x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2));
    x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1));

    y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
    y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));

    x11 = vec_add(vz0, x9);
    x12 = vec_sub(vz0, x9);
    x13 = vec_add(vzo1, x10);
    x14 = vec_sub(vzo1, x10);

    y18 = vec_add(vz0plus1, y16);
    y19 = vec_sub(vz0plus1, y16);
    y20 = vec_add(vzo1plus1, y17);
    y21 = vec_sub(vzo1plus1, y17);

    x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3));
    x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3));
    y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
    y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));

    vec_st(x11, 0, &(out[0]));
    vec_st(y18, 16, &(out[0]));
    vec_st(x15, i1, &(out[0]));
    vec_st(y22, i1+16, &(out[0]));
    vec_st(x12, i2, &(out[0]));
    vec_st(y19, i2+16, &(out[0]));
    vec_st(x16, i3, &(out[0]));
    vec_st(y23, i3+16, &(out[0]));

    do {
        out += 8;
        wre += 4;
        wim -= 4;
        wr0 = vec_splats(wre[0]);
        wr1 = vec_splats(wre[1]);
        wi0 = vec_splats(wim[0]);
        wi1 = vec_splats(wim[-1]);

        wr2 = vec_splats(wre[2]);
        wr3 = vec_splats(wre[3]);
        wi2 = vec_splats(wim[-2]);
        wi3 = vec_splats(wim[-3]);

        vzo2 = vec_ld(i2, &(out[0]));   // zo2.r zo2.i z(o2+1).r z(o2+1).i
        vzo2plus1 = vec_ld(i2+16, &(out[0]));
        vzo3 = vec_ld(i3, &(out[0]));   // zo3.r zo3.i z(o3+1).r z(o3+1).i
        vzo3plus1 = vec_ld(i3+16, &(out[0]));
        vz0 = vec_ld(0, &(out[0]));     // z0.r z0.i z1.r z1.i
        vz0plus1 = vec_ld(16, &(out[0]));
        vzo1 = vec_ld(i1, &(out[0]));   // zo1.r zo1.i z(o1+1).r z(o1+1).i
        vzo1plus1 = vec_ld(i1+16, &(out[0]));

        x0 = vec_add(vzo2, vzo3);
        x1 = vec_sub(vzo2, vzo3);

        y0 = vec_add(vzo2plus1, vzo3plus1);
        y1 = vec_sub(vzo2plus1, vzo3plus1);

        x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0));
        x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
        x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1));
        x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3));

        y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
        y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
        xmulwi0 = vec_mul(x4, wi0);
        xmulwi1 = vec_mul(x5, wi1);

        y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
        y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));

        x8 = vec_madd(x2, wr0, xmulwi0);
        x9 = vec_msub(x2, wr0, xmulwi0);
        ymulwi2 = vec_mul(y4, wi2);
        ymulwi3 = vec_mul(y5, wi3);

        x13 = vec_madd(x3, wr1, xmulwi1);
        x14 = vec_msub(x3, wr1, xmulwi1);

        y8 = vec_madd(y2, wr2, ymulwi2);
        y9 = vec_msub(y2, wr2, ymulwi2);
        y13 = vec_madd(y3, wr3, ymulwi3);
        y14 = vec_msub(y3, wr3, ymulwi3);

        x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3));
        x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3));

        y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
        y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));

        x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2));
        x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1));

        y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
        y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));

        x18 = vec_add(vz0, x16);
        x19 = vec_sub(vz0, x16);
        x20 = vec_add(vzo1, x17);
        x21 = vec_sub(vzo1, x17);

        y18 = vec_add(vz0plus1, y16);
        y19 = vec_sub(vz0plus1, y16);
        y20 = vec_add(vzo1plus1, y17);
        y21 = vec_sub(vzo1plus1, y17);

        x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3));
        x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3));

        y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
        y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));

        vec_st(x18, 0, &(out[0]));
        vec_st(y18, 16, &(out[0]));
        vec_st(x22, i1, &(out[0]));
        vec_st(y22, i1+16, &(out[0]));
        vec_st(x19, i2, &(out[0]));
        vec_st(y19, i2+16, &(out[0]));
        vec_st(x23, i3, &(out[0]));
        vec_st(y23, i3+16, &(out[0]));
    } while (n-=2);
}

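/* Fixed-size base cases.  A 2-point transform is too small to fill a
 * 128-bit VSX register with useful work, so it stays scalar; the 4-, 8-
 * and 16-point versions keep the whole transform in registers and turn
 * every butterfly stage into a vec_perm shuffle followed by
 * vec_add/vec_sub (plus vec_mul/vec_madd where twiddles appear).
 */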
inline static void fft2_vsx_interleave(FFTComplex *z)
{
    FFTSample r1, i1;

    r1 = z[0].re - z[1].re;
    z[0].re += z[1].re;
    z[1].re = r1;

    i1 = z[0].im - z[1].im;
    z[0].im += z[1].im;
    z[1].im = i1;
}

inline static void fft4_vsx_interleave(FFTComplex *z)
{
    vec_f a, b, c, d;
    float *out = (float *)z;
    a = vec_ld(0, &(out[0]));
    b = vec_ld(byte_2complex, &(out[0]));

    c = vec_perm(a, b, vcprm(0,1,s2,s1));
    d = vec_perm(a, b, vcprm(2,3,s0,s3));
    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a, b, vcprm(0,1,s0,s1));
    d = vec_perm(a, b, vcprm(2,3,s3,s2));

    a = vec_add(c, d);
    b = vec_sub(c, d);
    vec_st(a, 0, &(out[0]));
    vec_st(b, byte_2complex, &(out[0]));
}

inline static void fft8_vsx_interleave(FFTComplex *z)
{
    vec_f vz0, vz1, vz2, vz3;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f x24, x25, x26, x27;
    vec_f x28, x29, x30, x31;
    vec_f x32, x33, x34;

    float *out = (float *)z;
    vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));

    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1));
    x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3));

    x4 = vec_add(x0, x1);
    x5 = vec_sub(x0, x1);
    x6 = vec_add(x2, x3);
    x7 = vec_sub(x2, x3);

    x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1));
    x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2));
    x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1));
    x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3));

    x12 = vec_add(x8, x9);
    x13 = vec_sub(x8, x9);
    x14 = vec_add(x10, x11);
    x15 = vec_sub(x10, x11);
    x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1));
    x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1));
    x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1));
    x19 = vec_add(x16, x18); // z0.r z2.r z0.i z2.i
    x20 = vec_sub(x16, x18); // z4.r z6.r z4.i z6.i

    x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3));
    x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3));
    x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2));
    x24 = vec_add(x22, x23);
    x25 = vec_sub(x22, x23);
    x26 = vec_mul(vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1);

    x27 = vec_add(x21, x26); // z1.r z7.r z1.i z3.i
    x28 = vec_sub(x21, x26); // z5.r z3.r z5.i z7.i

    x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2));  // z0.r z0.i z1.r z1.i
    x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3));  // z2.r z2.i z7.r z3.i
    x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2));  // z4.r z4.i z5.r z5.i
    x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3));  // z6.r z6.i z3.r z7.i
    x33 = vec_perm(x30, x32, vcprm(0,1,s2,3));   // z2.r z2.i z3.r z3.i
    x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r z6.i z7.r z7.i

    vec_st(x29, 0, &(out[0]));
    vec_st(x33, byte_2complex, &(out[0]));
    vec_st(x31, byte_4complex, &(out[0]));
    vec_st(x34, byte_6complex, &(out[0]));
}

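/* Besides sqrthalf (cos(pi/4)), the 16-point transform needs the
 * twiddles cos(pi/8) and cos(3*pi/8); these come from the shared
 * ff_cos_16 table (entries 1 and 3) instead of being written out as
 * literals.
 */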
inline static void fft16_vsx_interleave(FFTComplex *z)
{
    float *out = (float *)z;
    vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
    vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]};
    vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]};
    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f x24, x25, x26, x27;
    vec_f x28, x29, x30, x31;
    vec_f x32, x33, x34, x35;
    vec_f x36, x37, x38, x39;
    vec_f x40, x41, x42, x43;
    vec_f x44, x45, x46, x47;
    vec_f x48, x49, x50, x51;
    vec_f x52, x53, x54, x55;
    vec_f x56, x57, x58, x59;
    vec_f x60, x61, x62, x63;
    vec_f x64, x65, x66, x67;
    vec_f x68, x69, x70, x71;
    vec_f x72, x73, x74, x75;
    vec_f x76, x77, x78, x79;
    vec_f x80, x81, x82, x83;
    vec_f x84, x85, x86;

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));
    vz4 = vec_ld(byte_8complex, &(out[0]));
    vz5 = vec_ld(byte_10complex, &(out[0]));
    vz6 = vec_ld(byte_12complex, &(out[0]));
    vz7 = vec_ld(byte_14complex, &(out[0]));

    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
    x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));

    x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1));
    x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3));
    x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1));
    x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3));

    x8 = vec_add(x0, x1);
    x9 = vec_sub(x0, x1);
    x10 = vec_add(x2, x3);
    x11 = vec_sub(x2, x3);

    x12 = vec_add(x4, x5);
    x13 = vec_sub(x4, x5);
    x14 = vec_add(x6, x7);
    x15 = vec_sub(x6, x7);

    x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1));
    x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2));
    x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2));
    x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3));
    x20 = vec_perm(x12, x14, vcprm(0,1,s0,s1));
    x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3));
    x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1));
    x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2));

    x24 = vec_add(x16, x17);
    x25 = vec_sub(x16, x17);
    x26 = vec_add(x18, x19);
    x27 = vec_sub(x18, x19);
    x28 = vec_add(x20, x21);
    x29 = vec_sub(x20, x21);
    x30 = vec_add(x22, x23);
    x31 = vec_sub(x22, x23);

    x32 = vec_add(x24, x26);
    x33 = vec_sub(x24, x26);
    x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1));

    x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2));
    x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3));
    x37 = vec_add(x35, x36);
    x38 = vec_sub(x35, x36);
    x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0));

    x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3));
    x41 = vec_perm(x26, x37, vcprm(2,3,s3,s2));
    x42 = vec_add(x40, x41);
    x43 = vec_sub(x40, x41);
    x44 = vec_mul(x42, vc0);
    x45 = vec_mul(x43, vc0);

    x46 = vec_add(x34, x39); // z0.r z0.i z4.r z4.i
    x47 = vec_sub(x34, x39); // z8.r z8.i z12.r z12.i

    x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2));
    x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0));
    x50 = vec_add(x48, x49);
    x51 = vec_sub(x48, x49);
    x52 = vec_mul(x50, vc1);
    x53 = vec_mul(x50, vc2);
    x54 = vec_mul(x51, vc1);
    x55 = vec_mul(x51, vc2);

    x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3));
    x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0));
    x58 = vec_add(x56, x57);
    x59 = vec_sub(x56, x57);

    x60 = vec_perm(x54, x55, vcprm(1,0,3,2));
    x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2));
    x62 = vec_add(x52, x61);
    x63 = vec_sub(x52, x61);
    x64 = vec_add(x60, x53);
    x65 = vec_sub(x60, x53);
    x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2));
    x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2));

    x68 = vec_add(x58, x66); // z1.r z1.i z3.r z3.i
    x69 = vec_sub(x58, x66); // z9.r z9.i z11.r z11.i
    x70 = vec_add(x59, x67); // z5.r z5.i z15.r z15.i
    x71 = vec_sub(x59, x67); // z13.r z13.i z7.r z7.i

    x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3));
    x73 = vec_add(x25, x72);
    x74 = vec_sub(x25, x72);
    x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1));
    x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3));
    x77 = vec_add(x75, x76); // z2.r z2.i z6.r z6.i
    x78 = vec_sub(x75, x76); // z10.r z10.i z14.r z14.i

    x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r z0.i z1.r z1.i
    x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r z2.i z3.r z3.i
    x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r z4.i z5.r z5.i
    x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r z6.i z7.r z7.i
    vec_st(x79, 0, &(out[0]));
    vec_st(x80, byte_2complex, &(out[0]));
    vec_st(x81, byte_4complex, &(out[0]));
    vec_st(x82, byte_6complex, &(out[0]));
    x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r z8.i z9.r z9.i
    x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r z10.i z11.r z11.i
    x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r z12.i z13.r z13.i
    x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r z14.i z15.r z15.i
    vec_st(x83, byte_8complex, &(out[0]));
    vec_st(x84, byte_10complex, &(out[0]));
    vec_st(x85, byte_12complex, &(out[0]));
    vec_st(x86, byte_14complex, &(out[0]));
}

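/* The remaining *_vsx variants implement the same transforms for the
 * non-interleaved code path (presumably the one reached through
 * ff_fft_calc_vsx()): each group of four complexes is kept swizzled as
 * four reals followed by the four matching imaginaries (see the r0/i0
 * comments in pass_vsx() below), which changes only the permute masks,
 * not the arithmetic.
 */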
inline static void fft4_vsx(FFTComplex *z)
{
    vec_f a, b, c, d;
    float *out = (float *)z;
    a = vec_ld(0, &(out[0]));
    b = vec_ld(byte_2complex, &(out[0]));

    c = vec_perm(a, b, vcprm(0,1,s2,s1));
    d = vec_perm(a, b, vcprm(2,3,s0,s3));
    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a, b, vcprm(0,s0,1,s1));
    d = vec_perm(a, b, vcprm(2,s3,3,s2));

    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a, b, vcprm(0,1,s0,s1));
    d = vec_perm(a, b, vcprm(2,3,s2,s3));

    vec_st(c, 0, &(out[0]));
    vec_st(d, byte_2complex, &(out[0]));
}

inline static void fft8_vsx(FFTComplex *z)
{
    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7, vz8;

    float *out = (float *)z;
    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));

    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));

    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);
    vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);

    vz3 = vec_madd(vz3, vc1, vc0);
    vz3 = vec_madd(vz8, vc2, vz3);

    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
    vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
    vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
    vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));

    vz2 = vec_sub(vz4, vz6);
    vz3 = vec_sub(vz5, vz7);

    vz0 = vec_add(vz4, vz6);
    vz1 = vec_add(vz5, vz7);

    vec_st(vz0, 0, &(out[0]));
    vec_st(vz1, byte_2complex, &(out[0]));
    vec_st(vz2, byte_4complex, &(out[0]));
    vec_st(vz3, byte_6complex, &(out[0]));
}

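/* In the 16-point version below, vc3 packs cos(k*pi/8) and vc4/vc5 pack
 * +/-sin(k*pi/8) for k = 0..3 (0.92387953 ~ cos(pi/8), 0.38268343 ~
 * cos(3*pi/8) = sin(pi/8)), so a single vec_madd applies four different
 * twiddle factors at once.
 */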
inline static void fft16_vsx(FFTComplex *z)
{
    float *out = (float *)z;
    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
    vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343};
    vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953};
    vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953};

    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7;
    vec_f vz8, vz9, vz10, vz11;
    vec_f vz12, vz13;

    vz0 = vec_ld(byte_8complex, &(out[0]));
    vz1 = vec_ld(byte_10complex, &(out[0]));
    vz2 = vec_ld(byte_12complex, &(out[0]));
    vz3 = vec_ld(byte_14complex, &(out[0]));

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));

    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));
    vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));

    vz2 = vec_add(vz10, vz11);
    vz3 = vec_sub(vz10, vz11);
    vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
    vz0 = vec_add(vz8, vz9);
    vz1 = vec_sub(vz8, vz9);

    vz3 = vec_madd(vz3, vc1, vc0);
    vz3 = vec_madd(vz12, vc2, vz3);
    vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
    vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));

    vz0 = vec_add(vz8, vz9);
    vz1 = vec_sub(vz8, vz9);
    vz2 = vec_add(vz10, vz11);
    vz3 = vec_sub(vz10, vz11);

    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
    vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
    vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));

    vz2 = vec_sub(vz8, vz10);
    vz3 = vec_sub(vz9, vz11);
    vz0 = vec_add(vz8, vz10);
    vz1 = vec_add(vz9, vz11);

    vz8 = vec_madd(vz4, vc3, vc0);
    vz9 = vec_madd(vz5, vc3, vc0);
    vz10 = vec_madd(vz6, vc3, vc0);
    vz11 = vec_madd(vz7, vc3, vc0);

    vz8 = vec_madd(vz5, vc4, vz8);
    vz9 = vec_madd(vz4, vc5, vz9);
    vz10 = vec_madd(vz7, vc5, vz10);
    vz11 = vec_madd(vz6, vc4, vz11);

    vz12 = vec_sub(vz10, vz8);
    vz10 = vec_add(vz10, vz8);

    vz13 = vec_sub(vz9, vz11);
    vz11 = vec_add(vz9, vz11);

    vz4 = vec_sub(vz0, vz10);
    vz0 = vec_add(vz0, vz10);

    vz7 = vec_sub(vz3, vz12);
    vz3 = vec_add(vz3, vz12);

    vz5 = vec_sub(vz1, vz11);
    vz1 = vec_add(vz1, vz11);

    vz6 = vec_sub(vz2, vz13);
    vz2 = vec_add(vz2, vz13);

    vec_st(vz0, 0, &(out[0]));
    vec_st(vz1, byte_2complex, &(out[0]));
    vec_st(vz2, byte_4complex, &(out[0]));
    vec_st(vz3, byte_6complex, &(out[0]));
    vec_st(vz4, byte_8complex, &(out[0]));
    vec_st(vz5, byte_10complex, &(out[0]));
    vec_st(vz6, byte_12complex, &(out[0]));
    vec_st(vz7, byte_14complex, &(out[0]));
}

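/* Combine pass for the swizzled layout.  With four reals in one vector
 * and the matching imaginaries in the next, the complex rotations reduce
 * to fused multiply-adds: z[o2] is rotated by the conjugate twiddle
 * (r*wre + i*wim, i*wre - r*wim) and z[o3] by the twiddle itself
 * (r*wre - i*wim, i*wre + r*wim), mirroring the scalar pass.  Only wim
 * needs the reversing vec_perm on v9, because that half of the table is
 * traversed backwards.
 */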
inline static void pass_vsx(FFTComplex *z, const FFTSample *wre, unsigned int n)
{
    int o1 = n<<1;
    int o2 = n<<2;
    int o3 = o1+o2;
    int i1, i2, i3;
    FFTSample *out = (FFTSample *)z;
    const FFTSample *wim = wre+o1;
    vec_f v0, v1, v2, v3;
    vec_f v4, v5, v6, v7;
    vec_f v8, v9, v10, v11;
    vec_f v12, v13;

    n = n-2;
    i1 = o1*sizeof(FFTComplex);
    i2 = o2*sizeof(FFTComplex);
    i3 = o3*sizeof(FFTComplex);

    v8 = vec_ld(0, &(wre[0]));
    v10 = vec_ld(0, &(wim[0]));
    v9 = vec_ld(0, &(wim[-4]));
    v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));

    v4 = vec_ld(i2, &(out[0]));
    v5 = vec_ld(i2+16, &(out[0]));
    v6 = vec_ld(i3, &(out[0]));
    v7 = vec_ld(i3+16, &(out[0]));
    v10 = vec_mul(v4, v8);          // r2*wre
    v11 = vec_mul(v5, v8);          // i2*wre
    v12 = vec_mul(v6, v8);          // r3*wre
    v13 = vec_mul(v7, v8);          // i3*wre

    v0 = vec_ld(0, &(out[0]));      // r0
    v3 = vec_ld(i1+16, &(out[0]));  // i1
    v10 = vec_madd(v5, v9, v10);    // r2*wre + i2*wim
    v11 = vec_nmsub(v4, v9, v11);   // i2*wre - r2*wim
    v12 = vec_nmsub(v7, v9, v12);   // r3*wre - i3*wim
    v13 = vec_madd(v6, v9, v13);    // i3*wre + r3*wim

    v1 = vec_ld(16, &(out[0]));     // i0
    v2 = vec_ld(i1, &(out[0]));     // r1
    v8 = vec_sub(v12, v10);
    v12 = vec_add(v12, v10);
    v9 = vec_sub(v11, v13);
    v13 = vec_add(v11, v13);
    v4 = vec_sub(v0, v12);
    v0 = vec_add(v0, v12);
    v7 = vec_sub(v3, v8);
    v3 = vec_add(v3, v8);

    vec_st(v0, 0, &(out[0]));       // r0
    vec_st(v3, i1+16, &(out[0]));   // i1
    vec_st(v4, i2, &(out[0]));      // r2
    vec_st(v7, i3+16, &(out[0]));   // i3

    v5 = vec_sub(v1, v13);
    v1 = vec_add(v1, v13);
    v6 = vec_sub(v2, v9);
    v2 = vec_add(v2, v9);

    vec_st(v1, 16, &(out[0]));      // i0
    vec_st(v2, i1, &(out[0]));      // r1
    vec_st(v5, i2+16, &(out[0]));   // i2
    vec_st(v6, i3, &(out[0]));      // r3

    do {
        out += 8;
        wre += 4;
        wim -= 4;

        v8 = vec_ld(0, &(wre[0]));
        v10 = vec_ld(0, &(wim[0]));
        v9 = vec_ld(0, &(wim[-4]));
        v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));

        v4 = vec_ld(i2, &(out[0]));     // r2
        v5 = vec_ld(i2+16, &(out[0]));  // i2
        v6 = vec_ld(i3, &(out[0]));     // r3
        v7 = vec_ld(i3+16, &(out[0]));  // i3
        v10 = vec_mul(v4, v8);          // r2*wre
        v11 = vec_mul(v5, v8);          // i2*wre
        v12 = vec_mul(v6, v8);          // r3*wre
        v13 = vec_mul(v7, v8);          // i3*wre

        v0 = vec_ld(0, &(out[0]));      // r0
        v3 = vec_ld(i1+16, &(out[0]));  // i1
        v10 = vec_madd(v5, v9, v10);    // r2*wre + i2*wim
        v11 = vec_nmsub(v4, v9, v11);   // i2*wre - r2*wim
        v12 = vec_nmsub(v7, v9, v12);   // r3*wre - i3*wim
        v13 = vec_madd(v6, v9, v13);    // i3*wre + r3*wim

        v1 = vec_ld(16, &(out[0]));     // i0
        v2 = vec_ld(i1, &(out[0]));     // r1
        v8 = vec_sub(v12, v10);
        v12 = vec_add(v12, v10);
        v9 = vec_sub(v11, v13);
        v13 = vec_add(v11, v13);
        v4 = vec_sub(v0, v12);
        v0 = vec_add(v0, v12);
        v7 = vec_sub(v3, v8);
        v3 = vec_add(v3, v8);

        vec_st(v0, 0, &(out[0]));       // r0
        vec_st(v3, i1+16, &(out[0]));   // i1
        vec_st(v4, i2, &(out[0]));      // r2
        vec_st(v7, i3+16, &(out[0]));   // i3

        v5 = vec_sub(v1, v13);
        v1 = vec_add(v1, v13);
        v6 = vec_sub(v2, v9);
        v2 = vec_add(v2, v9);

        vec_st(v1, 16, &(out[0]));      // i0
        vec_st(v2, i1, &(out[0]));      // r1
        vec_st(v5, i2+16, &(out[0]));   // i2
        vec_st(v6, i3, &(out[0]));      // r3
    } while (n-=2);
}

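/* These helpers are only the building blocks: the recursion over
 * transform sizes and the FFTContext dispatch are expected to live in the
 * accompanying .c file, behind the ff_fft_calc_vsx() /
 * ff_fft_calc_interleave_vsx() prototypes declared at the top of this
 * header.
 */
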
#endif /* HAVE_VSX */

#endif /* AVCODEC_PPC_FFT_VSX_H */