/*
 * FFT transform with Altivec optimizations
 * Copyright (c) 2009 Loren Merritt
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * These functions are not individually interchangeable with the C versions.
 * While C takes arrays of FFTComplex, Altivec leaves intermediate results
 * in blocks as convenient to the vector size.
 * i.e. {4x real, 4x imaginary, 4x real, ...}
 *
 * I ignore standard calling convention.
 * Instead, the following registers are treated as global constants:
 * v14: zero
 * v15..v18: cosines
 * v19..v29: permutations
 * r9: 16
 * r12: ff_cos_tabs
 * and the rest are free for local use.
 */

#include "config.h"
#include "asm.S"

.text

.macro addi2 ra, imm // add 32-bit immediate
.if \imm & 0xffff
    addi  \ra, \ra, \imm@l
.endif
.if (\imm+0x8000)>>16
    addis \ra, \ra, \imm@ha
.endif
.endm

.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vaddfp  \a0,\a2,\a3                         // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3                         // {t3,t4,t8,t7}
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vaddfp  \a0,\a2,\a3                         // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3                         // {r2,r3,i2,i3}
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
.endm

.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vperm   \b2,\b0,\b1,v20
    vperm   \b3,\b0,\b1,v21
    vaddfp  \a0,\a2,\a3                         // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3                         // {t3,t4,t8,t7}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vmrghw  \b2,\b0,\b1
    vperm   \b3,\b0,\b1,v22
    vaddfp  \a0,\a2,\a3                         // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3                         // {r2,r3,i2,i3}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
    vperm   \b2,\b0,\b1,v23
    vperm   \b3,\b0,\b1,v24
.endm
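/*
 * For reference, a scalar sketch of the radix-4 butterfly that FFT4
 * performs per lane; the temporaries t1..t8 match the vector comments
 * above, and the shape mirrors fft4() in libavcodec/fft_template.c.
 * This is a hedged illustration only, not code that is assembled here.
 *
 *     typedef struct { float re, im; } FFTComplex;
 *     #define BF(x, y, a, b) do { x = (a) - (b); y = (a) + (b); } while (0)
 *
 *     static void fft4_ref(FFTComplex *z)
 *     {
 *         float t1, t2, t3, t4, t5, t6, t7, t8;
 *         BF(t3, t1, z[0].re, z[1].re);   // t1 = r0+r1, t3 = r0-r1
 *         BF(t8, t6, z[3].re, z[2].re);   // t6 = r3+r2, t8 = r3-r2
 *         BF(z[2].re, z[0].re, t1, t6);
 *         BF(t4, t2, z[0].im, z[1].im);   // t2 = i0+i1, t4 = i0-i1
 *         BF(t7, t5, z[2].im, z[3].im);   // t5 = i2+i3, t7 = i2-i3
 *         BF(z[2].im, z[0].im, t2, t5);
 *         BF(z[3].re, z[1].re, t3, t7);
 *         BF(z[3].im, z[1].im, t4, t8);
 *     }
 */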
.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1
    vmrghw  \b2,\b0,\b1     // vcprm(0,s0,1,s1) // {r4,r6,i4,i6}
    vmrglw  \b3,\b0,\b1     // vcprm(2,s2,3,s3) // {r5,r7,i5,i7}
    vperm   \a2,\a0,\a1,v20 // FFT4 ...
    vperm   \a3,\a0,\a1,v21
    vaddfp  \b0,\b2,\b3                         // {t1,t3,t2,t4}
    vsubfp  \b1,\b2,\b3                         // {r5,r7,i5,i7}
    vperm   \b4,\b1,\b1,v25 // vcprm(2,3,0,1)   // {i5,i7,r5,r7}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2)
    vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9}
    vmrghw  \a2,\a0,\a1
    vperm   \a3,\a0,\a1,v22
    vperm   \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8}
    vperm   \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vaddfp  \b0,\b2,\b3                         // {t1,t2,t9,ta}
    vsubfp  \b1,\b2,\b3                         // {t6,t5,tc,tb}
    vperm   \a2,\a0,\a1,v23
    vperm   \a3,\a0,\a1,v24
    vperm   \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb}
    vperm   \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc}
    vsubfp  \b0,\a2,\b2                         // {r4,r5,r6,r7}
    vsubfp  \b1,\a3,\b3                         // {i4,i5,i6,i7}
    vaddfp  \a0,\a2,\b2                         // {r0,r1,r2,r3}
    vaddfp  \a1,\a3,\b3                         // {i0,i1,i2,i3}
.endm

.macro BF d0,d1,s0,s1
    vsubfp  \d1,\s0,\s1
    vaddfp  \d0,\s0,\s1
.endm

.macro zip d0,d1,s0,s1
    vmrghw  \d0,\s0,\s1
    vmrglw  \d1,\s0,\s1
.endm

.macro def_fft4 interleave
fft4\interleave\()_altivec:
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    FFT4   v0,v1,v2,v3
.ifnb \interleave
    zip    v0,v1,v2,v3
    stvx   v0, 0,r3
    stvx   v1,r9,r3
.else
    stvx   v2, 0,r3
    stvx   v3,r9,r3
.endif
    blr
.endm

.macro def_fft8 interleave
fft8\interleave\()_altivec:
    addi   r4,r3,32
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    lvx    v2, 0,r4
    lvx    v3,r9,r4
    FFT8   v0,v1,v2,v3,v4,v5,v6,v7,v8
.ifnb \interleave
    zip    v4,v5,v0,v1
    zip    v6,v7,v2,v3
    stvx   v4, 0,r3
    stvx   v5,r9,r3
    stvx   v6, 0,r4
    stvx   v7,r9,r4
.else
    stvx   v0, 0,r3
    stvx   v1,r9,r3
    stvx   v2, 0,r4
    stvx   v3,r9,r4
.endif
    blr
.endm

.macro def_fft16 interleave
fft16\interleave\()_altivec:
    addi   r5,r3,64
    addi   r6,r3,96
    addi   r4,r3,32
    lvx    v0, 0,r5
    lvx    v1,r9,r5
    lvx    v2, 0,r6
    lvx    v3,r9,r6
    FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    lvx    v2, 0,r4
    lvx    v3,r9,r4
    FFT8   v0,v1,v2,v3,v8,v9,v10,v11,v12
    vmaddfp   v8,v4,v15,v14  // r2*wre
    vmaddfp   v9,v5,v15,v14  // i2*wre
    vmaddfp  v10,v6,v15,v14  // r3*wre
    vmaddfp  v11,v7,v15,v14  // i3*wre
    vmaddfp   v8,v5,v16,v8   // i2*wim
    vnmsubfp  v9,v4,v16,v9   // r2*wim
    vnmsubfp v10,v7,v16,v10  // i3*wim
    vmaddfp  v11,v6,v16,v11  // r3*wim
    BF     v10,v12,v10,v8
    BF     v11,v13,v9,v11
    BF     v0,v4,v0,v10
    BF     v3,v7,v3,v12
    BF     v1,v5,v1,v11
    BF     v2,v6,v2,v13
.ifnb \interleave
    zip     v8, v9,v0,v1
    zip    v10,v11,v2,v3
    zip    v12,v13,v4,v5
    zip    v14,v15,v6,v7
    stvx    v8, 0,r3
    stvx    v9,r9,r3
    stvx   v10, 0,r4
    stvx   v11,r9,r4
    stvx   v12, 0,r5
    stvx   v13,r9,r5
    stvx   v14, 0,r6
    stvx   v15,r9,r6
.else
    stvx   v0, 0,r3
    stvx   v4, 0,r5
    stvx   v3,r9,r4
    stvx   v7,r9,r6
    stvx   v1,r9,r3
    stvx   v5,r9,r5
    stvx   v2, 0,r4
    stvx   v6, 0,r6
.endif
    blr
.endm

// void pass(float *z, float *wre, int n)
.macro PASS interleave, suffix
fft_pass\suffix\()_altivec:
    mtctr  r5
    slwi   r0,r5,4
    slwi   r7,r5,6   // o2
    slwi   r5,r5,5   // o1
    add   r10,r5,r7  // o3
    add    r0,r4,r0  // wim
    addi   r6,r5,16  // o1+16
    addi   r8,r7,16  // o2+16
    addi  r11,r10,16 // o3+16
1:
    lvx    v8, 0,r4  // wre
    lvx   v10, 0,r0  // wim
    sub    r0,r0,r9
    lvx    v9, 0,r0
    vperm  v9,v9,v10,v19 // vcprm(s0,3,2,1) => wim[0 .. -3]
    lvx    v4,r3,r7      // r2 = z[o2]
    lvx    v5,r3,r8      // i2 = z[o2+16]
    lvx    v6,r3,r10     // r3 = z[o3]
    lvx    v7,r3,r11     // i3 = z[o3+16]
    vmaddfp  v10,v4,v8,v14  // r2*wre
    vmaddfp  v11,v5,v8,v14  // i2*wre
    vmaddfp  v12,v6,v8,v14  // r3*wre
    vmaddfp  v13,v7,v8,v14  // i3*wre
    lvx    v0, 0,r3  // r0 = z[0]
    lvx    v3,r3,r6  // i1 = z[o1+16]
    vmaddfp  v10,v5,v9,v10  // i2*wim
    vnmsubfp v11,v4,v9,v11  // r2*wim
    vnmsubfp v12,v7,v9,v12  // i3*wim
    vmaddfp  v13,v6,v9,v13  // r3*wim
    lvx    v1,r3,r9  // i0 = z[16]
    lvx    v2,r3,r5  // r1 = z[o1]
    BF     v12,v8,v12,v10
    BF     v13,v9,v11,v13
    BF     v0,v4,v0,v12
    BF     v3,v7,v3,v8
.if !\interleave
    stvx   v0, 0,r3
    stvx   v4,r3,r7
    stvx   v3,r3,r6
    stvx   v7,r3,r11
.endif
    BF     v1,v5,v1,v13
    BF     v2,v6,v2,v9
.if !\interleave
    stvx   v1,r3,r9
    stvx   v2,r3,r5
    stvx   v5,r3,r8
    stvx   v6,r3,r10
.else
    vmrghw v8,v0,v1
    vmrglw v9,v0,v1
    stvx   v8, 0,r3
    stvx   v9,r3,r9
    vmrghw v8,v2,v3
    vmrglw v9,v2,v3
    stvx   v8,r3,r5
    stvx   v9,r3,r6
    vmrghw v8,v4,v5
    vmrglw v9,v4,v5
    stvx   v8,r3,r7
    stvx   v9,r3,r8
    vmrghw v8,v6,v7
    vmrglw v9,v6,v7
    stvx   v8,r3,r10
    stvx   v9,r3,r11
.endif
    addi   r3,r3,32
    addi   r4,r4,16
    bdnz 1b
    sub    r3,r3,r5
    blr
.endm
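/*
 * A scalar sketch of the butterfly group that each loop iteration of
 * fft_pass applies (hedged illustration only; the vector code above
 * processes four complex values per step in the blocked layout, and
 * the real C counterpart is pass() in libavcodec/fft_template.c).
 * With twiddle w = wre + i*wim, t = z[o2]*conj(w), u = z[o3]*w:
 *
 *     z[o2] = z[0]  - (t + u);     // {t1+t5, t2+t6} in the comments above
 *     z[0]  = z[0]  + (t + u);
 *     z[o3] = z[o1] + I*(t - u);   // {-(t2-t6), t1-t5}
 *     z[o1] = z[o1] - I*(t - u);
 */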
#define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */

#define WORD_0  0x00,0x01,0x02,0x03
#define WORD_1  0x04,0x05,0x06,0x07
#define WORD_2  0x08,0x09,0x0a,0x0b
#define WORD_3  0x0c,0x0d,0x0e,0x0f
#define WORD_s0 0x10,0x11,0x12,0x13
#define WORD_s1 0x14,0x15,0x16,0x17
#define WORD_s2 0x18,0x19,0x1a,0x1b
#define WORD_s3 0x1c,0x1d,0x1e,0x1f

#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d

    .rodata
    .align 4
fft_data:
    .float  0, 0, 0, 0
    .float  1, 0.92387953, M_SQRT1_2, 0.38268343
    .float  0, 0.38268343, M_SQRT1_2, 0.92387953
    .float  -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2
    .float   M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
    vcprm(s0,3,2,1)
    vcprm(0,1,s2,s1)
    vcprm(2,3,s0,s3)
    vcprm(2,s3,3,s2)
    vcprm(0,1,s0,s1)
    vcprm(2,3,s2,s3)
    vcprm(2,3,0,1)
    vcprm(1,2,s3,s0)
    vcprm(0,3,s2,s1)
    vcprm(0,2,s1,s3)
    vcprm(1,3,s0,s2)

.macro lvm b, r, regs:vararg
    lvx    \r, 0, \b
    addi   \b, \b, 16
.ifnb \regs
    lvm    \b, \regs
.endif
.endm

.macro stvm b, r, regs:vararg
    stvx   \r, 0, \b
    addi   \b, \b, 16
.ifnb \regs
    stvm   \b, \regs
.endif
.endm

.macro fft_calc interleave
extfunc ff_fft_calc\interleave\()_altivec
    mflr   r0
    stp    r0, 2*PS(r1)
    stpu   r1, -(160+16*PS)(r1)
    get_got r11
    addi   r6, r1, 16*PS
    stvm   r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    mfvrsave r0
    stw    r0, 15*PS(r1)
    li     r6, 0xfffffffc
    mtvrsave r6

    movrel r6, fft_data, r11
    lvm    r6, v14, v15, v16, v17, v18, v19, v20, v21
    lvm    r6, v22, v23, v24, v25, v26, v27, v28, v29

    li     r9, 16
    movrel r12, X(ff_cos_tabs), r11

    movrel r6, fft_dispatch_tab\interleave\()_altivec, r11
    lwz    r3, 0(r3)            // nbits (first field of FFTContext)
    subi   r3, r3, 2            // table starts at nbits == 2 (fft4)
    slwi   r3, r3, 2+ARCH_PPC64 // scale index to pointer size
    lpx    r3, r3, r6
    mtctr  r3
    mr     r3, r4
    bctrl

    addi   r6, r1, 16*PS
    lvm    r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    lwz    r6, 15*PS(r1)
    mtvrsave r6
    lp     r1, 0(r1)
    lp     r0, 2*PS(r1)
    mtlr   r0
    blr
.endm

.macro DECL_FFT suffix, bits, n, n2, n4
fft\n\suffix\()_altivec:
    mflr  r0
    stp   r0,PS*(\bits-3)(r1)
    bl    fft\n2\()_altivec
    addi2 r3,\n*4
    bl    fft\n4\()_altivec
    addi2 r3,\n*2
    bl    fft\n4\()_altivec
    addi2 r3,\n*-6
    lp    r0,PS*(\bits-3)(r1)
    lp    r4,\bits*PS(r12)
    mtlr  r0
    li    r5,\n/16
    b     fft_pass\suffix\()_altivec
.endm
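/*
 * DECL_FFT chains the split-radix recursion. In scalar pseudo-C
 * (offsets in complex elements, n = FFT size; a hedged sketch of the
 * control flow above, not code that is assembled):
 *
 *     static void fft_n(FFTComplex *z)
 *     {
 *         fft_n2(z);            // first half, size n/2
 *         fft_n4(z + n/2);      // third quarter, size n/4
 *         fft_n4(z + 3*n/4);    // fourth quarter, size n/4
 *         pass(z, ff_cos_tabs[bits], n/16);  // combine with twiddles
 *     }
 *
 * The addi2 offsets are in bytes of the blocked layout: \n*4 bytes is
 * n/2 complex elements, a further \n*2 bytes reaches 3n/4, and \n*-6
 * rewinds to the start before the tail call to fft_pass.
 */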
.macro DECL_FFTS interleave, suffix
    .text
    def_fft4  \suffix
    def_fft8  \suffix
    def_fft16 \suffix
    PASS \interleave, \suffix
    DECL_FFT \suffix, 5,   32,   16,    8
    DECL_FFT \suffix, 6,   64,   32,   16
    DECL_FFT \suffix, 7,  128,   64,   32
    DECL_FFT \suffix, 8,  256,  128,   64
    DECL_FFT \suffix, 9,  512,  256,  128
    DECL_FFT \suffix,10, 1024,  512,  256
    DECL_FFT \suffix,11, 2048, 1024,  512
    DECL_FFT \suffix,12, 4096, 2048, 1024
    DECL_FFT \suffix,13, 8192, 4096, 2048
    DECL_FFT \suffix,14,16384, 8192, 4096
    DECL_FFT \suffix,15,32768,16384, 8192
    DECL_FFT \suffix,16,65536,32768,16384

    fft_calc \suffix

    .rodata
    .align 3
fft_dispatch_tab\suffix\()_altivec:
    PTR fft4\suffix\()_altivec
    PTR fft8\suffix\()_altivec
    PTR fft16\suffix\()_altivec
    PTR fft32\suffix\()_altivec
    PTR fft64\suffix\()_altivec
    PTR fft128\suffix\()_altivec
    PTR fft256\suffix\()_altivec
    PTR fft512\suffix\()_altivec
    PTR fft1024\suffix\()_altivec
    PTR fft2048\suffix\()_altivec
    PTR fft4096\suffix\()_altivec
    PTR fft8192\suffix\()_altivec
    PTR fft16384\suffix\()_altivec
    PTR fft32768\suffix\()_altivec
    PTR fft65536\suffix\()_altivec
.endm

DECL_FFTS 0
DECL_FFTS 1, _interleave
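/*
 * Usage sketch from the C side (hedged; the actual wiring is done by
 * the PPC FFT init code, which installs the _interleave variant as
 * FFTContext.fft_calc so results come back in normal FFTComplex order
 * rather than the blocked layout described at the top of this file):
 *
 *     FFTContext s;
 *     ff_fft_init(&s, 10, 0);   // 1 << 10 = 1024-point forward FFT
 *     s.fft_permute(&s, z);     // bit-reversal reordering first
 *     s.fft_calc(&s, z);        // dispatches into this file
 */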