1/* 2 * FFT/IFFT transforms 3 * Copyright (c) 2008 Loren Merritt 4 * Copyright (c) 2002 Fabrice Bellard 5 * Partly based on libdjbfft by D. J. Bernstein 6 * 7 * This file is part of FFmpeg. 8 * 9 * FFmpeg is free software; you can redistribute it and/or 10 * modify it under the terms of the GNU Lesser General Public 11 * License as published by the Free Software Foundation; either 12 * version 2.1 of the License, or (at your option) any later version. 13 * 14 * FFmpeg is distributed in the hope that it will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * Lesser General Public License for more details. 18 * 19 * You should have received a copy of the GNU Lesser General Public 20 * License along with FFmpeg; if not, write to the Free Software 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 22 */ 23 24/** 25 * @file 26 * FFT/IFFT transforms. 27 */ 28 29#include <stdlib.h> 30#include <string.h> 31#include "libavutil/mathematics.h" 32#include "fft.h" 33 34/* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */ 35#if !CONFIG_HARDCODED_TABLES 36COSTABLE(16); 37COSTABLE(32); 38COSTABLE(64); 39COSTABLE(128); 40COSTABLE(256); 41COSTABLE(512); 42COSTABLE(1024); 43COSTABLE(2048); 44COSTABLE(4096); 45COSTABLE(8192); 46COSTABLE(16384); 47COSTABLE(32768); 48COSTABLE(65536); 49#endif 50COSTABLE_CONST FFTSample * const ff_cos_tabs[] = { 51 NULL, NULL, NULL, NULL, 52 ff_cos_16, ff_cos_32, ff_cos_64, ff_cos_128, ff_cos_256, ff_cos_512, ff_cos_1024, 53 ff_cos_2048, ff_cos_4096, ff_cos_8192, ff_cos_16384, ff_cos_32768, ff_cos_65536, 54}; 55 56static int split_radix_permutation(int i, int n, int inverse) 57{ 58 int m; 59 if(n <= 2) return i&1; 60 m = n >> 1; 61 if(!(i&m)) return split_radix_permutation(i, m, inverse)*2; 62 m >>= 1; 63 if(inverse == !(i&m)) return split_radix_permutation(i, m, inverse)*4 + 1; 64 else return split_radix_permutation(i, m, inverse)*4 - 1; 65} 66 67av_cold void ff_init_ff_cos_tabs(int index) 68{ 69#if !CONFIG_HARDCODED_TABLES 70 int i; 71 int m = 1<<index; 72 double freq = 2*M_PI/m; 73 FFTSample *tab = ff_cos_tabs[index]; 74 for(i=0; i<=m/4; i++) 75 tab[i] = cos(i*freq); 76 for(i=1; i<m/4; i++) 77 tab[m/2-i] = tab[i]; 78#endif 79} 80 81av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) 82{ 83 int i, j, m, n; 84 float alpha, c1, s1, s2; 85 int av_unused has_vectors; 86 87 if (nbits < 2 || nbits > 16) 88 goto fail; 89 s->nbits = nbits; 90 n = 1 << nbits; 91 92 s->tmp_buf = NULL; 93 s->exptab = av_malloc((n / 2) * sizeof(FFTComplex)); 94 if (!s->exptab) 95 goto fail; 96 s->revtab = av_malloc(n * sizeof(uint16_t)); 97 if (!s->revtab) 98 goto fail; 99 s->inverse = inverse; 100 101 s2 = inverse ? 1.0 : -1.0; 102 103 s->fft_permute = ff_fft_permute_c; 104 s->fft_calc = ff_fft_calc_c; 105#if CONFIG_MDCT 106 s->imdct_calc = ff_imdct_calc_c; 107 s->imdct_half = ff_imdct_half_c; 108 s->mdct_calc = ff_mdct_calc_c; 109#endif 110 s->exptab1 = NULL; 111 s->split_radix = 1; 112 113 if (ARCH_ARM) ff_fft_init_arm(s); 114 if (HAVE_ALTIVEC) ff_fft_init_altivec(s); 115 if (HAVE_MMX) ff_fft_init_mmx(s); 116 117 if (s->split_radix) { 118 for(j=4; j<=nbits; j++) { 119 ff_init_ff_cos_tabs(j); 120 } 121 for(i=0; i<n; i++) 122 s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i; 123 s->tmp_buf = av_malloc(n * sizeof(FFTComplex)); 124 } else { 125 int np, nblocks, np2, l; 126 FFTComplex *q; 127 128 for(i=0; i<(n/2); i++) { 129 alpha = 2 * M_PI * (float)i / (float)n; 130 c1 = cos(alpha); 131 s1 = sin(alpha) * s2; 132 s->exptab[i].re = c1; 133 s->exptab[i].im = s1; 134 } 135 136 np = 1 << nbits; 137 nblocks = np >> 3; 138 np2 = np >> 1; 139 s->exptab1 = av_malloc(np * 2 * sizeof(FFTComplex)); 140 if (!s->exptab1) 141 goto fail; 142 q = s->exptab1; 143 do { 144 for(l = 0; l < np2; l += 2 * nblocks) { 145 *q++ = s->exptab[l]; 146 *q++ = s->exptab[l + nblocks]; 147 148 q->re = -s->exptab[l].im; 149 q->im = s->exptab[l].re; 150 q++; 151 q->re = -s->exptab[l + nblocks].im; 152 q->im = s->exptab[l + nblocks].re; 153 q++; 154 } 155 nblocks = nblocks >> 1; 156 } while (nblocks != 0); 157 av_freep(&s->exptab); 158 159 /* compute bit reverse table */ 160 for(i=0;i<n;i++) { 161 m=0; 162 for(j=0;j<nbits;j++) { 163 m |= ((i >> j) & 1) << (nbits-j-1); 164 } 165 s->revtab[i]=m; 166 } 167 } 168 169 return 0; 170 fail: 171 av_freep(&s->revtab); 172 av_freep(&s->exptab); 173 av_freep(&s->exptab1); 174 av_freep(&s->tmp_buf); 175 return -1; 176} 177 178void ff_fft_permute_c(FFTContext *s, FFTComplex *z) 179{ 180 int j, k, np; 181 FFTComplex tmp; 182 const uint16_t *revtab = s->revtab; 183 np = 1 << s->nbits; 184 185 if (s->tmp_buf) { 186 /* TODO: handle split-radix permute in a more optimal way, probably in-place */ 187 for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j]; 188 memcpy(z, s->tmp_buf, np * sizeof(FFTComplex)); 189 return; 190 } 191 192 /* reverse */ 193 for(j=0;j<np;j++) { 194 k = revtab[j]; 195 if (k < j) { 196 tmp = z[k]; 197 z[k] = z[j]; 198 z[j] = tmp; 199 } 200 } 201} 202 203av_cold void ff_fft_end(FFTContext *s) 204{ 205 av_freep(&s->revtab); 206 av_freep(&s->exptab); 207 av_freep(&s->exptab1); 208 av_freep(&s->tmp_buf); 209} 210 211#define sqrthalf (float)M_SQRT1_2 212 213#define BF(x,y,a,b) {\ 214 x = a - b;\ 215 y = a + b;\ 216} 217 218#define BUTTERFLIES(a0,a1,a2,a3) {\ 219 BF(t3, t5, t5, t1);\ 220 BF(a2.re, a0.re, a0.re, t5);\ 221 BF(a3.im, a1.im, a1.im, t3);\ 222 BF(t4, t6, t2, t6);\ 223 BF(a3.re, a1.re, a1.re, t4);\ 224 BF(a2.im, a0.im, a0.im, t6);\ 225} 226 227// force loading all the inputs before storing any. 228// this is slightly slower for small data, but avoids store->load aliasing 229// for addresses separated by large powers of 2. 230#define BUTTERFLIES_BIG(a0,a1,a2,a3) {\ 231 FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\ 232 BF(t3, t5, t5, t1);\ 233 BF(a2.re, a0.re, r0, t5);\ 234 BF(a3.im, a1.im, i1, t3);\ 235 BF(t4, t6, t2, t6);\ 236 BF(a3.re, a1.re, r1, t4);\ 237 BF(a2.im, a0.im, i0, t6);\ 238} 239 240#define TRANSFORM(a0,a1,a2,a3,wre,wim) {\ 241 t1 = a2.re * wre + a2.im * wim;\ 242 t2 = a2.im * wre - a2.re * wim;\ 243 t5 = a3.re * wre - a3.im * wim;\ 244 t6 = a3.im * wre + a3.re * wim;\ 245 BUTTERFLIES(a0,a1,a2,a3)\ 246} 247 248#define TRANSFORM_ZERO(a0,a1,a2,a3) {\ 249 t1 = a2.re;\ 250 t2 = a2.im;\ 251 t5 = a3.re;\ 252 t6 = a3.im;\ 253 BUTTERFLIES(a0,a1,a2,a3)\ 254} 255 256/* z[0...8n-1], w[1...2n-1] */ 257#define PASS(name)\ 258static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\ 259{\ 260 FFTSample t1, t2, t3, t4, t5, t6;\ 261 int o1 = 2*n;\ 262 int o2 = 4*n;\ 263 int o3 = 6*n;\ 264 const FFTSample *wim = wre+o1;\ 265 n--;\ 266\ 267 TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\ 268 TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ 269 do {\ 270 z += 2;\ 271 wre += 2;\ 272 wim -= 2;\ 273 TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\ 274 TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ 275 } while(--n);\ 276} 277 278PASS(pass) 279#undef BUTTERFLIES 280#define BUTTERFLIES BUTTERFLIES_BIG 281PASS(pass_big) 282 283#define DECL_FFT(n,n2,n4)\ 284static void fft##n(FFTComplex *z)\ 285{\ 286 fft##n2(z);\ 287 fft##n4(z+n4*2);\ 288 fft##n4(z+n4*3);\ 289 pass(z,ff_cos_##n,n4/2);\ 290} 291 292static void fft4(FFTComplex *z) 293{ 294 FFTSample t1, t2, t3, t4, t5, t6, t7, t8; 295 296 BF(t3, t1, z[0].re, z[1].re); 297 BF(t8, t6, z[3].re, z[2].re); 298 BF(z[2].re, z[0].re, t1, t6); 299 BF(t4, t2, z[0].im, z[1].im); 300 BF(t7, t5, z[2].im, z[3].im); 301 BF(z[3].im, z[1].im, t4, t8); 302 BF(z[3].re, z[1].re, t3, t7); 303 BF(z[2].im, z[0].im, t2, t5); 304} 305 306static void fft8(FFTComplex *z) 307{ 308 FFTSample t1, t2, t3, t4, t5, t6, t7, t8; 309 310 fft4(z); 311 312 BF(t1, z[5].re, z[4].re, -z[5].re); 313 BF(t2, z[5].im, z[4].im, -z[5].im); 314 BF(t3, z[7].re, z[6].re, -z[7].re); 315 BF(t4, z[7].im, z[6].im, -z[7].im); 316 BF(t8, t1, t3, t1); 317 BF(t7, t2, t2, t4); 318 BF(z[4].re, z[0].re, z[0].re, t1); 319 BF(z[4].im, z[0].im, z[0].im, t2); 320 BF(z[6].re, z[2].re, z[2].re, t7); 321 BF(z[6].im, z[2].im, z[2].im, t8); 322 323 TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf); 324} 325 326#if !CONFIG_SMALL 327static void fft16(FFTComplex *z) 328{ 329 FFTSample t1, t2, t3, t4, t5, t6; 330 331 fft8(z); 332 fft4(z+8); 333 fft4(z+12); 334 335 TRANSFORM_ZERO(z[0],z[4],z[8],z[12]); 336 TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf); 337 TRANSFORM(z[1],z[5],z[9],z[13],ff_cos_16[1],ff_cos_16[3]); 338 TRANSFORM(z[3],z[7],z[11],z[15],ff_cos_16[3],ff_cos_16[1]); 339} 340#else 341DECL_FFT(16,8,4) 342#endif 343DECL_FFT(32,16,8) 344DECL_FFT(64,32,16) 345DECL_FFT(128,64,32) 346DECL_FFT(256,128,64) 347DECL_FFT(512,256,128) 348#if !CONFIG_SMALL 349#define pass pass_big 350#endif 351DECL_FFT(1024,512,256) 352DECL_FFT(2048,1024,512) 353DECL_FFT(4096,2048,1024) 354DECL_FFT(8192,4096,2048) 355DECL_FFT(16384,8192,4096) 356DECL_FFT(32768,16384,8192) 357DECL_FFT(65536,32768,16384) 358 359static void (* const fft_dispatch[])(FFTComplex*) = { 360 fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024, 361 fft2048, fft4096, fft8192, fft16384, fft32768, fft65536, 362}; 363 364void ff_fft_calc_c(FFTContext *s, FFTComplex *z) 365{ 366 fft_dispatch[s->nbits-2](z); 367} 368 369