1/* 2 * FFT/MDCT transform with Extended 3DNow! optimizations 3 * Copyright (c) 2006-2008 Zuxy MENG Jie, Loren Merritt 4 * 5 * This file is part of Libav. 6 * 7 * Libav is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * Libav is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with Libav; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22#include "libavutil/x86_cpu.h" 23#include "libavcodec/dsputil.h" 24#include "fft.h" 25 26DECLARE_ALIGNED(8, static const unsigned int, m1m1)[2] = { 1U<<31, 1U<<31 }; 27 28#ifdef EMULATE_3DNOWEXT 29#define PSWAPD(s,d)\ 30 "movq "#s","#d"\n"\ 31 "psrlq $32,"#d"\n"\ 32 "punpckldq "#s","#d"\n" 33#define ff_fft_calc_3dn2 ff_fft_calc_3dn 34#define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn 35#define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn 36#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn 37#define ff_imdct_half_3dn2 ff_imdct_half_3dn 38#else 39#define PSWAPD(s,d) "pswapd "#s","#d"\n" 40#endif 41 42void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits); 43void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits); 44 45void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) 46{ 47 int n = 1<<s->nbits; 48 int i; 49 ff_fft_dispatch_interleave_3dn2(z, s->nbits); 50 __asm__ volatile("femms"); 51 if(n <= 8) 52 for(i=0; i<n; i+=2) 53 FFSWAP(FFTSample, z[i].im, z[i+1].re); 54} 55 56void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input) 57{ 58 x86_reg j, k; 59 long n = s->mdct_size; 60 long n2 = n >> 1; 61 long n4 = n >> 2; 62 long n8 = n >> 3; 63 const uint16_t *revtab = s->revtab; 64 const FFTSample *tcos = s->tcos; 65 const FFTSample *tsin = s->tsin; 66 const FFTSample *in1, *in2; 67 FFTComplex *z = (FFTComplex *)output; 68 69 /* pre rotation */ 70 in1 = input; 71 in2 = input + n2 - 1; 72#ifdef EMULATE_3DNOWEXT 73 __asm__ volatile("movd %0, %%mm7" ::"r"(1U<<31)); 74#endif 75 for(k = 0; k < n4; k++) { 76 // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it 77 __asm__ volatile( 78 "movd %0, %%mm0 \n" 79 "movd %2, %%mm1 \n" 80 "punpckldq %1, %%mm0 \n" 81 "punpckldq %3, %%mm1 \n" 82 "movq %%mm0, %%mm2 \n" 83 PSWAPD( %%mm1, %%mm3 ) 84 "pfmul %%mm1, %%mm0 \n" 85 "pfmul %%mm3, %%mm2 \n" 86#ifdef EMULATE_3DNOWEXT 87 "movq %%mm0, %%mm1 \n" 88 "punpckhdq %%mm2, %%mm0 \n" 89 "punpckldq %%mm2, %%mm1 \n" 90 "pxor %%mm7, %%mm0 \n" 91 "pfadd %%mm1, %%mm0 \n" 92#else 93 "pfpnacc %%mm2, %%mm0 \n" 94#endif 95 ::"m"(in2[-2*k]), "m"(in1[2*k]), 96 "m"(tcos[k]), "m"(tsin[k]) 97 ); 98 __asm__ volatile( 99 "movq %%mm0, %0 \n\t" 100 :"=m"(z[revtab[k]]) 101 ); 102 } 103 104 ff_fft_dispatch_3dn2(z, s->nbits); 105 106#define CMUL(j,mm0,mm1)\ 107 "movq (%2,"#j",2), %%mm6 \n"\ 108 "movq 8(%2,"#j",2), "#mm0"\n"\ 109 "movq %%mm6, "#mm1"\n"\ 110 "movq "#mm0",%%mm7 \n"\ 111 "pfmul (%3,"#j"), %%mm6 \n"\ 112 "pfmul (%4,"#j"), "#mm0"\n"\ 113 "pfmul (%4,"#j"), "#mm1"\n"\ 114 "pfmul (%3,"#j"), %%mm7 \n"\ 115 "pfsub %%mm6, "#mm0"\n"\ 116 "pfadd %%mm7, "#mm1"\n" 117 118 /* post rotation */ 119 j = -n2; 120 k = n2-8; 121 __asm__ volatile( 122 "1: \n" 123 CMUL(%0, %%mm0, %%mm1) 124 CMUL(%1, %%mm2, %%mm3) 125 "movd %%mm0, (%2,%0,2) \n" 126 "movd %%mm1,12(%2,%1,2) \n" 127 "movd %%mm2, (%2,%1,2) \n" 128 "movd %%mm3,12(%2,%0,2) \n" 129 "psrlq $32, %%mm0 \n" 130 "psrlq $32, %%mm1 \n" 131 "psrlq $32, %%mm2 \n" 132 "psrlq $32, %%mm3 \n" 133 "movd %%mm0, 8(%2,%0,2) \n" 134 "movd %%mm1, 4(%2,%1,2) \n" 135 "movd %%mm2, 8(%2,%1,2) \n" 136 "movd %%mm3, 4(%2,%0,2) \n" 137 "sub $8, %1 \n" 138 "add $8, %0 \n" 139 "jl 1b \n" 140 :"+r"(j), "+r"(k) 141 :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8) 142 :"memory" 143 ); 144 __asm__ volatile("femms"); 145} 146 147void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input) 148{ 149 x86_reg j, k; 150 long n = s->mdct_size; 151 long n4 = n >> 2; 152 153 ff_imdct_half_3dn2(s, output+n4, input); 154 155 j = -n; 156 k = n-8; 157 __asm__ volatile( 158 "movq %4, %%mm7 \n" 159 "1: \n" 160 PSWAPD((%2,%1), %%mm0) 161 PSWAPD((%3,%0), %%mm1) 162 "pxor %%mm7, %%mm0 \n" 163 "movq %%mm1, (%3,%1) \n" 164 "movq %%mm0, (%2,%0) \n" 165 "sub $8, %1 \n" 166 "add $8, %0 \n" 167 "jl 1b \n" 168 :"+r"(j), "+r"(k) 169 :"r"(output+n4), "r"(output+n4*3), 170 "m"(*m1m1) 171 ); 172 __asm__ volatile("femms"); 173} 174 175