1/* 2 * FFT/MDCT transform with Extended 3DNow! optimizations 3 * Copyright (c) 2006-2008 Zuxy MENG Jie, Loren Merritt 4 * 5 * This file is part of FFmpeg. 6 * 7 * FFmpeg is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * FFmpeg is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with FFmpeg; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22#include "libavutil/x86_cpu.h" 23#include "libavcodec/dsputil.h" 24 25DECLARE_ALIGNED_8(static const int, m1m1[2]) = { 1<<31, 1<<31 }; 26 27#ifdef EMULATE_3DNOWEXT 28#define PSWAPD(s,d)\ 29 "movq "#s","#d"\n"\ 30 "psrlq $32,"#d"\n"\ 31 "punpckldq "#s","#d"\n" 32#define ff_fft_calc_3dn2 ff_fft_calc_3dn 33#define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn 34#define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn 35#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn 36#define ff_imdct_half_3dn2 ff_imdct_half_3dn 37#else 38#define PSWAPD(s,d) "pswapd "#s","#d"\n" 39#endif 40 41void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits); 42void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits); 43 44void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) 45{ 46 int n = 1<<s->nbits; 47 int i; 48 ff_fft_dispatch_interleave_3dn2(z, s->nbits); 49 __asm__ volatile("femms"); 50 if(n <= 8) 51 for(i=0; i<n; i+=2) 52 FFSWAP(FFTSample, z[i].im, z[i+1].re); 53} 54 55void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input) 56{ 57 x86_reg j, k; 58 long n = 1 << s->nbits; 59 long n2 = n >> 1; 60 long n4 = n >> 2; 61 long n8 = n >> 3; 62 const uint16_t *revtab = s->fft.revtab; 63 const FFTSample *tcos = s->tcos; 64 const FFTSample *tsin = s->tsin; 65 const FFTSample *in1, *in2; 66 FFTComplex *z = (FFTComplex *)output; 67 68 /* pre rotation */ 69 in1 = input; 70 in2 = input + n2 - 1; 71#ifdef EMULATE_3DNOWEXT 72 __asm__ volatile("movd %0, %%mm7" ::"r"(1<<31)); 73#endif 74 for(k = 0; k < n4; k++) { 75 // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it 76 __asm__ volatile( 77 "movd %0, %%mm0 \n" 78 "movd %2, %%mm1 \n" 79 "punpckldq %1, %%mm0 \n" 80 "punpckldq %3, %%mm1 \n" 81 "movq %%mm0, %%mm2 \n" 82 PSWAPD( %%mm1, %%mm3 ) 83 "pfmul %%mm1, %%mm0 \n" 84 "pfmul %%mm3, %%mm2 \n" 85#ifdef EMULATE_3DNOWEXT 86 "movq %%mm0, %%mm1 \n" 87 "punpckhdq %%mm2, %%mm0 \n" 88 "punpckldq %%mm2, %%mm1 \n" 89 "pxor %%mm7, %%mm0 \n" 90 "pfadd %%mm1, %%mm0 \n" 91#else 92 "pfpnacc %%mm2, %%mm0 \n" 93#endif 94 ::"m"(in2[-2*k]), "m"(in1[2*k]), 95 "m"(tcos[k]), "m"(tsin[k]) 96 ); 97 __asm__ volatile( 98 "movq %%mm0, %0 \n\t" 99 :"=m"(z[revtab[k]]) 100 ); 101 } 102 103 ff_fft_dispatch_3dn2(z, s->fft.nbits); 104 105#define CMUL(j,mm0,mm1)\ 106 "movq (%2,"#j",2), %%mm6 \n"\ 107 "movq 8(%2,"#j",2), "#mm0"\n"\ 108 "movq %%mm6, "#mm1"\n"\ 109 "movq "#mm0",%%mm7 \n"\ 110 "pfmul (%3,"#j"), %%mm6 \n"\ 111 "pfmul (%4,"#j"), "#mm0"\n"\ 112 "pfmul (%4,"#j"), "#mm1"\n"\ 113 "pfmul (%3,"#j"), %%mm7 \n"\ 114 "pfsub %%mm6, "#mm0"\n"\ 115 "pfadd %%mm7, "#mm1"\n" 116 117 /* post rotation */ 118 j = -n2; 119 k = n2-8; 120 __asm__ volatile( 121 "1: \n" 122 CMUL(%0, %%mm0, %%mm1) 123 CMUL(%1, %%mm2, %%mm3) 124 "movd %%mm0, (%2,%0,2) \n" 125 "movd %%mm1,12(%2,%1,2) \n" 126 "movd %%mm2, (%2,%1,2) \n" 127 "movd %%mm3,12(%2,%0,2) \n" 128 "psrlq $32, %%mm0 \n" 129 "psrlq $32, %%mm1 \n" 130 "psrlq $32, %%mm2 \n" 131 "psrlq $32, %%mm3 \n" 132 "movd %%mm0, 8(%2,%0,2) \n" 133 "movd %%mm1, 4(%2,%1,2) \n" 134 "movd %%mm2, 8(%2,%1,2) \n" 135 "movd %%mm3, 4(%2,%0,2) \n" 136 "sub $8, %1 \n" 137 "add $8, %0 \n" 138 "jl 1b \n" 139 :"+r"(j), "+r"(k) 140 :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8) 141 :"memory" 142 ); 143 __asm__ volatile("femms"); 144} 145 146void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input) 147{ 148 x86_reg j, k; 149 long n = 1 << s->nbits; 150 long n4 = n >> 2; 151 152 ff_imdct_half_3dn2(s, output+n4, input); 153 154 j = -n; 155 k = n-8; 156 __asm__ volatile( 157 "movq %4, %%mm7 \n" 158 "1: \n" 159 PSWAPD((%2,%1), %%mm0) 160 PSWAPD((%3,%0), %%mm1) 161 "pxor %%mm7, %%mm0 \n" 162 "movq %%mm1, (%3,%1) \n" 163 "movq %%mm0, (%2,%0) \n" 164 "sub $8, %1 \n" 165 "add $8, %0 \n" 166 "jl 1b \n" 167 :"+r"(j), "+r"(k) 168 :"r"(output+n4), "r"(output+n4*3), 169 "m"(*m1m1) 170 ); 171 __asm__ volatile("femms"); 172} 173 174