/*
 * SIMD-optimized MP3 decoding functions
 * Copyright (c) 2010 Vitor Sessak
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"

/* For each SIMD flavor, declare the static C driver defined further below
 * (imdct36_blocks_*) and the external single-IMDCT assembly routine it calls
 * for the non-multiple-of-4 tail (ff_imdct36_float_*). */
#define DECL(CPU)\
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);

#if ARCH_X86_32
DECL(sse)
#endif
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)

/* External assembly routines that compute four 36-point IMDCTs at once on
 * interleaved inputs, using tmpbuf as scratch space. */
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);

/* Window tables for the 4-at-a-time IMDCT, filled in ff_mpadsp_init_x86()
 * from ff_mdct_win_float with each coefficient duplicated/interleaved in
 * groups of 4 (see the init loop below).  First index selects the
 * switch-point variant; second index is the block type. */
DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];

#if HAVE_6REGS && HAVE_SSE_INLINE

/* Scalar multiply-accumulate / multiply-subtract helpers used with SUM8. */
#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)

/* Accumulate 8 products into `sum`, reading both the window `w` and the
 * sample pointer `p` with a stride of 64 floats between taps. */
#define SUM8(op, sum, w, p) \
{ \
    op(sum, (w)[0 * 64], (p)[0 * 64]); \
    op(sum, (w)[1 * 64], (p)[1 * 64]); \
    op(sum, (w)[2 * 64], (p)[2 * 64]); \
    op(sum, (w)[3 * 64], (p)[3 * 64]); \
    op(sum, (w)[4 * 64], (p)[4 * 64]); \
    op(sum, (w)[5 * 64], (p)[5 * 64]); \
    op(sum, (w)[6 * 64], (p)[6 * 64]); \
    op(sum, (w)[7 * 64], (p)[7 * 64]); \
}

/*
 * Compute two negated windowed 8-tap sums for `len` consecutive outputs:
 *   sum1[i] = - sum_{k=0..7} buf[i + 64*k] * win1[i + 64*k]
 *   sum2[i] = - sum_{k=0..7} buf[i + 64*k] * win2[i + 16*k]
 * (strides read off the byte offsets passed to MULT: 256 bytes = 64 floats
 * for buf/win1, 64 bytes = 16 floats for win2).
 *
 * All pointers must be 16-byte aligned (movaps) and len a multiple of 4.
 * The single counter register %0 starts at -4*len bytes and counts up to 0,
 * serving both as the load/store offset and the loop condition ("jl 1b");
 * the base pointers are pre-advanced by `len` floats to compensate.
 */
static void apply_window(const float *buf, const float *win1,
                         const float *win2, float *sum1, float *sum2, int len)
{
    x86_reg count = - 4*len;
    const float *win1a = win1+len;
    const float *win2a = win2+len;
    const float *bufa = buf+len;
    float *sum1a = sum1+len;
    float *sum2a = sum2+len;

/* One tap: xmm1 = win1[a]*buf[a] subtracted from the sum1 accumulator
 * (xmm0); the same buf value (xmm2) is then scaled by win2[b] and
 * subtracted from the sum2 accumulator (xmm4). */
#define MULT(a, b) \
    "movaps " #a "(%1,%0), %%xmm1 \n\t" \
    "movaps " #a "(%3,%0), %%xmm2 \n\t" \
    "mulps %%xmm2, %%xmm1 \n\t" \
    "subps %%xmm1, %%xmm0 \n\t" \
    "mulps " #b "(%2,%0), %%xmm2 \n\t" \
    "subps %%xmm2, %%xmm4 \n\t" \

    __asm__ volatile(
            "1: \n\t"
            "xorps %%xmm0, %%xmm0 \n\t"
            "xorps %%xmm4, %%xmm4 \n\t"

            MULT(   0,   0)
            MULT( 256,  64)
            MULT( 512, 128)
            MULT( 768, 192)
            MULT(1024, 256)
            MULT(1280, 320)
            MULT(1536, 384)
            MULT(1792, 448)

            "movaps %%xmm0, (%4,%0) \n\t"
            "movaps %%xmm4, (%5,%0) \n\t"
            "add $16, %0 \n\t"
            "jl 1b \n\t"
            :"+&r"(count)
            :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
            );

#undef MULT
}

/*
 * SSE implementation of the MPEG audio synthesis window
 * (MPADSPContext.apply_window_float hook; `unused` matches the generic
 * prototype's dither-state parameter and is not used here).
 *
 * Produces 32 output samples at stride `incr` from the synthesis buffer
 * `in` and the 512-entry window `win`.  The incr == 1 case combines the
 * partial sums entirely in SSE (SUMS); other strides fall back to a scalar
 * combination loop.
 */
static void apply_window_mp3(float *in, float *win, int *unused, float *out,
                             int incr)
{
    LOCAL_ALIGNED_16(float, suma, [17]);
    LOCAL_ALIGNED_16(float, sumb, [17]);
    LOCAL_ALIGNED_16(float, sumc, [17]);
    LOCAL_ALIGNED_16(float, sumd, [17]);

    float sum;

    /* copy to avoid wrap: mirror the first 32 floats (2 x 64 bytes) of the
     * synthesis buffer to in+512 so apply_window() below can read past the
     * end without wrapping */
    __asm__ volatile(
            "movaps 0(%0), %%xmm0 \n\t" \
            "movaps 16(%0), %%xmm1 \n\t" \
            "movaps 32(%0), %%xmm2 \n\t" \
            "movaps 48(%0), %%xmm3 \n\t" \
            "movaps %%xmm0, 0(%1) \n\t" \
            "movaps %%xmm1, 16(%1) \n\t" \
            "movaps %%xmm2, 32(%1) \n\t" \
            "movaps %%xmm3, 48(%1) \n\t" \
            "movaps 64(%0), %%xmm0 \n\t" \
            "movaps 80(%0), %%xmm1 \n\t" \
            "movaps 96(%0), %%xmm2 \n\t" \
            "movaps 112(%0), %%xmm3 \n\t" \
            "movaps %%xmm0, 64(%1) \n\t" \
            "movaps %%xmm1, 80(%1) \n\t" \
            "movaps %%xmm2, 96(%1) \n\t" \
            "movaps %%xmm3, 112(%1) \n\t"
            ::"r"(in), "r"(in+512)
            :"memory"
            );

    apply_window(in + 16, win     , win + 512, suma, sumc, 16);
    apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);

    /* first output sample needs one extra 8-tap sum not covered above */
    SUM8(MACS, suma[0], win + 32, in + 48);

    /* zero the unused boundary entries so the reversed reads in SUMS /
     * the scalar loop are well-defined */
    sumc[ 0] = 0;
    sumb[16] = 0;
    sumd[16] = 0;

/* Combine 4 outputs at each end of the window: out1 = -suma - rev(sumd),
 * rev() done with shufps $0x1b (reverse the 4 lanes); out2 = sumb + rev(sumc).
 * movups for the rev operands because they start at odd float offsets. */
#define SUMS(suma, sumb, sumc, sumd, out1, out2) \
    "movups " #sumd "(%4), %%xmm0 \n\t" \
    "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
    "subps " #suma "(%1), %%xmm0 \n\t" \
    "movaps %%xmm0," #out1 "(%0) \n\t" \
\
    "movups " #sumc "(%3), %%xmm0 \n\t" \
    "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
    "addps " #sumb "(%2), %%xmm0 \n\t" \
    "movaps %%xmm0," #out2 "(%0) \n\t"

    if (incr == 1) {
        __asm__ volatile(
            SUMS( 0, 48,  4, 52,  0, 112)
            SUMS(16, 32, 20, 36, 16,  96)
            SUMS(32, 16, 36, 20, 32,  80)
            SUMS(48,  0, 52,  4, 48,  64)

            :"+&r"(out)
            :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
            :"memory"
            );
        out += 16*incr;
    } else {
        /* scalar equivalent of the SUMS block for arbitrary stride */
        int j;
        float *out2 = out + 32 * incr;
        out[0 ]  = -suma[ 0];
        out += incr;
        out2 -= incr;
        for(j=1;j<16;j++) {
            *out  = -suma[ j] + sumd[16-j];
            *out2 =  sumb[16-j] + sumc[ j];
            out  += incr;
            out2 -= incr;
        }
    }

    /* middle output sample (position 16), computed with a negated 8-tap sum */
    sum = 0;
    SUM8(MLSS, sum, win + 16 + 32, in + 32);
    *out = sum;
}

#endif /* HAVE_6REGS && HAVE_SSE_INLINE */

#if HAVE_YASM
/* Instantiate imdct36_blocks_<CPU1>(): run `count` 36-point IMDCTs.  Groups
 * of 4 use the interleaved ff_four_imdct36_float_<CPU2> routine with the
 * pre-interleaved mdct_win_sse tables; the remainder is done one block at a
 * time with ff_imdct36_float_<CPU1> and the plain ff_mdct_win_float windows.
 * At a switch point the first blocks use window 0 instead of block_type
 * (j < 4 in the vector loop vs. j < 2 in the scalar loop — the vector table
 * already encodes the per-block distinction in its interleaving). */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \
                               int count, int switch_point, int block_type) \
{ \
    int align_end = count - (count & 3); \
    int j; \
    for (j = 0; j < align_end; j+= 4) { \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \
        float *win = mdct_win_sse[switch_point && j < 4][block_type]; \
        /* apply window & overlap with previous buffer */ \
 \
        /* select window */ \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \
        in  += 4*18; \
        buf += 4*18; \
        out += 4; \
    } \
    for (; j < count; j++) { \
        /* apply window & overlap with previous buffer */ \
 \
        /* select window */ \
        int win_idx = (switch_point && j < 2) ? 0 : block_type; \
        float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \
 \
        ff_imdct36_float_ ## CPU1(out, buf, in, win); \
 \
        in  += 18; \
        buf++; \
        out++; \
    } \
}

#if HAVE_SSE
#if ARCH_X86_32
DECL_IMDCT_BLOCKS(sse,sse)
#endif
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_YASM */

/*
 * Build the interleaved SSE window tables and install the fastest available
 * x86 implementations into the MPADSPContext function pointers.  The table
 * is always initialized (cheap, data-only); the function pointers are only
 * overridden when the corresponding CPU feature is detected.
 */
av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    int i, j;
    for (j = 0; j < 4; j++) {
        for (i = 0; i < 40; i ++) {
            /* lanes alternate window j and its second-half counterpart j+4;
             * variant [1] forces window 0/4 into the first two lanes for the
             * blocks at a switch point */
            mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
            mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
            mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
            mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
            mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
        }
    }

#if HAVE_6REGS && HAVE_SSE_INLINE
    if (INLINE_SSE(cpu_flags)) {
        s->apply_window_float = apply_window_mp3;
    }
#endif /* HAVE_6REGS && HAVE_SSE_INLINE */

#if HAVE_YASM
#if HAVE_SSE
#if ARCH_X86_32
    if (EXTERNAL_SSE(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse;
    }
#endif
    /* later checks deliberately overwrite earlier ones: the newest supported
     * instruction set wins */
    if (EXTERNAL_SSE2(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse2;
    }
    if (EXTERNAL_SSE3(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse3;
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_ssse3;
    }
#endif
#if HAVE_AVX_EXTERNAL
    if (EXTERNAL_AVX(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_avx;
    }
#endif
#endif /* HAVE_YASM */
}