1/* 2 * MMX optimized MP3 decoding functions 3 * Copyright (c) 2010 Vitor Sessak 4 * 5 * This file is part of Libav. 6 * 7 * Libav is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * Libav is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with Libav; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22#include "libavutil/cpu.h" 23#include "libavutil/x86_cpu.h" 24#include "libavcodec/dsputil.h" 25#include "libavcodec/mpegaudiodsp.h" 26 27void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win); 28void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win); 29void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win); 30void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win); 31void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win); 32void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win, 33 float *tmpbuf); 34void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, 35 float *tmpbuf); 36 37DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40]; 38 39#define MACS(rt, ra, rb) rt+=(ra)*(rb) 40#define MLSS(rt, ra, rb) rt-=(ra)*(rb) 41 42#define SUM8(op, sum, w, p) \ 43{ \ 44 op(sum, (w)[0 * 64], (p)[0 * 64]); \ 45 op(sum, (w)[1 * 64], (p)[1 * 64]); \ 46 op(sum, (w)[2 * 64], (p)[2 * 64]); \ 47 op(sum, (w)[3 * 64], (p)[3 * 64]); \ 48 op(sum, (w)[4 * 64], (p)[4 * 64]); \ 49 op(sum, (w)[5 * 64], (p)[5 * 64]); \ 50 op(sum, (w)[6 * 64], (p)[6 * 64]); \ 51 op(sum, (w)[7 * 64], (p)[7 * 64]); \ 52} 53 54static void apply_window(const float *buf, const float *win1, 55 const float *win2, float *sum1, float *sum2, int len) 56{ 57 x86_reg count = - 4*len; 58 const float *win1a = win1+len; 59 const float *win2a = win2+len; 60 const float *bufa = buf+len; 61 float *sum1a = sum1+len; 62 float *sum2a = sum2+len; 63 64 65#define MULT(a, b) \ 66 "movaps " #a "(%1,%0), %%xmm1 \n\t" \ 67 "movaps " #a "(%3,%0), %%xmm2 \n\t" \ 68 "mulps %%xmm2, %%xmm1 \n\t" \ 69 "subps %%xmm1, %%xmm0 \n\t" \ 70 "mulps " #b "(%2,%0), %%xmm2 \n\t" \ 71 "subps %%xmm2, %%xmm4 \n\t" \ 72 73 __asm__ volatile( 74 "1: \n\t" 75 "xorps %%xmm0, %%xmm0 \n\t" 76 "xorps %%xmm4, %%xmm4 \n\t" 77 78 MULT( 0, 0) 79 MULT( 256, 64) 80 MULT( 512, 128) 81 MULT( 768, 192) 82 MULT(1024, 256) 83 MULT(1280, 320) 84 MULT(1536, 384) 85 MULT(1792, 448) 86 87 "movaps %%xmm0, (%4,%0) \n\t" 88 "movaps %%xmm4, (%5,%0) \n\t" 89 "add $16, %0 \n\t" 90 "jl 1b \n\t" 91 :"+&r"(count) 92 :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a) 93 ); 94 95#undef MULT 96} 97 98static void apply_window_mp3(float *in, float *win, int *unused, float *out, 99 int incr) 100{ 101 LOCAL_ALIGNED_16(float, suma, [17]); 102 LOCAL_ALIGNED_16(float, sumb, [17]); 103 LOCAL_ALIGNED_16(float, sumc, [17]); 104 LOCAL_ALIGNED_16(float, sumd, [17]); 105 106 float sum; 107 108 /* copy to avoid wrap */ 109 memcpy(in + 512, in, 32 * sizeof(*in)); 110 111 apply_window(in + 16, win , win + 512, suma, sumc, 16); 112 apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16); 113 114 SUM8(MACS, suma[0], win + 32, in + 48); 115 116 sumc[ 0] = 0; 117 sumb[16] = 0; 118 sumd[16] = 0; 119 120#define SUMS(suma, sumb, sumc, sumd, out1, out2) \ 121 "movups " #sumd "(%4), %%xmm0 \n\t" \ 122 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ 123 "subps " #suma "(%1), %%xmm0 \n\t" \ 124 "movaps %%xmm0," #out1 "(%0) \n\t" \ 125\ 126 "movups " #sumc "(%3), %%xmm0 \n\t" \ 127 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ 128 "addps " #sumb "(%2), %%xmm0 \n\t" \ 129 "movaps %%xmm0," #out2 "(%0) \n\t" 130 131 if (incr == 1) { 132 __asm__ volatile( 133 SUMS( 0, 48, 4, 52, 0, 112) 134 SUMS(16, 32, 20, 36, 16, 96) 135 SUMS(32, 16, 36, 20, 32, 80) 136 SUMS(48, 0, 52, 4, 48, 64) 137 138 :"+&r"(out) 139 :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0]) 140 :"memory" 141 ); 142 out += 16*incr; 143 } else { 144 int j; 145 float *out2 = out + 32 * incr; 146 out[0 ] = -suma[ 0]; 147 out += incr; 148 out2 -= incr; 149 for(j=1;j<16;j++) { 150 *out = -suma[ j] + sumd[16-j]; 151 *out2 = sumb[16-j] + sumc[ j]; 152 out += incr; 153 out2 -= incr; 154 } 155 } 156 157 sum = 0; 158 SUM8(MLSS, sum, win + 16 + 32, in + 32); 159 *out = sum; 160} 161 162 163#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \ 164static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \ 165 int count, int switch_point, int block_type) \ 166{ \ 167 int align_end = count - (count & 3); \ 168 int j; \ 169 for (j = 0; j < align_end; j+= 4) { \ 170 LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \ 171 float *win = mdct_win_sse[switch_point && j < 4][block_type]; \ 172 /* apply window & overlap with previous buffer */ \ 173 \ 174 /* select window */ \ 175 ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \ 176 in += 4*18; \ 177 buf += 4*18; \ 178 out += 4; \ 179 } \ 180 for (; j < count; j++) { \ 181 /* apply window & overlap with previous buffer */ \ 182 \ 183 /* select window */ \ 184 int win_idx = (switch_point && j < 2) ? 0 : block_type; \ 185 float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \ 186 \ 187 ff_imdct36_float_ ## CPU1(out, buf, in, win); \ 188 \ 189 in += 18; \ 190 buf++; \ 191 out++; \ 192 } \ 193} 194 195DECL_IMDCT_BLOCKS(sse,sse) 196DECL_IMDCT_BLOCKS(sse2,sse) 197DECL_IMDCT_BLOCKS(sse3,sse) 198DECL_IMDCT_BLOCKS(ssse3,sse) 199DECL_IMDCT_BLOCKS(avx,avx) 200 201void ff_mpadsp_init_mmx(MPADSPContext *s) 202{ 203 int mm_flags = av_get_cpu_flags(); 204 205 int i, j; 206 for (j = 0; j < 4; j++) { 207 for (i = 0; i < 40; i ++) { 208 mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i]; 209 mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i]; 210 mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i]; 211 mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; 212 mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i]; 213 mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i]; 214 mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i]; 215 mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; 216 } 217 } 218 219 if (mm_flags & AV_CPU_FLAG_SSE2) { 220 s->apply_window_float = apply_window_mp3; 221 } 222#if HAVE_YASM 223 if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) { 224 s->imdct36_blocks_float = imdct36_blocks_avx; 225#if HAVE_SSE 226 } else if (mm_flags & AV_CPU_FLAG_SSSE3) { 227 s->imdct36_blocks_float = imdct36_blocks_ssse3; 228 } else if (mm_flags & AV_CPU_FLAG_SSE3) { 229 s->imdct36_blocks_float = imdct36_blocks_sse3; 230 } else if (mm_flags & AV_CPU_FLAG_SSE2) { 231 s->imdct36_blocks_float = imdct36_blocks_sse2; 232 } else if (mm_flags & AV_CPU_FLAG_SSE) { 233 s->imdct36_blocks_float = imdct36_blocks_sse; 234#endif /* HAVE_SSE */ 235 } 236#endif /* HAVE_YASM */ 237} 238