/*
 * SIMD-optimized MP3 decoding functions
 * Copyright (c) 2010 Vitor Sessak
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"

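/* Prototypes for the per-CPU imdct36 block wrappers defined below and for
 * the single-column IMDCT36 routines they call, which are provided
 * externally (built on the HAVE_YASM paths further down). */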
#define DECL(CPU)\
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in,         \
                                   int count, int switch_point, int block_type);\
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);

#if ARCH_X86_32
DECL(sse)
#endif
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)

void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);

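/* Window tables with the coefficients interleaved four at a time for the
 * four-column IMDCT path; filled in ff_mpadsp_init_x86() from
 * ff_mdct_win_float. */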
DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];

#if HAVE_6REGS && HAVE_SSE_INLINE

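/* Scalar multiply-accumulate helpers (same shape as the macros used by the
 * C synthesis filter): SUM8 unrolls an 8-tap dot product over buffers
 * strided by 64 floats. */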
#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)

#define SUM8(op, sum, w, p)               \
{                                         \
    op(sum, (w)[0 * 64], (p)[0 * 64]);    \
    op(sum, (w)[1 * 64], (p)[1 * 64]);    \
    op(sum, (w)[2 * 64], (p)[2 * 64]);    \
    op(sum, (w)[3 * 64], (p)[3 * 64]);    \
    op(sum, (w)[4 * 64], (p)[4 * 64]);    \
    op(sum, (w)[5 * 64], (p)[5 * 64]);    \
    op(sum, (w)[6 * 64], (p)[6 * 64]);    \
    op(sum, (w)[7 * 64], (p)[7 * 64]);    \
}

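/* Compute, four samples per iteration, the negated 8-tap dot products of
 * buf with two window tables (win1 strided by 64 floats, win2 by 16) and
 * store the results in sum1 and sum2. len must be a multiple of 4 and all
 * pointers 16-byte aligned. */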
static void apply_window(const float *buf, const float *win1,
                         const float *win2, float *sum1, float *sum2, int len)
{
    x86_reg count = - 4*len;
    const float *win1a = win1+len;
    const float *win2a = win2+len;
    const float *bufa  = buf+len;
    float *sum1a = sum1+len;
    float *sum2a = sum2+len;

#define MULT(a, b)                                 \
    "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
    "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
    "mulps         %%xmm2, %%xmm1           \n\t"  \
    "subps         %%xmm1, %%xmm0           \n\t"  \
    "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
    "subps         %%xmm2, %%xmm4           \n\t"

    __asm__ volatile(
            "1:                                   \n\t"
            "xorps       %%xmm0, %%xmm0           \n\t"
            "xorps       %%xmm4, %%xmm4           \n\t"

            MULT(   0,   0)
            MULT( 256,  64)
            MULT( 512, 128)
            MULT( 768, 192)
            MULT(1024, 256)
            MULT(1280, 320)
            MULT(1536, 384)
            MULT(1792, 448)

            "movaps      %%xmm0, (%4,%0)          \n\t"
            "movaps      %%xmm4, (%5,%0)          \n\t"
            "add            $16,  %0              \n\t"
            "jl              1b                   \n\t"
            :"+&r"(count)
            :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
            );

#undef MULT
}

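/* SSE implementation of the synthesis windowing step: produces 32 output
 * samples (stride incr) from the synthesis buffer 'in' and the window
 * 'win'. Bound to s->apply_window_float in ff_mpadsp_init_x86(). */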
static void apply_window_mp3(float *in, float *win, int *unused, float *out,
                             int incr)
{
    LOCAL_ALIGNED_16(float, suma, [17]);
    LOCAL_ALIGNED_16(float, sumb, [17]);
    LOCAL_ALIGNED_16(float, sumc, [17]);
    LOCAL_ALIGNED_16(float, sumd, [17]);

    float sum;

    /* copy to avoid wrap */
    __asm__ volatile(
            "movaps    0(%0), %%xmm0   \n\t"
            "movaps   16(%0), %%xmm1   \n\t"
            "movaps   32(%0), %%xmm2   \n\t"
            "movaps   48(%0), %%xmm3   \n\t"
            "movaps   %%xmm0,   0(%1)  \n\t"
            "movaps   %%xmm1,  16(%1)  \n\t"
            "movaps   %%xmm2,  32(%1)  \n\t"
            "movaps   %%xmm3,  48(%1)  \n\t"
            "movaps   64(%0), %%xmm0   \n\t"
            "movaps   80(%0), %%xmm1   \n\t"
            "movaps   96(%0), %%xmm2   \n\t"
            "movaps  112(%0), %%xmm3   \n\t"
            "movaps   %%xmm0,  64(%1)  \n\t"
            "movaps   %%xmm1,  80(%1)  \n\t"
            "movaps   %%xmm2,  96(%1)  \n\t"
            "movaps   %%xmm3, 112(%1)  \n\t"
            ::"r"(in), "r"(in+512)
            :"memory"
            );

    apply_window(in + 16, win     , win + 512, suma, sumc, 16);
    apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);

    SUM8(MACS, suma[0], win + 32, in + 48);

    sumc[ 0] = 0;
    sumb[16] = 0;
    sumd[16] = 0;

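/* SUMS stores one group of four output samples at two symmetric positions:
 * sumd and sumc are reversed with shufps $0x1b, then suma is subtracted
 * from the first and sumb added to the second. The SSE path below needs
 * contiguous output (incr == 1); other strides take the scalar loop. */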
#define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
            "movups " #sumd "(%4),       %%xmm0          \n\t" \
            "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
            "subps  " #suma "(%1),       %%xmm0          \n\t" \
            "movaps        %%xmm0," #out1 "(%0)          \n\t" \
\
            "movups " #sumc "(%3),       %%xmm0          \n\t" \
            "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
            "addps  " #sumb "(%2),       %%xmm0          \n\t" \
            "movaps        %%xmm0," #out2 "(%0)          \n\t"

    if (incr == 1) {
        __asm__ volatile(
            SUMS( 0, 48,  4, 52,  0, 112)
            SUMS(16, 32, 20, 36, 16,  96)
            SUMS(32, 16, 36, 20, 32,  80)
            SUMS(48,  0, 52,  4, 48,  64)

            :"+&r"(out)
            :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
            :"memory"
            );
        out += 16*incr;
    } else {
        int j;
        float *out2 = out + 32 * incr;
        out[0  ]  = -suma[   0];
        out += incr;
        out2 -= incr;
        for (j = 1; j < 16; j++) {
            *out  = -suma[   j] + sumd[16-j];
            *out2 =  sumb[16-j] + sumc[   j];
            out  += incr;
            out2 -= incr;
        }
    }

    sum = 0;
    SUM8(MLSS, sum, win + 16 + 32, in + 32);
    *out = sum;
}

#endif /* HAVE_6REGS && HAVE_SSE_INLINE */

#if HAVE_YASM
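/* Per-CPU wrappers around the external IMDCT36 assembly: groups of four
 * columns go through the four-wide routine with the interleaved
 * mdct_win_sse tables, the remaining columns through the single-column
 * routine with ff_mdct_win_float. */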
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
                               int count, int switch_point, int block_type) \
{                                                                           \
    int align_end = count - (count & 3);                                    \
    int j;                                                                  \
    for (j = 0; j < align_end; j+= 4) {                                     \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                            \
        float *win = mdct_win_sse[switch_point && j < 4][block_type];       \
        /* apply window & overlap with previous buffer */                   \
                                                                            \
        /* select window */                                                 \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);          \
        in      += 4*18;                                                    \
        buf     += 4*18;                                                    \
        out     += 4;                                                       \
    }                                                                       \
    for (; j < count; j++) {                                                \
        /* apply window & overlap with previous buffer */                   \
                                                                            \
        /* select window */                                                 \
        int win_idx = (switch_point && j < 2) ? 0 : block_type;             \
        float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];           \
                                                                            \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                       \
                                                                            \
        in  += 18;                                                          \
        buf++;                                                              \
        out++;                                                              \
    }                                                                       \
}

#if HAVE_SSE
#if ARCH_X86_32
DECL_IMDCT_BLOCKS(sse,sse)
#endif
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_YASM */

av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    int i, j;
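    /* Build the interleaved window tables used by the four-wide IMDCT:
     * [0][bt] repeats the (ff_mdct_win_float[bt], ff_mdct_win_float[bt + 4])
     * pair for every coefficient, while [1][bt] is the switch-point variant
     * whose first two lanes take the type 0 windows, matching the
     * "switch_point && j < 2" rule of the scalar loop above. */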
    for (j = 0; j < 4; j++) {
        for (i = 0; i < 40; i++) {
            mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
            mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
            mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
            mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
            mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
        }
    }

#if HAVE_6REGS && HAVE_SSE_INLINE
    if (INLINE_SSE(cpu_flags)) {
        s->apply_window_float = apply_window_mp3;
    }
#endif /* HAVE_6REGS && HAVE_SSE_INLINE */

#if HAVE_YASM
#if HAVE_SSE
#if ARCH_X86_32
    if (EXTERNAL_SSE(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse;
    }
#endif
    if (EXTERNAL_SSE2(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse2;
    }
    if (EXTERNAL_SSE3(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse3;
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_ssse3;
    }
#endif
#if HAVE_AVX_EXTERNAL
    if (EXTERNAL_AVX(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_avx;
    }
#endif
#endif /* HAVE_YASM */
}