/*
 * MMX optimized MP3 decoding functions
 * Copyright (c) 2010 Vitor Sessak
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <string.h>

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegaudiodsp.h"

/*
 * IMDCT36 kernels defined outside this file.  They are only reached through
 * the imdct36_blocks_* wrappers generated by DECL_IMDCT_BLOCKS().
 *
 * ff_imdct36_float_*      is called once per block by the wrappers.
 * ff_four_imdct36_float_* is called once per group of four blocks and also
 *                         receives a scratch buffer (the wrappers pass a
 *                         16-byte aligned array of 1024 floats).
 *
 * NOTE(review): out/buf/in/win are presumably the output samples, the
 * overlap buffer, the input coefficients and the window table -- confirm
 * against the assembly implementation.
 */
void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win);
void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win);
void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win);
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);

37DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
38
/* Multiply-accumulate: rt += ra * rb. */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
/* Multiply-subtract: rt -= ra * rb. */
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply op(sum, w[64 * k], p[64 * k]) for k = 0..7, i.e. fold eight products
 * taken at a stride of 64 elements into sum.  op is MACS or MLSS.
 *
 * Wrapped in do { } while (0) so that "SUM8(...);" is a single statement and
 * can be used safely as the body of an if/else.
 */
#define SUM8(op, sum, w, p)               \
do {                                      \
    op(sum, (w)[0 * 64], (p)[0 * 64]);    \
    op(sum, (w)[1 * 64], (p)[1 * 64]);    \
    op(sum, (w)[2 * 64], (p)[2 * 64]);    \
    op(sum, (w)[3 * 64], (p)[3 * 64]);    \
    op(sum, (w)[4 * 64], (p)[4 * 64]);    \
    op(sum, (w)[5 * 64], (p)[5 * 64]);    \
    op(sum, (w)[6 * 64], (p)[6 * 64]);    \
    op(sum, (w)[7 * 64], (p)[7 * 64]);    \
} while (0)

54static void apply_window(const float *buf, const float *win1,
55                         const float *win2, float *sum1, float *sum2, int len)
56{
57    x86_reg count = - 4*len;
58    const float *win1a = win1+len;
59    const float *win2a = win2+len;
60    const float *bufa  = buf+len;
61    float *sum1a = sum1+len;
62    float *sum2a = sum2+len;
63
64
65#define MULT(a, b)                                 \
66    "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
67    "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
68    "mulps         %%xmm2, %%xmm1           \n\t"  \
69    "subps         %%xmm1, %%xmm0           \n\t"  \
70    "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
71    "subps         %%xmm2, %%xmm4           \n\t"  \
72
73    __asm__ volatile(
74            "1:                                   \n\t"
75            "xorps       %%xmm0, %%xmm0           \n\t"
76            "xorps       %%xmm4, %%xmm4           \n\t"
77
78            MULT(   0,   0)
79            MULT( 256,  64)
80            MULT( 512, 128)
81            MULT( 768, 192)
82            MULT(1024, 256)
83            MULT(1280, 320)
84            MULT(1536, 384)
85            MULT(1792, 448)
86
87            "movaps      %%xmm0, (%4,%0)          \n\t"
88            "movaps      %%xmm4, (%5,%0)          \n\t"
89            "add            $16,  %0              \n\t"
90            "jl              1b                   \n\t"
91            :"+&r"(count)
92            :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
93            );
94
95#undef MULT
96}
97
98static void apply_window_mp3(float *in, float *win, int *unused, float *out,
99                             int incr)
100{
101    LOCAL_ALIGNED_16(float, suma, [17]);
102    LOCAL_ALIGNED_16(float, sumb, [17]);
103    LOCAL_ALIGNED_16(float, sumc, [17]);
104    LOCAL_ALIGNED_16(float, sumd, [17]);
105
106    float sum;
107
108    /* copy to avoid wrap */
109    memcpy(in + 512, in, 32 * sizeof(*in));
110
111    apply_window(in + 16, win     , win + 512, suma, sumc, 16);
112    apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
113
114    SUM8(MACS, suma[0], win + 32, in + 48);
115
116    sumc[ 0] = 0;
117    sumb[16] = 0;
118    sumd[16] = 0;
119
120#define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
121            "movups " #sumd "(%4),       %%xmm0          \n\t" \
122            "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
123            "subps  " #suma "(%1),       %%xmm0          \n\t" \
124            "movaps        %%xmm0," #out1 "(%0)          \n\t" \
125\
126            "movups " #sumc "(%3),       %%xmm0          \n\t" \
127            "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
128            "addps  " #sumb "(%2),       %%xmm0          \n\t" \
129            "movaps        %%xmm0," #out2 "(%0)          \n\t"
130
131    if (incr == 1) {
132        __asm__ volatile(
133            SUMS( 0, 48,  4, 52,  0, 112)
134            SUMS(16, 32, 20, 36, 16,  96)
135            SUMS(32, 16, 36, 20, 32,  80)
136            SUMS(48,  0, 52,  4, 48,  64)
137
138            :"+&r"(out)
139            :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
140            :"memory"
141            );
142        out += 16*incr;
143    } else {
144        int j;
145        float *out2 = out + 32 * incr;
146        out[0  ]  = -suma[   0];
147        out += incr;
148        out2 -= incr;
149        for(j=1;j<16;j++) {
150            *out  = -suma[   j] + sumd[16-j];
151            *out2 =  sumb[16-j] + sumc[   j];
152            out  += incr;
153            out2 -= incr;
154        }
155    }
156
157    sum = 0;
158    SUM8(MLSS, sum, win + 16 + 32, in + 32);
159    *out = sum;
160}
161
162
/*
 * Define static void imdct36_blocks_<CPU1>(), which runs the IMDCT36 over
 * count consecutive blocks of 18 input coefficients.
 *
 *  - Blocks are first processed four at a time with
 *    ff_four_imdct36_float_<CPU2>(), using the interleaved mdct_win_sse
 *    tables.  Table [1] is used only for the first group and only when
 *    switch_point is non-zero.
 *  - The remaining count & 3 blocks are processed one at a time with
 *    ff_imdct36_float_<CPU1>(), using ff_mdct_win_float directly: window 0
 *    for blocks 0 and 1 when switch_point is non-zero, block_type otherwise,
 *    plus 4 for odd block numbers.
 *
 * Per group of four, in and buf advance by 4 * 18 floats and out by 4; per
 * single block, in advances by 18 floats and buf and out by 1.
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
                               int count, int switch_point, int block_type) \
{                                                                           \
    /* number of blocks covered by complete groups of four */           \
    int align_end = count - (count & 3);                                \
    int j;                                                              \
    for (j = 0; j < align_end; j+= 4) {                                 \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                        \
        /* select window */                                             \
        float *win = mdct_win_sse[switch_point && j < 4][block_type];   \
                                                                        \
        /* four blocks per call; tmpbuf is scratch space */             \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);      \
        in      += 4*18;                                                \
        buf     += 4*18;                                                \
        out     += 4;                                                   \
    }                                                                   \
    for (; j < count; j++) {                                            \
        /* select window; 4 & -(j & 1) is 4 for odd j and 0 for even j */ \
        int win_idx = (switch_point && j < 2) ? 0 : block_type;         \
        float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];       \
                                                                        \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                   \
                                                                        \
        in  += 18;                                                      \
        buf++;                                                          \
        out++;                                                          \
    }                                                                   \
}

/*
 * Instantiate imdct36_blocks_{sse,sse2,sse3,ssse3,avx}.  The first four share
 * the SSE four-block kernel and differ only in the single-block kernel.
 *
 * The wrappers are referenced from ff_mpadsp_init_mmx() only under
 * HAVE_YASM, and the kernels they call are external (presumably supplied by
 * the yasm-assembled objects), so they are not emitted in builds without
 * yasm: that avoids unused-function warnings and unresolved references when
 * dead static functions are not discarded.
 */
#if HAVE_YASM
DECL_IMDCT_BLOCKS(sse,sse)
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
DECL_IMDCT_BLOCKS(avx,avx)
#endif /* HAVE_YASM */

201void ff_mpadsp_init_mmx(MPADSPContext *s)
202{
203    int mm_flags = av_get_cpu_flags();
204
205    int i, j;
206    for (j = 0; j < 4; j++) {
207        for (i = 0; i < 40; i ++) {
208            mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
209            mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
210            mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
211            mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
212            mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
213            mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
214            mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
215            mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
216        }
217    }
218
219    if (mm_flags & AV_CPU_FLAG_SSE2) {
220        s->apply_window_float = apply_window_mp3;
221    }
222#if HAVE_YASM
223    if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
224        s->imdct36_blocks_float = imdct36_blocks_avx;
225#if HAVE_SSE
226    } else if (mm_flags & AV_CPU_FLAG_SSSE3) {
227        s->imdct36_blocks_float = imdct36_blocks_ssse3;
228    } else if (mm_flags & AV_CPU_FLAG_SSE3) {
229        s->imdct36_blocks_float = imdct36_blocks_sse3;
230    } else if (mm_flags & AV_CPU_FLAG_SSE2) {
231        s->imdct36_blocks_float = imdct36_blocks_sse2;
232    } else if (mm_flags & AV_CPU_FLAG_SSE) {
233        s->imdct36_blocks_float = imdct36_blocks_sse;
234#endif /* HAVE_SSE */
235    }
236#endif /* HAVE_YASM */
237}
238