/*
 * MMX optimized LPC DSP utils
 * Copyright (c) 2007 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86_cpu.h"
#include "dsputil_mmx.h"

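/*
 * Apply a Welch window to len int32 samples and write the result to w_data
 * as doubles. The weights (1.0 - x*x, with the running x pair kept in xmm7)
 * are generated incrementally and applied symmetrically to both halves of
 * the block, two samples from each end per iteration.
 */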
static void apply_welch_window_sse2(const int32_t *data, int len, double *w_data)
{
    double c = 2.0 / (len-1.0);
    int n2 = len>>1;
    x86_reg i = -n2*sizeof(int32_t);
    x86_reg j =  n2*sizeof(int32_t);
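    /* Load the loop invariants that the WELCH blocks below expect to stay
     * live across separate asm statements: xmm6 = {1.0, 1.0},
     * xmm5 = {2.0, 2.0} (the per-iteration step), and xmm7 = the x pair
     * for the outermost two samples. */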
    __asm__ volatile(
        "movsd   %0,     %%xmm7                \n\t"
        "movapd  "MANGLE(ff_pd_1)", %%xmm6     \n\t"
        "movapd  "MANGLE(ff_pd_2)", %%xmm5     \n\t"
        "movlhps %%xmm7, %%xmm7                \n\t"
        "subpd   %%xmm5, %%xmm7                \n\t"
        "addsd   %%xmm6, %%xmm7                \n\t"
        ::"m"(c)
    );
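/* One pass over both halves: compute the weight pair 1.0 - xmm7*xmm7, apply
 * it to two samples at the left edge and (mirrored with pshufd) two at the
 * right edge, then step xmm7. cvtpi2pd with a memory operand converts two
 * int32s directly to doubles without touching MMX state, so no emms is
 * needed. MOVPD and offset absorb the right half's store alignment: for odd
 * len the right-half stores land on odd double indices (movupd, offset -1),
 * for even len they are 16-byte aligned (movapd, offset -2). */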
#define WELCH(MOVPD, offset)\
    __asm__ volatile(\
        "1:                                    \n\t"\
        "movapd   %%xmm7,  %%xmm1              \n\t"\
        "mulpd    %%xmm1,  %%xmm1              \n\t"\
        "movapd   %%xmm6,  %%xmm0              \n\t"\
        "subpd    %%xmm1,  %%xmm0              \n\t"\
        "pshufd   $0x4e,   %%xmm0, %%xmm1      \n\t"\
        "cvtpi2pd (%3,%0), %%xmm2              \n\t"\
        "cvtpi2pd "#offset"*4(%3,%1), %%xmm3   \n\t"\
        "mulpd    %%xmm0,  %%xmm2              \n\t"\
        "mulpd    %%xmm1,  %%xmm3              \n\t"\
        "movapd   %%xmm2, (%2,%0,2)            \n\t"\
        MOVPD"    %%xmm3, "#offset"*8(%2,%1,2) \n\t"\
        "subpd    %%xmm5,  %%xmm7              \n\t"\
        "sub      $8,      %1                  \n\t"\
        "add      $8,      %0                  \n\t"\
        "jl 1b                                 \n\t"\
        :"+&r"(i), "+&r"(j)\
        :"r"(w_data+n2), "r"(data+n2)\
    );
    if(len&1)
        WELCH("movupd", -1)
    else
        WELCH("movapd", -2)
#undef WELCH
}

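/*
 * Compute the autocorrelation of the windowed signal at lags 0..lag.
 * A scalar model of what the asm below computes (a reading of the code,
 * not part of the original source); each sum is seeded with 1.0, the
 * ff_pd_1 loaded into the accumulators' low halves:
 *
 *     for (i = 0; i <= lag; i++) {
 *         double sum = 1.0;
 *         for (k = i; k < len; k++)
 *             sum += data1[k] * data1[k - i];
 *         autoc[i] = sum;
 *     }
 */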
void ff_lpc_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                   double *autoc)
{
    double tmp[len + lag + 2];
    double *data1 = tmp + lag;
    int j;

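    /* tmp is only guaranteed 8-byte alignment on the stack; bump data1 by
     * one double if needed so the movapd accesses below see a 16-byte
     * aligned buffer. */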
    if((x86_reg)data1 & 15)
        data1++;

    apply_welch_window_sse2(data, len, data1);

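    /* Zero-pad lag entries below data1 (the shifted loads reach down to
     * data1[-lag]) and one entry above (the paired loads read data1[len]
     * when len is odd). */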
    for(j=0; j<lag; j++)
        data1[j-lag]= 0.0;
    data1[len] = 0.0;

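    /* Two lags are accumulated per pass over the data; when lag is even,
     * the final pass folds in a third shift so autoc[lag] is written too. */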
    for(j=0; j<lag; j+=2){
        x86_reg i = -len*sizeof(double);
        if(j == lag-2) {
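            /* Final pass for even lag: xmm0/xmm1/xmm2 accumulate the pair
             * sums for shifts j, j+1 and j+2, which the movhlps/addsd tail
             * reduces horizontally into autoc[j..j+2]. */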
            __asm__ volatile(
                "movsd    "MANGLE(ff_pd_1)", %%xmm0 \n\t"
                "movsd    "MANGLE(ff_pd_1)", %%xmm1 \n\t"
                "movsd    "MANGLE(ff_pd_1)", %%xmm2 \n\t"
                "1:                                 \n\t"
                "movapd   (%2,%0), %%xmm3           \n\t"
                "movupd -8(%3,%0), %%xmm4           \n\t"
                "movapd   (%3,%0), %%xmm5           \n\t"
                "mulpd     %%xmm3, %%xmm4           \n\t"
                "mulpd     %%xmm3, %%xmm5           \n\t"
                "mulpd -16(%3,%0), %%xmm3           \n\t"
                "addpd     %%xmm4, %%xmm1           \n\t"
                "addpd     %%xmm5, %%xmm0           \n\t"
                "addpd     %%xmm3, %%xmm2           \n\t"
                "add       $16,    %0               \n\t"
                "jl 1b                              \n\t"
                "movhlps   %%xmm0, %%xmm3           \n\t"
                "movhlps   %%xmm1, %%xmm4           \n\t"
                "movhlps   %%xmm2, %%xmm5           \n\t"
                "addsd     %%xmm3, %%xmm0           \n\t"
                "addsd     %%xmm4, %%xmm1           \n\t"
                "addsd     %%xmm5, %%xmm2           \n\t"
                "movsd     %%xmm0,   (%1)           \n\t"
                "movsd     %%xmm1,  8(%1)           \n\t"
                "movsd     %%xmm2, 16(%1)           \n\t"
                :"+&r"(i)
                :"r"(autoc+j), "r"(data1+len), "r"(data1+len-j)
                :"memory"
            );
        } else {
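            /* Generic pass: xmm0 and xmm1 accumulate the pair sums for
             * shifts j and j+1; the movhlps/addsd tail reduces each pair
             * into autoc[j] and autoc[j+1]. */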
            __asm__ volatile(
                "movsd    "MANGLE(ff_pd_1)", %%xmm0 \n\t"
                "movsd    "MANGLE(ff_pd_1)", %%xmm1 \n\t"
                "1:                                 \n\t"
                "movapd   (%3,%0), %%xmm3           \n\t"
                "movupd -8(%4,%0), %%xmm4           \n\t"
                "mulpd     %%xmm3, %%xmm4           \n\t"
                "mulpd    (%4,%0), %%xmm3           \n\t"
                "addpd     %%xmm4, %%xmm1           \n\t"
                "addpd     %%xmm3, %%xmm0           \n\t"
                "add       $16,    %0               \n\t"
                "jl 1b                              \n\t"
                "movhlps   %%xmm0, %%xmm3           \n\t"
                "movhlps   %%xmm1, %%xmm4           \n\t"
                "addsd     %%xmm3, %%xmm0           \n\t"
                "addsd     %%xmm4, %%xmm1           \n\t"
                "movsd     %%xmm0, %1               \n\t"
                "movsd     %%xmm1, %2               \n\t"
                :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
                :"r"(data1+len), "r"(data1+len-j)
            );
        }
    }
}