;******************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
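; Rough C-equivalent sketch of what the loop below computes (illustrative
; only, not the reference implementation; the *_c name is just a placeholder).
; Assumes len and the buffer alignment satisfy what the aligned SIMD loop
; requires (64 bytes, i.e. 16 floats, are processed per iteration):
;
;     static void vector_fmul_c(float *dst, const float *src0,
;                               const float *src1, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src0[i] * src1[i];
;     }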
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova      m0,   [src0q + lenq + (a+0)*mmsize]
    mova      m1,   [src0q + lenq + (a+1)*mmsize]
    mulps     m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps     m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova      [dstq + lenq + (a+0)*mmsize], m0
    mova      [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep

    sub       lenq, 64
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
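; Rough C-equivalent sketch (illustrative only, not the reference code; the
; *_c name is just a placeholder). Assumes len and the buffer alignment
; satisfy what the aligned loads and stores below require:
;
;     static void vector_fmac_scalar_c(float *dst, const float *src,
;                                      float mul, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] += src[i] * mul;
;     }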

%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    shufps      xm0, xm0, 0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*4-64]
.loop:
%if cpuflag(fma3)
    mova     m1,     [dstq+lenq]
    mova     m2,     [dstq+lenq+1*mmsize]
    fmaddps  m1, m0, [srcq+lenq], m1
    fmaddps  m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
    mulps    m1, m0, [srcq+lenq]
    mulps    m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
    mulps    m3, m0, [srcq+lenq+2*mmsize]
    mulps    m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
    addps    m1, m1, [dstq+lenq]
    addps    m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
    addps    m3, m3, [dstq+lenq+2*mmsize]
    addps    m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
    mova  [dstq+lenq], m1
    mova  [dstq+lenq+1*mmsize], m2
%if mmsize < 32
    mova  [dstq+lenq+2*mmsize], m3
    mova  [dstq+lenq+3*mmsize], m4
%endif ; mmsize
    sub    lenq, 64
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
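; Rough C-equivalent sketch (illustrative only, not the reference code; the
; *_c name is just a placeholder). Assumes len and the buffer alignment
; satisfy what the aligned SIMD loop below requires:
;
;     static void vector_fmul_scalar_c(float *dst, const float *src,
;                                      float mul, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src[i] * mul;
;     }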

%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss    m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps   m0, m0, 0
    lea    lenq, [lend*4-mmsize]
.loop:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
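; Rough C-equivalent sketch (illustrative only, not the reference code; the
; *_c name is just a placeholder). Same as the float variant above, but on
; doubles; len and alignment must satisfy the aligned loop below:
;
;     static void vector_dmul_scalar_c(double *dst, const double *src,
;                                      double mul, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src[i] * mul;
;     }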

%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD   m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    movlhps       xm0, xm0
%if cpuflag(avx)
    vinsertf128   ym0, ym0, xm0, 1
%endif
%endif
    lea          lenq, [lend*8-2*mmsize]
.loop:
    mulpd          m1, m0, [srcq+lenq       ]
    mulpd          m2, m0, [srcq+lenq+mmsize]
    mova   [dstq+lenq       ], m1
    mova   [dstq+lenq+mmsize], m2
    sub          lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
;                    const float *src1, const float *win, int len);
;-----------------------------------------------------------------------------
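; Rough C-equivalent sketch of the windowed overlap below (illustrative only,
; not the reference code; the *_c name is just a placeholder). The pointer
; adjustments mirror the add/neg setup in the asm; len and alignment are
; assumed to satisfy the aligned loads and stores of the loop:
;
;     static void vector_fmul_window_c(float *dst, const float *src0,
;                                      const float *src1, const float *win,
;                                      int len)
;     {
;         dst  += len;
;         win  += len;
;         src0 += len;
;         for (int i = -len, j = len - 1; i < 0; i++, j--) {
;             float s0 = src0[i], s1 = src1[j];
;             float wi = win[i],  wj = win[j];
;             dst[i] = s0 * wj - s1 * wi;
;             dst[j] = s0 * wi + s1 * wj;
;         }
;     }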
%macro VECTOR_FMUL_WINDOW 0
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    shl     lend, 2
    lea    len1q, [lenq - mmsize]
    add    src0q, lenq
    add     dstq, lenq
    add     winq, lenq
    neg     lenq
.loop:
    mova      m0, [winq  + lenq]
    mova      m4, [src0q + lenq]
%if cpuflag(sse)
    mova      m1, [winq  + len1q]
    mova      m5, [src1q + len1q]
    shufps    m1, m1, 0x1b
    shufps    m5, m5, 0x1b
    mova      m2, m0
    mova      m3, m1
    mulps     m2, m4
    mulps     m3, m5
    mulps     m1, m4
    mulps     m0, m5
    addps     m2, m3
    subps     m1, m0
    shufps    m2, m2, 0x1b
%else
    pswapd    m1, [winq  + len1q]
    pswapd    m5, [src1q + len1q]
    mova      m2, m0
    mova      m3, m1
    pfmul     m2, m4
    pfmul     m3, m5
    pfmul     m1, m4
    pfmul     m0, m5
    pfadd     m2, m3
    pfsub     m1, m0
    pswapd    m2, m2
%endif
    mova      [dstq + lenq], m1
    mova      [dstq + len1q], m2
    sub       len1q, mmsize
    add       lenq,  mmsize
    jl .loop
%if mmsize == 8
    femms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnowext
VECTOR_FMUL_WINDOW
INIT_XMM sse
VECTOR_FMUL_WINDOW

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
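; Rough C-equivalent sketch (illustrative only, not the reference code; the
; *_c name is just a placeholder). Assumes len and the buffer alignment
; satisfy what the aligned SIMD loop below requires:
;
;     static void vector_fmul_add_c(float *dst, const float *src0,
;                                   const float *src1, const float *src2,
;                                   int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src0[i] * src1[i] + src2[i];
;     }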
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova    m0,   [src0q + lenq]
    mova    m1,   [src0q + lenq + mmsize]
%if cpuflag(fma3)
    mova    m2,     [src2q + lenq]
    mova    m3,     [src2q + lenq + mmsize]
    fmaddps m0, m0, [src1q + lenq], m2
    fmaddps m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
%endif
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1

    sub     lenq,   2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
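; Rough C-equivalent sketch (illustrative only, not the reference code; the
; *_c name is just a placeholder). src1 is read back to front; len and
; alignment are assumed to satisfy the aligned loop below:
;
;     static void vector_fmul_reverse_c(float *dst, const float *src0,
;                                       const float *src1, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src0[i] * src1[len - 1 - i];
;     }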
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova    m0, [src1q]
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
%endif
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    mova    [dstq + lenq + mmsize], m0
    mova    [dstq + lenq], m1
    add     src1q, 2*mmsize
    sub     lenq,  2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
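; Rough C-equivalent sketch (illustrative only, not the reference code; the
; *_c name is just a placeholder). The SIMD version accumulates four partial
; sums before the final horizontal add, so rounding may differ slightly from
; this scalar order; len and alignment must satisfy the aligned loop below:
;
;     static float scalarproduct_float_c(const float *v1, const float *v2,
;                                        int len)
;     {
;         float p = 0.0f;
;         for (int i = 0; i < len; i++)
;             p += v1[i] * v2[i];
;         return p;
;     }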
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    neg   offsetq
    shl   offsetq, 2
    sub       v1q, offsetq
    sub       v2q, offsetq
    xorps    xmm0, xmm0
.loop:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add   offsetq, 16
    js .loop
    movhlps  xmm1, xmm0
    addps    xmm0, xmm1
    movss    xmm1, xmm0
    shufps   xmm0, xmm0, 1
    addss    xmm0, xmm1
%if ARCH_X86_64 == 0
    movss     r0m,  xmm0
    fld dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
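; Rough C-equivalent sketch (illustrative only, not the reference code; the
; *_c name is just a placeholder). Assumes len and the buffer alignment
; satisfy what the aligned SIMD loop below requires:
;
;     static void butterflies_float_c(float *src0, float *src1, int len)
;     {
;         for (int i = 0; i < len; i++) {
;             float t = src0[i] - src1[i];
;             src0[i] += src1[i];
;             src1[i]  = t;
;         }
;     }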
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
%if ARCH_X86_64
    movsxd    lenq, lend
%endif
    test      lenq, lenq
    jz .end
    shl       lenq, 2
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1
    addps       m0, m0, m1
    mova        [src1q + lenq], m2
    mova        [src0q + lenq], m0
    add       lenq, mmsize
    jl .loop
.end:
    REP_RET