;******************************************************************************
;* SSE-optimized functions for the DCA decoder
;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pf_inv16:  times 4 dd 0x3D800000 ; 1/16

SECTION_TEXT

; void decode_hf(float dst[DCA_SUBBANDS][8], const int32_t vq_num[DCA_SUBBANDS],
;                const int8_t hf_vq[1024][32], intptr_t vq_offset,
;                int32_t scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end)
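;
; Roughly equivalent scalar code, as a hedged sketch (not verbatim from the
; C reference; ptr and fscale are illustrative names):
;~ for (l = start; l < end; l++) {
;~     const int8_t *ptr = &hf_vq[vq_num[l]][vq_offset];
;~     float fscale = scale[l][0] * (1 / 16.0f);
;~     for (i = 0; i < 8; i++)
;~         dst[l][i] = ptr[i] * fscale;
;~ }
; The SIMD version below handles one subband per iteration, widening eight
; int8 codebook entries to floats and scaling them in two vector registers.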

%macro DECODE_HF 0
cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
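    ; prologue: point srcq at hf_vq + vq_offset, scale start and end from
    ; subband indices to byte offsets into the int32 arrays, and spill the
    ; scaled end back to its stack slot; offsetq is then reused as DICT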
    lea       srcq, [srcq + offsetq]
    shl     startq, 2
    mov    offsetd, endm
%define DICT offsetq
    shl    offsetq, 2
    mov       endm, offsetq
.loop:
%if ARCH_X86_64
    mov    offsetd, [scaleq + 2 * startq]
    cvtsi2ss    m0, offsetd
%else
    cvtsi2ss    m0, [scaleq + 2 * startq]
%endif
    mov    offsetd, [numq + startq]
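    ; DICT now holds vq_num[start]; the shl by 5 below turns it into the
    ; byte offset of the selected 32-byte codebook row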
    mulss       m0, [pf_inv16]
    shl       DICT, 5
    shufps      m0, m0, 0
%if cpuflag(sse2)
%if cpuflag(sse4)
    pmovsxbd    m1, [srcq + DICT + 0]
    pmovsxbd    m2, [srcq + DICT + 4]
%else
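    ; no pmovsxbd before SSE4: sign-extend manually by duplicating each byte
    ; up to dword width and arithmetic-shifting right by 24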
    movq        m1, [srcq + DICT]
    punpcklbw   m1, m1
    mova        m2, m1
    punpcklwd   m1, m1
    punpckhwd   m2, m2
    psrad       m1, 24
    psrad       m2, 24
%endif
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
%else
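    ; plain SSE lacks 128-bit integer ops, so widen the bytes in MMX
    ; registers and convert them pairwise with cvtpi2ps; this is why the
    ; epilogue below needs emms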
    movd       mm0, [srcq + DICT + 0]
    movd       mm1, [srcq + DICT + 4]
    punpcklbw  mm0, mm0
    punpcklbw  mm1, mm1
    movq       mm2, mm0
    movq       mm3, mm1
    punpcklwd  mm0, mm0
    punpcklwd  mm1, mm1
    punpckhwd  mm2, mm2
    punpckhwd  mm3, mm3
    psrad      mm0, 24
    psrad      mm1, 24
    psrad      mm2, 24
    psrad      mm3, 24
    cvtpi2ps    m1, mm0
    cvtpi2ps    m2, mm1
    cvtpi2ps    m3, mm2
    cvtpi2ps    m4, mm3
    shufps      m1, m3, q1010
    shufps      m2, m4, q1010
%endif
    mulps       m1, m0
    mulps       m2, m0
    mova [dstq + 8 * startq +  0], m1
    mova [dstq + 8 * startq + 16], m2
    add     startq, 4
    cmp     startq, endm
    jl       .loop
.end:
%if notcpuflag(sse2)
    emms
%endif
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_XMM sse
DECODE_HF
%endif

INIT_XMM sse2
DECODE_HF

INIT_XMM sse4
DECODE_HF

; FIR_LOOP arguments: %1 = 0 for the v0 pass, 1 for the v1 pass;
; %2 = in1; %3 = in2 (optional)
%macro FIR_LOOP 2-3
.loop%1:
%define va          m1
%define vb          m2
%if %1
%define OFFSET      0
%else
%define OFFSET      NUM_COEF*count
%endif
; for v0, incrementing and for v1, decrementing
    mova        va, [cf0q + OFFSET]
    mova        vb, [cf0q + OFFSET + 4*NUM_COEF]
%if %0 == 3
    mova        m4, [cf0q + OFFSET + mmsize]
    mova        m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
%endif
    mulps       va, %2
    mulps       vb, %2
%if %0 == 3
%if cpuflag(fma3)
    fmaddps     va, m4, %3, va
    fmaddps     vb, m0, %3, vb
%else
    mulps       m4, %3
    mulps       m0, %3
    addps       va, m4
    addps       vb, m0
%endif
%endif
    ; va = va1 va2 va3 va4
    ; vb = vb1 vb2 vb3 vb4
%if %1
    SWAP        va, vb
%endif
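    ; horizontal reduction of both accumulators at once: interleave the two
    ; vectors, add the halves, then add the remaining pair, leaving
    ; {sum(va), sum(vb)} in the low two floats of vb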
    mova        m4, va
    unpcklps    va, vb ; va1 vb1 va2 vb2
    unpckhps    m4, vb ; va3 vb3 va4 vb4
    addps       m4, va ; va1+3 vb1+3 va2+4 vb2+4
    movhlps     vb, m4 ; va2+4  vb2+4
    addps       vb, m4 ; va1..4 vb1..4
    movlps  [outq + count], vb
%if %1
    sub       cf0q, 8*NUM_COEF
%endif
    add      count, 8
    jl   .loop%1
%endmacro

; void dca_lfe_fir(float *out, float *in, float *coefs)
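;
; Each FIR_LOOP iteration computes two adjacent outputs as dot products of
; the reversed input window against consecutive coefficient rows: the v0
; pass walks the table forward, the v1 pass walks it backward. A hedged
; pseudo-C sketch (dot(), in_rev and the offset helpers are illustrative,
; not real code):
;~ out[k]           = dot(in_rev, cf0 + fwd_offset(k), NUM_COEF); /* v0 */
;~ out[NUM_OUT + k] = dot(in_rev, cf0 + bwd_offset(k), NUM_COEF); /* v1 */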
%macro DCA_LFE_FIR 1
cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
%define IN1       m3
%define IN2       m5
%define count     inq
%define NUM_COEF  4*(2-%1)
%define NUM_OUT   32*(%1+1)

    movu     IN1, [inq + 4 - 1*mmsize]
    shufps   IN1, IN1, q0123
%if %1 == 0
    movu     IN2, [inq + 4 - 2*mmsize]
    shufps   IN2, IN2, q0123
%endif

    mov    count, -4*NUM_OUT
    add     cf0q, 4*NUM_COEF*NUM_OUT
    add     outq, 4*NUM_OUT
    ; compute v0 first
%if %1 == 0
    FIR_LOOP   0, IN1, IN2
%else
    FIR_LOOP   0, IN1
%endif
    shufps   IN1, IN1, q0123
    mov    count, -4*NUM_OUT
    ; cf1 already correctly positioned
    add     outq, 4*NUM_OUT          ; outq now at out2
    sub     cf0q, 8*NUM_COEF
%if %1 == 0
    shufps   IN2, IN2, q0123
    FIR_LOOP   1, IN2, IN1
%else
    FIR_LOOP   1, IN1
%endif
    RET
%endmacro

INIT_XMM sse
DCA_LFE_FIR 0
DCA_LFE_FIR 1
%if HAVE_FMA3_EXTERNAL
INIT_XMM fma3
DCA_LFE_FIR 0
%endif

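; SETZERO: clear a full register. pxor is used on SSE2 builds, but AVX1 has
; no 256-bit integer pxor (that requires AVX2), so AVX builds use the
; three-operand xorps instead.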
%macro SETZERO 1
%if cpuflag(sse2) && notcpuflag(avx)
    pxor          %1, %1
%else
    xorps         %1, %1, %1
%endif
%endmacro

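; SHUF dst, addr, tmp: load one vector of floats and reverse its element
; order, so that synth_buf can be read backwards; the AVX form reads from
; addr - 16 and swaps the 128-bit lanes first (tmp is only clobbered there).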
%macro SHUF 3
%if cpuflag(avx)
    mova          %3, [%2 - 16]
    vperm2f128    %1, %3, %3, 1
    vshufps       %1, %1, %1, q0123
%elif cpuflag(sse2)
    pshufd        %1, [%2], q0123
%else
    mova          %1, [%2]
    shufps        %1, %1, q0123
%endif
%endmacro

%macro INNER_LOOP   1
    ; reading backwards:  ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
    ;~ b += window[i + j + 16] * (synth_buf[i + j])
    SHUF          m5,  ptr2 + j + (15 - 3) * 4, m6
    mova          m6, [ptr1 + j]
%if ARCH_X86_64
    SHUF         m11,  ptr2 + j + (15 - 3) * 4 - mmsize, m12
    mova         m12, [ptr1 + j + mmsize]
%endif
%if cpuflag(fma3)
    fmaddps       m2, m6,  [win + %1 + j + 16 * 4], m2
    fnmaddps      m1, m5,  [win + %1 + j], m1
%if ARCH_X86_64
    fmaddps       m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
    fnmaddps      m7, m11, [win + %1 + j + mmsize], m7
%endif
%else ; non-FMA
    mulps         m6, m6,  [win + %1 + j + 16 * 4]
    mulps         m5, m5,  [win + %1 + j]
%if ARCH_X86_64
    mulps        m12, m12, [win + %1 + j + mmsize + 16 * 4]
    mulps        m11, m11, [win + %1 + j + mmsize]
%endif
    addps         m2, m2, m6
    subps         m1, m1, m5
%if ARCH_X86_64
    addps         m8, m8, m12
    subps         m7, m7, m11
%endif
%endif ; cpuflag(fma3)
    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
    SHUF          m6,  ptr2 + j + (31 - 3) * 4, m5
    mova          m5, [ptr1 + j + 16 * 4]
%if ARCH_X86_64
    SHUF         m12,  ptr2 + j + (31 - 3) * 4 - mmsize, m11
    mova         m11, [ptr1 + j + mmsize + 16 * 4]
%endif
%if cpuflag(fma3)
    fmaddps       m3, m5,  [win + %1 + j + 32 * 4], m3
    fmaddps       m4, m6,  [win + %1 + j + 48 * 4], m4
%if ARCH_X86_64
    fmaddps       m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
    fmaddps      m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
%endif
%else ; non-FMA
    mulps         m5, m5,  [win + %1 + j + 32 * 4]
    mulps         m6, m6,  [win + %1 + j + 48 * 4]
%if ARCH_X86_64
    mulps        m11, m11, [win + %1 + j + mmsize + 32 * 4]
    mulps        m12, m12, [win + %1 + j + mmsize + 48 * 4]
%endif
    addps         m3, m3, m5
    addps         m4, m4, m6
%if ARCH_X86_64
    addps         m9, m9, m11
    addps        m10, m10, m12
%endif
%endif ; cpuflag(fma3)
    sub            j, 64 * 4
%endmacro

; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
;                                  const float window[512], float out[32],
;                                  intptr_t offset, float scale)
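;
; Outer structure, as a hedged C-style sketch (the per-term math is shown by
; the ;~ comments inside INNER_LOOP above):
;~ for (i = 0; i < 16; i++) {
;~     float a = synth_buf2[i], b = synth_buf2[i + 16], c = 0, d = 0;
;~     for (j = 0; j < 512 - offset; j += 64)
;~         /* accumulate a, b, c, d; see INNER_LOOP */;
;~     out[i]             = a * scale;
;~     out[i + 16]        = b * scale;
;~     synth_buf2[i]      = c;
;~     synth_buf2[i + 16] = d;
;~ }
; The asm vectorizes the i loop (several lanes per pass) and splits the j
; loop into .loop1 and .loop2 because the synth_buf ring buffer wraps.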
%macro SYNTH_FILTER 0
cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
                              synth_buf, synth_buf2, window, out, off, scale
%define scale m0
%if ARCH_X86_32 || WIN64
%if cpuflag(sse2) && notcpuflag(avx)
    movd       scale, scalem
    SPLATD        m0
%else
    VBROADCASTSS  m0, scalem
%endif
; Make sure offset is in a register and not on the stack
%define OFFQ  r4q
%else
    SPLATD      xmm0
%if cpuflag(avx)
    vinsertf128   m0, m0, xmm0, 1
%endif
%define OFFQ  offq
%endif
    ; prepare the inner loop counter limit for .loop1
    mov          r5q, 480
    sub          r5q, offmp
    and          r5q, -64
    shl          r5q, 2
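    ; r5q = ((480 - offset) & -64) * 4: the byte offset of the largest
    ; multiple of 64 floats below 512 - offset, i.e. the starting j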
%if ARCH_X86_32 || notcpuflag(avx)
    mov         OFFQ, r5q
%define i        r5q
    mov            i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize  ; main loop counter
%else
%define i 0
%define OFFQ  r5q
%endif

%define buf2     synth_buf2q
%if ARCH_X86_32
    mov         buf2, synth_buf2mp
%endif
.mainloop:
    ; m1 = a  m2 = b  m3 = c  m4 = d
    SETZERO       m3
    SETZERO       m4
    mova          m1, [buf2 + i]
    mova          m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
%define ptr1     r0q
%define ptr2     r1q
%define win      r2q
%define j        r3q
    mov          win, windowm
    mov         ptr1, synth_bufm
%if ARCH_X86_32 || notcpuflag(avx)
    add          win, i
    add         ptr1, i
%endif
%else ; ARCH_X86_64
%define ptr1     r6q
%define ptr2     r7q ; must be loaded
%define win      r8q
%define j        r9q
    SETZERO       m9
    SETZERO      m10
    mova          m7, [buf2 + i + mmsize]
    mova          m8, [buf2 + i + mmsize + 16 * 4]
    lea          win, [windowq + i]
    lea         ptr1, [synth_bufq + i]
%endif
    mov         ptr2, synth_bufmp
    ; prepare the inner loop counter
    mov            j, OFFQ
%if ARCH_X86_32 || notcpuflag(avx)
    sub         ptr2, i
%endif
.loop1:
    INNER_LOOP  0
    jge       .loop1

    mov            j, 448 * 4
    sub            j, OFFQ
    jz          .end
    sub         ptr1, j
    sub         ptr2, j
    add          win, OFFQ ; now at j-64, so define OFFSET
    sub            j, 64 * 4
.loop2:
    INNER_LOOP  64 * 4
    jge       .loop2

.end:
%if ARCH_X86_32
    mov         buf2, synth_buf2m ; needed for the next iteration anyway
    mov         outq, outmp       ; clobbers j, which is reset each iteration
%endif
    ;~ out[i]      = a * scale;
    ;~ out[i + 16] = b * scale;
    mulps         m1, m1, scale
    mulps         m2, m2, scale
%if ARCH_X86_64
    mulps         m7, m7, scale
    mulps         m8, m8, scale
%endif
    ;~ synth_buf2[i]      = c;
    ;~ synth_buf2[i + 16] = d;
    mova   [buf2 + i +  0 * 4], m3
    mova   [buf2 + i + 16 * 4], m4
%if ARCH_X86_64
    mova   [buf2 + i +  0 * 4 + mmsize], m9
    mova   [buf2 + i + 16 * 4 + mmsize], m10
%endif
    ;~ out[i]      = a;
    ;~ out[i + 16] = b;
    mova   [outq + i +  0 * 4], m1
    mova   [outq + i + 16 * 4], m2
%if ARCH_X86_64
    mova   [outq + i +  0 * 4 + mmsize], m7
    mova   [outq + i + 16 * 4 + mmsize], m8
%endif
%if ARCH_X86_32 || notcpuflag(avx)
    sub            i, (ARCH_X86_64 + 1) * mmsize
    jge    .mainloop
%endif
    RET
%endmacro

%if ARCH_X86_32
INIT_XMM sse
SYNTH_FILTER
%endif
INIT_XMM sse2
SYNTH_FILTER
INIT_YMM avx
SYNTH_FILTER
INIT_YMM fma3
SYNTH_FILTER
