;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_TEXT

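; Helper: convert 2 packed single-precision floats to 2 packed 32-bit ints,
; using cvtps2pi on SSE (rounds according to MXCSR) or pf2id on 3DNow!
; (truncates towards zero).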
%macro CVTPS2PI 2
%if cpuflag(sse)
    cvtps2pi %1, %2
%elif cpuflag(3dnow)
    pf2id %1, %2
%endif
%endmacro

;------------------------------------------------------------------------------
; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul,
;                                    int len);
;------------------------------------------------------------------------------
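; Reference behaviour: dst[i] = src[i] * mul for i in [0, len).
; The loop handles 8 elements per iteration, so len is expected to be a
; multiple of 8; dst is expected to be 16-byte aligned (aligned mova stores),
; and the SSE2 version also loads src with aligned accesses.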
%macro INT32_TO_FLOAT_FMUL_SCALAR 1
%if UNIX64
cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
%else
cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
%endif
%if WIN64
    SWAP 0, 2
%elif ARCH_X86_32
    movss   m0, mulm
%endif
    SPLATD  m0
    shl     lenq, 2
    add     srcq, lenq
    add     dstq, lenq
    neg     lenq
.loop:
%if cpuflag(sse2)
    cvtdq2ps  m1, [srcq+lenq   ]
    cvtdq2ps  m2, [srcq+lenq+16]
%else
    cvtpi2ps  m1, [srcq+lenq   ]
    cvtpi2ps  m3, [srcq+lenq+ 8]
    cvtpi2ps  m2, [srcq+lenq+16]
    cvtpi2ps  m4, [srcq+lenq+24]
    movlhps   m1, m3
    movlhps   m2, m4
%endif
    mulps     m1, m0
    mulps     m2, m0
    mova  [dstq+lenq   ], m1
    mova  [dstq+lenq+16], m2
    add     lenq, 32
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
INT32_TO_FLOAT_FMUL_SCALAR 5
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_SCALAR 3


;------------------------------------------------------------------------------
; void ff_float_to_int16(int16_t *dst, const float *src, long len);
;------------------------------------------------------------------------------
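; Convert floats to signed 16-bit integers with saturation (convert to int32,
; then packssdw).  8 samples are handled per iteration; the SSE2 version uses
; aligned 16-byte loads and stores.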
%macro FLOAT_TO_INT16 1
cglobal float_to_int16, 3, 3, %1, dst, src, len
    add       lenq, lenq
    lea       srcq, [srcq+2*lenq]
    add       dstq, lenq
    neg       lenq
.loop:
%if cpuflag(sse2)
    cvtps2dq    m0, [srcq+2*lenq   ]
    cvtps2dq    m1, [srcq+2*lenq+16]
    packssdw    m0, m1
    mova  [dstq+lenq], m0
%else
    CVTPS2PI    m0, [srcq+2*lenq   ]
    CVTPS2PI    m1, [srcq+2*lenq+ 8]
    CVTPS2PI    m2, [srcq+2*lenq+16]
    CVTPS2PI    m3, [srcq+2*lenq+24]
    packssdw    m0, m1
    packssdw    m2, m3
    mova  [dstq+lenq  ], m0
    mova  [dstq+lenq+8], m2
%endif
    add       lenq, 16
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_XMM sse2
FLOAT_TO_INT16 2
INIT_MMX sse
FLOAT_TO_INT16 0
INIT_MMX 3dnow
FLOAT_TO_INT16 0

;------------------------------------------------------------------------------
; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
;------------------------------------------------------------------------------
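; Same conversion as ff_float_to_int16, but the output samples are written
; with a stride of `step` int16 elements instead of contiguously; the packed
; results are scattered through general-purpose registers.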
%macro FLOAT_TO_INT16_STEP 1
cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
    add       lenq, lenq
    lea       srcq, [srcq+2*lenq]
    lea     step3q, [stepq*3]
    neg       lenq
.loop:
%if cpuflag(sse2)
    cvtps2dq    m0, [srcq+2*lenq   ]
    cvtps2dq    m1, [srcq+2*lenq+16]
    packssdw    m0, m1
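    ; m0 now holds 8 saturated int16 samples; extract them via GPRs and
    ; scatter them to dst with a stride of step int16 elements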
    movd       v1d, m0
    psrldq      m0, 4
    movd       v2d, m0
    psrldq      m0, 4
    mov     [dstq], v1w
    mov  [dstq+stepq*4], v2w
    shr        v1d, 16
    shr        v2d, 16
    mov  [dstq+stepq*2], v1w
    mov  [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
    movd       v1d, m0
    psrldq      m0, 4
    movd       v2d, m0
    mov     [dstq], v1w
    mov  [dstq+stepq*4], v2w
    shr        v1d, 16
    shr        v2d, 16
    mov  [dstq+stepq*2], v1w
    mov  [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
%else
    CVTPS2PI    m0, [srcq+2*lenq   ]
    CVTPS2PI    m1, [srcq+2*lenq+ 8]
    CVTPS2PI    m2, [srcq+2*lenq+16]
    CVTPS2PI    m3, [srcq+2*lenq+24]
    packssdw    m0, m1
    packssdw    m2, m3
    movd       v1d, m0
    psrlq       m0, 32
    movd       v2d, m0
    mov     [dstq], v1w
    mov  [dstq+stepq*4], v2w
    shr        v1d, 16
    shr        v2d, 16
    mov  [dstq+stepq*2], v1w
    mov  [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
    movd       v1d, m2
    psrlq       m2, 32
    movd       v2d, m2
    mov     [dstq], v1w
    mov  [dstq+stepq*4], v2w
    shr        v1d, 16
    shr        v2d, 16
    mov  [dstq+stepq*2], v1w
    mov  [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
%endif
    add       lenq, 16
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_XMM sse2
FLOAT_TO_INT16_STEP 2
INIT_MMX sse
FLOAT_TO_INT16_STEP 0
INIT_MMX 3dnow
FLOAT_TO_INT16_STEP 0

;-------------------------------------------------------------------------------
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
;-------------------------------------------------------------------------------
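; Convert two float channels to int16 and interleave them:
; dst[2*i] = convert(src[0][i]), dst[2*i+1] = convert(src[1][i]).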
%macro FLOAT_TO_INT16_INTERLEAVE2 0
cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
    lea      lenq, [4*r2q]
    mov     src1q, [src0q+gprsize]
    mov     src0q, [src0q]
    add      dstq, lenq
    add     src0q, lenq
    add     src1q, lenq
    neg      lenq
.loop:
%if cpuflag(sse2)
    cvtps2dq   m0, [src0q+lenq]
    cvtps2dq   m1, [src1q+lenq]
    packssdw   m0, m1
    movhlps    m1, m0
    punpcklwd  m0, m1
    mova  [dstq+lenq], m0
%else
    CVTPS2PI   m0, [src0q+lenq  ]
    CVTPS2PI   m1, [src0q+lenq+8]
    CVTPS2PI   m2, [src1q+lenq  ]
    CVTPS2PI   m3, [src1q+lenq+8]
    packssdw   m0, m1
    packssdw   m2, m3
    mova       m1, m0
    punpcklwd  m0, m2
    punpckhwd  m1, m2
    mova  [dstq+lenq  ], m0
    mova  [dstq+lenq+8], m1
%endif
    add      lenq, 16
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE2
INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE2
INIT_XMM sse2
FLOAT_TO_INT16_INTERLEAVE2

;-----------------------------------------------------------------------------
; void ff_float_to_int16_interleave6(int16_t *dst, const float **src, int len);
;-----------------------------------------------------------------------------
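; Convert 6 float channels to int16 and interleave them frame by frame:
; dst[6*i+c] = convert(src[c][i]).  Two frames (12 output samples) are
; produced per loop iteration.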
%macro FLOAT_TO_INT16_INTERLEAVE6 0
cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq,  [srcq]
    sub src1q, srcq
    sub src2q, srcq
    sub src3q, srcq
    sub src4q, srcq
    sub src5q, srcq
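    ; src1q..src5q now hold byte offsets relative to srcq, so incrementing
    ; srcq alone advances the read position in all six channels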
.loop:
    CVTPS2PI   mm0, [srcq]
    CVTPS2PI   mm1, [srcq+src1q]
    CVTPS2PI   mm2, [srcq+src2q]
    CVTPS2PI   mm3, [srcq+src3q]
    CVTPS2PI   mm4, [srcq+src4q]
    CVTPS2PI   mm5, [srcq+src5q]
    packssdw   mm0, mm3
    packssdw   mm1, mm4
    packssdw   mm2, mm5
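    ; as words: mm0 = {c0,c0,c3,c3}, mm1 = {c1,c1,c4,c4}, mm2 = {c2,c2,c5,c5}
    ; (two samples per channel); the shuffles below reorder this into two
    ; consecutive 6-channel output frames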
    PSWAPD     mm3, mm0
    punpcklwd  mm0, mm1
    punpckhwd  mm1, mm2
    punpcklwd  mm2, mm3
    PSWAPD     mm3, mm0
    punpckldq  mm0, mm2
    punpckhdq  mm2, mm1
    punpckldq  mm1, mm3
    movq [dstq   ], mm0
    movq [dstq+16], mm2
    movq [dstq+ 8], mm1
    add srcq, 8
    add dstq, 24
    sub lend, 2
    jg .loop
    emms
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6

INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnowext
FLOAT_TO_INT16_INTERLEAVE6

;-----------------------------------------------------------------------------
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
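; Interleave 6 float channels frame by frame: dst[6*i+c] = src[c][i].
; The SSE version handles 4 frames per iteration with aligned 16-byte
; accesses, the MMX version 2 frames.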

%macro FLOAT_INTERLEAVE6 1
cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov    src1q, [srcq+1*gprsize]
    mov    src2q, [srcq+2*gprsize]
    mov    src3q, [srcq+3*gprsize]
    mov    src4q, [srcq+4*gprsize]
    mov    src5q, [srcq+5*gprsize]
    mov     srcq, [srcq]
    sub    src1q, srcq
    sub    src2q, srcq
    sub    src3q, srcq
    sub    src4q, srcq
    sub    src5q, srcq
.loop:
%if cpuflag(sse)
    movaps    m0, [srcq]
    movaps    m1, [srcq+src1q]
    movaps    m2, [srcq+src2q]
    movaps    m3, [srcq+src3q]
    movaps    m4, [srcq+src4q]
    movaps    m5, [srcq+src5q]

    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

    movaps    m6, m4
    shufps    m4, m0, 0xe4
    movlhps   m0, m2
    movhlps   m6, m2
    movaps [dstq   ], m0
    movaps [dstq+16], m4
    movaps [dstq+32], m6

    movaps    m6, m5
    shufps    m5, m1, 0xe4
    movlhps   m1, m3
    movhlps   m6, m3
    movaps [dstq+48], m1
    movaps [dstq+64], m5
    movaps [dstq+80], m6
%else ; mmx
    movq       m0, [srcq]
    movq       m1, [srcq+src1q]
    movq       m2, [srcq+src2q]
    movq       m3, [srcq+src3q]
    movq       m4, [srcq+src4q]
    movq       m5, [srcq+src5q]

    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6
    movq [dstq   ], m0
    movq [dstq+ 8], m2
    movq [dstq+16], m4
    movq [dstq+24], m1
    movq [dstq+32], m3
    movq [dstq+40], m5
%endif
    add      srcq, mmsize
    add      dstq, mmsize*6
    sub      lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX mmx
FLOAT_INTERLEAVE6 0
INIT_XMM sse
FLOAT_INTERLEAVE6 7

;-----------------------------------------------------------------------------
; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
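; Interleave 2 float channels: dst[2*i] = src[0][i], dst[2*i+1] = src[1][i].
; Buffers are expected to be 16-byte aligned for the SSE version (aligned
; mova accesses).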

%macro FLOAT_INTERLEAVE2 1
cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
    mov     src1q, [srcq+gprsize]
    mov      srcq, [srcq        ]
    sub     src1q, srcq
.loop:
    mova       m0, [srcq             ]
    mova       m1, [srcq+src1q       ]
    mova       m3, [srcq      +mmsize]
    mova       m4, [srcq+src1q+mmsize]

    mova       m2, m0
    PUNPCKLDQ  m0, m1
    PUNPCKHDQ  m2, m1

    mova       m1, m3
    PUNPCKLDQ  m3, m4
    PUNPCKHDQ  m1, m4

    mova  [dstq         ], m0
    mova  [dstq+1*mmsize], m2
    mova  [dstq+2*mmsize], m3
    mova  [dstq+3*mmsize], m1

    add      srcq, mmsize*2
    add      dstq, mmsize*4
    sub      lend, mmsize/2
    jg .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX mmx
%define PUNPCKLDQ punpckldq
%define PUNPCKHDQ punpckhdq
FLOAT_INTERLEAVE2 0
INIT_XMM sse
%define PUNPCKLDQ unpcklps
%define PUNPCKHDQ unpckhps
FLOAT_INTERLEAVE2 5