1;******************************************************************************
2;* x86 optimized Format Conversion Utils
3;* Copyright (c) 2008 Loren Merritt
4;*
5;* This file is part of Libav.
6;*
7;* Libav is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* Libav is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with Libav; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "x86inc.asm"
23%include "x86util.asm"
24
25SECTION_TEXT
26
27;---------------------------------------------------------------------------------
28; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
29;---------------------------------------------------------------------------------
; Emit int32_to_float_fmul_scalar_<%1>: dst[i] = (float)src[i] * mul
;   %1 = instruction-set suffix for the symbol name (sse/sse2)
;   %2 = number of XMM registers used (for x86inc's WIN64 xmm accounting)
; Processes 8 elements per iteration; assumes len is a multiple of 8 and
; the buffers are 16-byte aligned (mova stores) -- per caller contract.
%macro INT32_TO_FLOAT_FMUL_SCALAR 2
%ifdef UNIX64
; UNIX64: 'mul' arrives in xmm0 already, so only 3 GPR args are declared
cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
%else
cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
%endif
%ifdef WIN64
    SWAP 0, 2              ; WIN64: 3rd arg 'mul' is in xmm2 -> move to m0
%elifdef ARCH_X86_32
    movss   m0, mulm       ; x86-32: load 'mul' from its stack slot
%endif
    SPLATD  m0             ; broadcast the scalar to all 4 float lanes
    shl     lenq, 2        ; len (int32 count) -> byte count
    add     srcq, lenq     ; point both pointers at the buffer end and
    add     dstq, lenq     ; walk upward with a negative index, so one
    neg     lenq           ; 'add' both advances and sets the loop flag
.loop:
%ifidn %1, sse2
    cvtdq2ps  m1, [srcq+lenq   ]   ; 4 x int32 -> 4 x float
    cvtdq2ps  m2, [srcq+lenq+16]
%else
    ; SSE1: cvtpi2ps converts only 2 ints into the low half of an xmm reg,
    ; so each 4-float vector is assembled from two halves via movlhps
    cvtpi2ps  m1, [srcq+lenq   ]
    cvtpi2ps  m3, [srcq+lenq+ 8]
    cvtpi2ps  m2, [srcq+lenq+16]
    cvtpi2ps  m4, [srcq+lenq+24]
    movlhps   m1, m3
    movlhps   m2, m4
%endif
    mulps     m1, m0
    mulps     m2, m0
    mova  [dstq+lenq   ], m1
    mova  [dstq+lenq+16], m2
    add     lenq, 32       ; 8 elements (32 bytes) per iteration
    jl .loop
    REP_RET
%endmacro
66
INIT_XMM
%define SPLATD SPLATD_SSE
%define movdqa movaps      ; SSE1 build: make 'mova' expand to an SSE1 store
INT32_TO_FLOAT_FMUL_SCALAR sse, 5
%undef movdqa
%define SPLATD SPLATD_SSE2
INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
%undef SPLATD
75
76
77;------------------------------------------------------------------------------
78; void ff_float_to_int16(int16_t *dst, const float *src, long len);
79;------------------------------------------------------------------------------
; Emit float_to_int16_<%1>: convert len floats to int16 with saturation.
;   %1 = instruction-set suffix (sse2/sse/3dnow)
;   %2 = number of XMM registers used
; Processes 8 samples per iteration; assumes len is a multiple of 8 and
; aligned buffers (mova) -- per caller contract.
%macro FLOAT_TO_INT16 2
cglobal float_to_int16_%1, 3,3,%2, dst, src, len
    add       lenq, lenq          ; len (samples) -> dst byte count (2 B/sample)
    lea       srcq, [srcq+2*lenq] ; src byte count is twice dst's (4 B floats)
    add       dstq, lenq          ; point at buffer ends, negative index walk
    neg       lenq
.loop:
%ifidn %1, sse2
    cvtps2dq    m0, [srcq+2*lenq   ]  ; 4 floats -> 4 int32 (MXCSR rounding)
    cvtps2dq    m1, [srcq+2*lenq+16]
    packssdw    m0, m1                ; saturating pack to 8 x int16
    mova  [dstq+lenq], m0
%else
    ; MMX path: each convert handles 2 floats
    ; (cvtps2pi here, or pf2id when the caller defined it for 3dnow)
    cvtps2pi    m0, [srcq+2*lenq   ]
    cvtps2pi    m1, [srcq+2*lenq+ 8]
    cvtps2pi    m2, [srcq+2*lenq+16]
    cvtps2pi    m3, [srcq+2*lenq+24]
    packssdw    m0, m1
    packssdw    m2, m3
    mova  [dstq+lenq  ], m0
    mova  [dstq+lenq+8], m2
%endif
    add       lenq, 16            ; 8 samples (16 dst bytes) per iteration
    js .loop
%ifnidn %1, sse2
    emms                          ; MMX paths must clear the x87/MMX state
%endif
    REP_RET
%endmacro
109
INIT_XMM
FLOAT_TO_INT16 sse2, 2
INIT_MMX
FLOAT_TO_INT16 sse, 0      ; SSE1 variant: cvtps2pi on MMX registers
%define cvtps2pi pf2id     ; 3DNow!: pf2id (truncating float->int) instead
FLOAT_TO_INT16 3dnow, 0
%undef cvtps2pi
117
118
119;-------------------------------------------------------------------------------
120; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
121;-------------------------------------------------------------------------------
; Emit float_to_int16_interleave2_<%1>: convert two planar float channels
; to one interleaved int16 stream (c0 c1 c0 c1 ...), with saturation.
;   %1 = instruction-set suffix (3dnow/sse/sse2)
; src is a float** holding the two channel pointers.
%macro FLOAT_TO_INT16_INTERLEAVE2 1
cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
    lea      lenq, [4*r2q]          ; bytes per source channel (len * 4);
                                    ; dst is also 4*len bytes (2 ch * 2 B)
    mov     src1q, [src0q+gprsize]  ; src0q starts as the float** array:
    mov     src0q, [src0q]          ; fetch both channel pointers
    add      dstq, lenq             ; point at buffer ends, walk upward
    add     src0q, lenq             ; with a shared negative index
    add     src1q, lenq
    neg      lenq
.loop:
%ifidn %1, sse2
    cvtps2dq   m0, [src0q+lenq]     ; 4 samples of channel 0 -> int32
    cvtps2dq   m1, [src1q+lenq]     ; 4 samples of channel 1
    packssdw   m0, m1               ; low half = ch0 words, high = ch1 words
    movhlps    m1, m0               ; bring ch1 words down
    punpcklwd  m0, m1               ; interleave word-wise: c0 c1 c0 c1 ...
    mova  [dstq+lenq], m0
%else
    ; MMX path: 2 samples per convert (cvtps2pi, or pf2id for 3dnow)
    cvtps2pi   m0, [src0q+lenq  ]
    cvtps2pi   m1, [src0q+lenq+8]
    cvtps2pi   m2, [src1q+lenq  ]
    cvtps2pi   m3, [src1q+lenq+8]
    packssdw   m0, m1               ; m0 = 4 words of channel 0
    packssdw   m2, m3               ; m2 = 4 words of channel 1
    mova       m1, m0
    punpcklwd  m0, m2               ; interleave low / high word pairs
    punpckhwd  m1, m2
    mova  [dstq+lenq  ], m0
    mova  [dstq+lenq+8], m1
%endif
    add      lenq, 16               ; 4 samples per channel per iteration
    js .loop
%ifnidn %1, sse2
    emms                            ; clear MMX state on the MMX paths
%endif
    REP_RET
%endmacro
159
INIT_MMX
%define cvtps2pi pf2id              ; 3DNow!: truncating float->int convert
FLOAT_TO_INT16_INTERLEAVE2 3dnow
%undef cvtps2pi
%define movdqa movaps               ; keep the SSE1 build free of SSE2 movdqa
FLOAT_TO_INT16_INTERLEAVE2 sse
%undef movdqa
INIT_XMM
FLOAT_TO_INT16_INTERLEAVE2 sse2
169
170
; PSWAPD %1, %2 -- %1 = %2 with its two dwords swapped (emulates the
; 3DNow!-extension pswapd instruction on older ISAs).
%macro PSWAPD_SSE 2
    pshufw %1, %2, 0x4e    ; SSE1 pshufw on MMX regs; 0x4e swaps dword halves
%endmacro
%macro PSWAPD_3DN1 2       ; plain 3DNow!: no pshufw / pswapd available
    movq  %1, %2
    psrlq %1, 32           ; move the high dword into the low position
    punpckldq %1, %2       ; append the original low dword as the high one
%endmacro
179
; Emit float_to_int16_interleave6_<%1>: convert 6 planar float channels to
; one interleaved int16 stream. Each iteration consumes 2 samples per
; channel and writes 24 bytes (12 int16: c0..c5 for sample 0, then sample 1).
;   %1 = instruction-set suffix (sse/3dnow/3dn2); callers choose the
;        convert (cvtps2pi/pf2id) and pswapd flavour via %defines.
%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
    %define lend r10d          ; x86-64: keep the sample counter in a register
    mov     lend, r2d
%else
    %define lend dword r2m     ; x86-32: reference it in its stack slot
%endif
    mov src1q, [srcq+1*gprsize]  ; fetch the 6 channel pointers from src[]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq,  [srcq]
    sub src1q, srcq              ; keep channels 1-5 as offsets from channel
    sub src2q, srcq              ; 0 so only srcq has to be incremented in
    sub src3q, srcq              ; the loop
    sub src4q, srcq
    sub src5q, srcq
.loop:
    cvtps2pi   mm0, [srcq]        ; 2 samples from each of the 6 channels
    cvtps2pi   mm1, [srcq+src1q]
    cvtps2pi   mm2, [srcq+src2q]
    cvtps2pi   mm3, [srcq+src3q]
    cvtps2pi   mm4, [srcq+src4q]
    cvtps2pi   mm5, [srcq+src5q]
    packssdw   mm0, mm3           ; saturating pack into word pairs:
    packssdw   mm1, mm4           ; mm0 = ch0|ch3, mm1 = ch1|ch4, mm2 = ch2|ch5
    packssdw   mm2, mm5
    ; word/dword shuffle network rearranging the packed pairs into the
    ; fully interleaved output order
    pswapd     mm3, mm0
    punpcklwd  mm0, mm1
    punpckhwd  mm1, mm2
    punpcklwd  mm2, mm3
    pswapd     mm3, mm0
    punpckldq  mm0, mm2
    punpckhdq  mm2, mm1
    punpckldq  mm1, mm3
    movq [dstq   ], mm0
    movq [dstq+16], mm2
    movq [dstq+ 8], mm1
    add srcq, 8                   ; 2 floats consumed per channel
    add dstq, 24                  ; 12 int16 written
    sub lend, 2                   ; 2 samples done
    jg .loop
    emms                          ; clear MMX state before returning
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
228
%define pswapd PSWAPD_SSE          ; SSE1: emulate pswapd with pshufw
FLOAT_TO_INT16_INTERLEAVE6 sse
%define cvtps2pi pf2id             ; 3DNow!: truncating float->int convert
%define pswapd PSWAPD_3DN1         ; plain 3DNow!: 3-insn pswapd emulation
FLOAT_TO_INT16_INTERLEAVE6 3dnow
%undef pswapd                      ; 3dn2: use the native pswapd instruction
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi
237
238;-----------------------------------------------------------------------------
239; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
240;-----------------------------------------------------------------------------
241
; Emit float_interleave6_<%1>: interleave 6 planar float channels into one
; stream (c0 c1 c2 c3 c4 c5 per sample). Each iteration handles mmsize/4
; samples per channel and writes 6*mmsize bytes.
;   %1 = instruction-set suffix (mmx/sse), %2 = XMM register count
%macro FLOAT_INTERLEAVE6 2
cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
    %define lend r10d              ; x86-64: sample counter in a register
    mov     lend, r2d
%else
    %define lend dword r2m         ; x86-32: counter lives in its stack slot
%endif
    mov    src1q, [srcq+1*gprsize] ; fetch the 6 channel pointers from src[]
    mov    src2q, [srcq+2*gprsize]
    mov    src3q, [srcq+3*gprsize]
    mov    src4q, [srcq+4*gprsize]
    mov    src5q, [srcq+5*gprsize]
    mov     srcq, [srcq]
    sub    src1q, srcq             ; keep channels 1-5 as offsets from
    sub    src2q, srcq             ; channel 0, so only srcq is advanced
    sub    src3q, srcq             ; in the loop
    sub    src4q, srcq
    sub    src5q, srcq
.loop:
%ifidn %1, sse
    movaps    m0, [srcq]           ; 4 samples from each channel
    movaps    m1, [srcq+src1q]
    movaps    m2, [srcq+src2q]
    movaps    m3, [srcq+src3q]
    movaps    m4, [srcq+src4q]
    movaps    m5, [srcq+src5q]

    SBUTTERFLYPS 0, 1, 6           ; interleave dwords of channel pairs
    SBUTTERFLYPS 2, 3, 6           ; (m6 is the scratch register)
    SBUTTERFLYPS 4, 5, 6

    ; assemble and store the first three output vectors (samples 0-1)
    movaps    m6, m4
    shufps    m4, m0, 0xe4
    movlhps   m0, m2
    movhlps   m6, m2
    movaps [dstq   ], m0
    movaps [dstq+16], m4
    movaps [dstq+32], m6

    ; same combine for the remaining three vectors (samples 2-3)
    movaps    m6, m5
    shufps    m5, m1, 0xe4
    movlhps   m1, m3
    movhlps   m6, m3
    movaps [dstq+48], m1
    movaps [dstq+64], m5
    movaps [dstq+80], m6
%else ; mmx
    movq       m0, [srcq]          ; 2 samples from each channel
    movq       m1, [srcq+src1q]
    movq       m2, [srcq+src2q]
    movq       m3, [srcq+src3q]
    movq       m4, [srcq+src4q]
    movq       m5, [srcq+src5q]

    SBUTTERFLY dq, 0, 1, 6         ; dword interleave of channel pairs
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6
    movq [dstq   ], m0             ; sample 0: c0c1 c2c3 c4c5
    movq [dstq+ 8], m2
    movq [dstq+16], m4
    movq [dstq+24], m1             ; sample 1: c0c1 c2c3 c4c5
    movq [dstq+32], m3
    movq [dstq+40], m5
%endif
    add      srcq, mmsize          ; mmsize/4 samples consumed per channel
    add      dstq, mmsize*6
    sub      lend, mmsize/4
    jg .loop
%ifidn %1, mmx
    emms                           ; clear MMX state on the MMX variant
%endif
    REP_RET
%endmacro
316
INIT_MMX
FLOAT_INTERLEAVE6 mmx, 0
INIT_XMM
FLOAT_INTERLEAVE6 sse, 7
321
322;-----------------------------------------------------------------------------
323; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
324;-----------------------------------------------------------------------------
325
; Emit float_interleave2_<%1>: interleave 2 planar float channels into one
; stream (c0 c1 c0 c1 ...). Each iteration reads 2*mmsize bytes per channel
; and writes 4*mmsize bytes (mmsize/2 samples per channel).
;   %1 = instruction-set suffix (mmx/sse), %2 = XMM register count
; MOVPS/PUNPCKLDQ/PUNPCKHDQ are %defined by the caller per variant.
%macro FLOAT_INTERLEAVE2 2
cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
    mov     src1q, [srcq+gprsize]   ; src is float**: fetch both channel ptrs
    mov      srcq, [srcq        ]
    sub     src1q, srcq             ; keep channel 1 as an offset from
                                    ; channel 0 so only srcq is advanced
.loop:                              ; fix: label needs ':' like every other
                                    ; loop label here (avoids NASM's
                                    ; orphan-label warning)
    MOVPS      m0, [srcq             ]  ; two vectors from each channel
    MOVPS      m1, [srcq+src1q       ]
    MOVPS      m3, [srcq      +mmsize]
    MOVPS      m4, [srcq+src1q+mmsize]

    MOVPS      m2, m0
    PUNPCKLDQ  m0, m1               ; dword interleave: c0 c1 c0 c1 ...
    PUNPCKHDQ  m2, m1

    MOVPS      m1, m3
    PUNPCKLDQ  m3, m4
    PUNPCKHDQ  m1, m4

    MOVPS [dstq         ], m0
    MOVPS [dstq+1*mmsize], m2
    MOVPS [dstq+2*mmsize], m3
    MOVPS [dstq+3*mmsize], m1

    add      srcq, mmsize*2         ; mmsize/2 samples consumed per channel
    add      dstq, mmsize*4
    sub      lend, mmsize/2
    jg .loop
%ifidn %1, mmx
    emms                            ; clear MMX state on the MMX variant
%endif
    REP_RET
%endmacro
359
INIT_MMX
%define MOVPS     movq              ; MMX: 8-byte moves, integer unpacks
%define PUNPCKLDQ punpckldq
%define PUNPCKHDQ punpckhdq
FLOAT_INTERLEAVE2 mmx, 0
INIT_XMM
%define MOVPS     movaps            ; SSE: 16-byte moves, float unpacks
%define PUNPCKLDQ unpcklps
%define PUNPCKHDQ unpckhps
FLOAT_INTERLEAVE2 sse, 5
370