1;******************************************************************************
2;* Copyright (c) 2012 Michael Niedermayer
3;*
4;* This file is part of FFmpeg.
5;*
6;* FFmpeg is free software; you can redistribute it and/or
7;* modify it under the terms of the GNU Lesser General Public
8;* License as published by the Free Software Foundation; either
9;* version 2.1 of the License, or (at your option) any later version.
10;*
11;* FFmpeg is distributed in the hope that it will be useful,
12;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14;* Lesser General Public License for more details.
15;*
16;* You should have received a copy of the GNU Lesser General Public
17;* License along with FFmpeg; if not, write to the Free Software
18;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19;******************************************************************************
20
21%include "libavutil/x86/x86util.asm"
22
SECTION_RODATA 32

; Scale factors, each replicated over 8 dwords (32 bytes) so the same table
; can be loaded as a full vector of any width used in this file.
flt2pm31:         times 8 dd 4.6566129e-10      ; ~2^-31
flt2p31:          times 8 dd 2147483648.0       ; 2^31
flt2p15:          times 8 dd 32768.0            ; 2^15

; pshufb control mask: even-numbered words are gathered into the low 8 bytes,
; odd-numbered words into the high 8 bytes.
word_unpack_shuf: db  0,  1,  4,  5,  8,  9, 12, 13
                  db  2,  3,  6,  7, 10, 11, 14, 15
29
30SECTION .text
31
32
; macro arguments: to, from, a/u, log2_outsize, log2_insize, conversion macro, init macro
;------------------------------------------------------------------------------
; PACK_2CH to, from, a/u, log2_outsize, log2_insize, conv, init
;
; Emits pack_2ch_<from>_to_<to>_<a|u>(dst, src, len): interleaves two planar
; source channels into a single packed destination, running the conversion
; macro on the data on the way.
;
;   %1, %2  type names, only used to build the symbol name
;   %3      a = aligned vector loads/stores, u = unaligned
;   %4      log2 of the destination sample size in bytes
;   %5      log2 of the source sample size in bytes
;   %6      conversion macro expanded once per loop iteration
;   %7      init macro expanded once before the loop
;
; dst and src are arrays of pointers: dst[0], src[0] and src[1] are read.
; len is scaled by the sample sizes, i.e. it counts samples per channel.
; m0-m5 are used; src2 is an extra GPR.
;
; The "a" variant verifies that all three pointers are mmsize-aligned and
; otherwise jumps to the ..._u_int label inside the "u" variant, so the "u"
; variant has to be instantiated for the same types and instruction set.
;
; NOTE(review): the loop only handles whole vectors and has no scalar tail;
; presumably callers round len accordingly - confirm against the C caller.
;------------------------------------------------------------------------------
%macro PACK_2CH 5-7
cglobal pack_2ch_%2_to_%1_%3, 3, 4, 6, dst, src, len, src2
    mov src2q   , [srcq+gprsize]            ; src2 = src[1]
    mov srcq    , [srcq]                    ; src  = src[0]
    mov dstq    , [dstq]                    ; dst  = dst[0]
%ifidn %3, a
    ; any misaligned pointer -> continue in the unaligned implementation
    test dstq, mmsize-1
        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
    test src2q, mmsize-1
        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
%else
; entry point used by the aligned variant when its alignment check fails
pack_2ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    ; point past the end of each buffer and count len up from -len to 0
    lea     srcq , [srcq  + (1<<%5)*lenq]
    lea     src2q, [src2q + (1<<%5)*lenq]
    lea     dstq , [dstq  + (2<<%4)*lenq]
    neg     lenq
    %7 m0,m1,m2,m3,m4,m5
.next:
%if %4 >= %5
    ; output is at least as wide as input: interleave first, convert after
    mov%3     m0, [         srcq +(1<<%5)*lenq]
    mova      m1, m0
    mov%3     m2, [         src2q+(1<<%5)*lenq]
%if %5 == 1
    punpcklwd m0, m2
    punpckhwd m1, m2
%else
    punpckldq m0, m2
    punpckhdq m1, m2
%endif
    %6 m0,m1,m2,m3,m4,m5
%else
    ; output is narrower than input: convert each channel, then interleave
    ; the resulting words
    mov%3     m0, [         srcq +(1<<%5)*lenq]
    mov%3     m1, [mmsize + srcq +(1<<%5)*lenq]
    mov%3     m2, [         src2q+(1<<%5)*lenq]
    mov%3     m3, [mmsize + src2q+(1<<%5)*lenq]
    %6 m0,m1,m2,m3,m4,m5
    mova      m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1
    SWAP 1,2                                ; high half is stored from m1
%endif
    mov%3 [           dstq+(2<<%4)*lenq], m0
    mov%3 [  mmsize + dstq+(2<<%4)*lenq], m1
%if %4 > %5
    ; widening conversions produce four output vectors per iteration
    mov%3 [2*mmsize + dstq+(2<<%4)*lenq], m2
    mov%3 [3*mmsize + dstq+(2<<%4)*lenq], m3
    add lenq, 4*mmsize/(2<<%4)
%else
    add lenq, 2*mmsize/(2<<%4)
%endif
        jl .next
    REP_RET
%endmacro
90
;------------------------------------------------------------------------------
; UNPACK_2CH to, from, a/u, log2_outsize, log2_insize, conv, init
;
; Emits unpack_2ch_<from>_to_<to>_<a|u>(dst, src, len): splits one packed
; two-channel source into two planar destinations, running the conversion
; macro on the deinterleaved data.
;
;   %1, %2  type names, only used to build the symbol name
;   %3      a = aligned vector loads/stores, u = unaligned
;   %4      log2 of the destination sample size in bytes
;   %5      log2 of the source sample size in bytes
;   %6      conversion macro expanded once per loop iteration
;   %7      init macro expanded once before the loop
;
; dst and src are arrays of pointers: dst[0], dst[1] and src[0] are read.
; len is scaled by the sample sizes, i.e. it counts samples per channel.
; m0-m5 are used, plus m6 for the pshufb mask on SSSE3-capable builds with
; 16-bit input.
;
; The "a" variant verifies that all three pointers are mmsize-aligned and
; otherwise jumps to the ..._u_int label inside the "u" variant, so the "u"
; variant has to be instantiated for the same types and instruction set.
;
; NOTE(review): the loop only handles whole vectors and has no scalar tail;
; presumably callers round len accordingly - confirm against the C caller.
;------------------------------------------------------------------------------
%macro UNPACK_2CH 5-7
cglobal unpack_2ch_%2_to_%1_%3, 3, 4, 7, dst, src, len, dst2
    mov dst2q   , [dstq+gprsize]            ; dst2 = dst[1]
    mov srcq    , [srcq]                    ; src  = src[0]
    mov dstq    , [dstq]                    ; dst  = dst[0]
%ifidn %3, a
    ; any misaligned pointer -> continue in the unaligned implementation
    test dstq, mmsize-1
        jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
    test dst2q, mmsize-1
        jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
%else
; entry point used by the aligned variant when its alignment check fails
unpack_2ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    ; point past the end of each buffer and count len up from -len to 0
    lea     srcq , [srcq  + (2<<%5)*lenq]
    lea     dstq , [dstq  + (1<<%4)*lenq]
    lea     dst2q, [dst2q + (1<<%4)*lenq]
    neg     lenq
    %7 m0,m1,m2,m3,m4,m5
%if %5 == 1
%if cpuflag(ssse3)
    ; the shuffle mask is only consumed by the pshufb path below
    mova      m6, [word_unpack_shuf]
%endif
%endif
.next:
    mov%3     m0, [           srcq +(2<<%5)*lenq]
    mov%3     m2, [  mmsize + srcq +(2<<%5)*lenq]
%if %5 == 1
%if cpuflag(ssse3)
    ; 16-bit input: sort even/odd words inside each vector with pshufb,
    ; then merge the matching qword halves of the two vectors
    pshufb    m0, m6
    mova      m1, m0
    pshufb    m2, m6
    punpcklqdq m0,m2
    punpckhqdq m1,m2
%else
    ; 16-bit input without pshufb: three rounds of word interleaving
    mova      m1, m0
    punpcklwd m0,m2
    punpckhwd m1,m2

    mova      m2, m0
    punpcklwd m0,m1
    punpckhwd m2,m1

    mova      m1, m0
    punpcklwd m0,m2
    punpckhwd m1,m2
%endif
%else
    ; 32-bit input: pick even dwords into m0 and odd dwords into m1
    mova      m1, m0
    shufps    m0, m2, 10001000b
    shufps    m1, m2, 11011101b
%endif
%if %4 < %5
    ; narrowing conversion: deinterleave a second pair of vectors so the
    ; conversion macro gets two vectors per channel
    mov%3     m2, [2*mmsize + srcq +(2<<%5)*lenq]
    mova      m3, m2
    mov%3     m4, [3*mmsize + srcq +(2<<%5)*lenq]
    shufps    m2, m4, 10001000b
    shufps    m3, m4, 11011101b
    SWAP 1,2                                ; group as m0,m1 / m2,m3 per channel
%endif
    %6 m0,m1,m2,m3,m4,m5
    mov%3 [           dstq+(1<<%4)*lenq], m0
%if %4 > %5
    ; widening conversion: two output vectors per channel
    mov%3 [          dst2q+(1<<%4)*lenq], m2
    mov%3 [ mmsize +  dstq+(1<<%4)*lenq], m1
    mov%3 [ mmsize + dst2q+(1<<%4)*lenq], m3
    add lenq, 2*mmsize/(1<<%4)
%else
    mov%3 [          dst2q+(1<<%4)*lenq], m1
    add lenq, mmsize/(1<<%4)
%endif
        jl .next
    REP_RET
%endmacro
162
;------------------------------------------------------------------------------
; CONV to, from, a/u, log2_outsize, log2_insize, conv, init
;
; Emits <from>_to_<to>_<a|u>(dst, src, len): converts one plane of samples
; from one sample format to another.
;
;   %1, %2  type names, only used to build the symbol name
;   %3      a = aligned vector loads/stores, u = unaligned
;   %4      log2 of the destination sample size in bytes
;   %5      log2 of the source sample size in bytes
;   %6      conversion macro expanded once per loop iteration
;   %7      init macro expanded once before the loop
;
; dst and src are arrays of pointers of which only element 0 is read.
; len is scaled by the sample sizes, i.e. it counts samples.
; m0-m5 are passed to the conversion and init macros.
;
; The "a" variant verifies that both pointers are mmsize-aligned and otherwise
; jumps to the ..._u_int label inside the "u" variant, so the "u" variant has
; to be instantiated for the same types and instruction set.
;
; NOTE(review): the loop only handles whole vectors and has no scalar tail;
; presumably callers round len accordingly - confirm against the C caller.
;------------------------------------------------------------------------------
%macro CONV 5-7
cglobal %2_to_%1_%3, 3, 3, 6, dst, src, len
    mov srcq    , [srcq]                    ; src = src[0]
    mov dstq    , [dstq]                    ; dst = dst[0]
%ifidn %3, a
    ; any misaligned pointer -> continue in the unaligned implementation
    test dstq, mmsize-1
        jne %2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne %2_to_%1_u_int %+ SUFFIX
%else
; entry point used by the aligned variant when its alignment check fails
%2_to_%1_u_int %+ SUFFIX:
%endif
    ; point past the end of each buffer and count len up from -len to 0
    lea     srcq , [srcq  + (1<<%5)*lenq]
    lea     dstq , [dstq  + (1<<%4)*lenq]
    neg     lenq
    %7 m0,m1,m2,m3,m4,m5
.next:
    mov%3     m0, [           srcq +(1<<%5)*lenq]
    mov%3     m1, [  mmsize + srcq +(1<<%5)*lenq]
%if %4 < %5
    ; narrowing conversion: four input vectors produce two output vectors
    mov%3     m2, [2*mmsize + srcq +(1<<%5)*lenq]
    mov%3     m3, [3*mmsize + srcq +(1<<%5)*lenq]
%endif
    %6 m0,m1,m2,m3,m4,m5
    mov%3 [           dstq+(1<<%4)*lenq], m0
    mov%3 [  mmsize + dstq+(1<<%4)*lenq], m1
%if %4 > %5
    ; widening conversion: two input vectors produce four output vectors
    mov%3 [2*mmsize + dstq+(1<<%4)*lenq], m2
    mov%3 [3*mmsize + dstq+(1<<%4)*lenq], m3
    add lenq, 4*mmsize/(1<<%4)
%else
    add lenq, 2*mmsize/(1<<%4)
%endif
        jl .next
%if mmsize == 8
    emms                                    ; leave MMX state before returning
    RET
%else
    REP_RET
%endif
%endmacro
204
;------------------------------------------------------------------------------
; PACK_6CH to, from, a/u, log2_outsize, log2_insize, conv, init
;
; Emits pack_6ch_<from>_to_<to>_<a|u>(dst, src, len): interleaves six planar
; source channels into a single packed destination.
;
;   %1, %2  type names, only used to build the symbol name
;   %3      a = aligned vector loads/stores, u = unaligned
;   %4, %5  log2 sample sizes; accepted for symmetry with the other macros
;           but not referenced by this body
;   %6      conversion macro, expanded on each pair of output vectors
;           (SSE4 and later path only)
;   %7      init macro, receives m7 as its 5th argument
;
; dst and src are arrays of pointers: dst[0] and src[0]..src[5] are read.
; len is taken from the third argument as a 32-bit value; on x86-32 it is
; decremented in place in its stack slot because no GPR is left for it.
;
; Eight vector registers are declared because m7 holds the init macro's
; constant; declaring fewer would leave xmm7 unsaved on Win64.
;
; The "a" variant verifies that dst and all six source pointers are
; mmsize-aligned and otherwise jumps to the ..._u_int label inside the "u"
; variant, so the "u" variant has to be instantiated for the same types and
; instruction set.
;
; The non-SSE4 (MMX) path does not expand %6, so it is only correct for
; identity conversions (NOP_N).
;------------------------------------------------------------------------------
%macro PACK_6CH 5-7
cglobal pack_6ch_%2_to_%1_%3, 2,8,8, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov    src1q, [srcq+1*gprsize]
    mov    src2q, [srcq+2*gprsize]
    mov    src3q, [srcq+3*gprsize]
    mov    src4q, [srcq+4*gprsize]
    mov    src5q, [srcq+5*gprsize]
    mov     srcq, [srcq]
    mov     dstq, [dstq]
%ifidn %3, a
    ; any misaligned pointer -> continue in the unaligned implementation
    test dstq, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src1q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src2q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src3q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src4q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src5q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
%else
; entry point used by the aligned variant when its alignment check fails
pack_6ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    ; keep src1..src5 as offsets relative to src so that a single pointer
    ; increment advances all six planes
    sub    src1q, srcq
    sub    src2q, srcq
    sub    src3q, srcq
    sub    src4q, srcq
    sub    src5q, srcq
    ; the constant in m7 is never written inside the loop, load it once
    %7 x,x,x,x,m7,x
.loop:
    mov%3     m0, [srcq      ]
    mov%3     m1, [srcq+src1q]
    mov%3     m2, [srcq+src2q]
    mov%3     m3, [srcq+src3q]
    mov%3     m4, [srcq+src4q]
    mov%3     m5, [srcq+src5q]
%if cpuflag(sse4)
    ; rearrange six planar vectors into six packed output vectors
    ; (m6 is the scratch register for SBUTTERFLYPS)
    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

    blendps   m6, m4, m0, 1100b
    movlhps   m0, m2
    movhlps   m4, m2
    blendps   m2, m5, m1, 1100b
    movlhps   m1, m3
    movhlps   m5, m3

    ; convert two vectors at a time; m3 is free to be used as scratch here
    %6 m0,m6,x,x,m7,m3
    %6 m4,m1,x,x,m7,m3
    %6 m2,m5,x,x,m7,m3

    mov %+ %3 %+ ps [dstq   ], m0
    mov %+ %3 %+ ps [dstq+16], m6
    mov %+ %3 %+ ps [dstq+32], m4
    mov %+ %3 %+ ps [dstq+48], m1
    mov %+ %3 %+ ps [dstq+64], m2
    mov %+ %3 %+ ps [dstq+80], m5
%else ; mmx
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6

    movq   [dstq   ], m0
    movq   [dstq+ 8], m2
    movq   [dstq+16], m4
    movq   [dstq+24], m1
    movq   [dstq+32], m3
    movq   [dstq+40], m5
%endif
    add      srcq, mmsize
    add      dstq, mmsize*6
    sub      lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms                                    ; leave MMX state before returning
    RET
%else
    REP_RET
%endif
%endmacro
293
; Widen the int16 samples in m0,m1 to int32 in m0..m3 (m0,m1 from the old m0;
; m2,m3 from the old m1). Each word lands in the upper half of a zeroed
; dword, i.e. the value is shifted left by 16. The macro arguments are not
; referenced; the old m4 is overwritten.
%macro INT16_TO_INT32_N 6
    ; second input vector first, so that m1 becomes free
    pxor      m2, m2
    punpcklwd m2, m1
    pxor      m3, m3
    punpckhwd m3, m1
    ; rename: the first input vector is now called m4, m0 is a fresh register
    SWAP 4,0
    pxor      m0, m0
    punpcklwd m0, m4
    pxor      m1, m1
    punpckhwd m1, m4
%endmacro
305
; Narrow the int32 samples in m0..m3 to int16 by keeping the upper 16 bits:
; m0,m1 are packed into m0 and m2,m3 end up in m1. The macro arguments are
; not referenced.
%macro INT32_TO_INT16_N 6
    psrad     m0, 16
    psrad     m1, 16
    packssdw  m0, m1
    psrad     m2, 16
    psrad     m3, 16
    packssdw  m2, m3
    SWAP 1,2                    ; second result is returned under the name m1
%endmacro
315
; Load the int32 -> float scale factor (2^-31) into the register given as the
; 5th argument.
%macro INT32_TO_FLOAT_INIT 6
    mova      %5, [flt2pm31]
%endmacro

; Convert the int32 vectors %1 and %2 to float in place and scale them by the
; constant held in %5.
%macro INT32_TO_FLOAT_N 6
    cvtdq2ps  %1, %1
    mulps     %1, %1, %5
    cvtdq2ps  %2, %2
    mulps     %2, %2, %5
%endmacro
325
; Load the float -> int32 scale factor (2^31) into the register given as the
; 5th argument.
%macro FLOAT_TO_INT32_INIT 6
    mova      %5, [flt2p31]
%endmacro

; Convert the float vectors %1 and %2 to int32 in place. %5 holds 2^31 and %6
; is a scratch register. Lanes whose scaled value is not below 2^31 get an
; all-ones compare mask (-1) added to the converted integer.
%macro FLOAT_TO_INT32_N 6
    mulps     %1, %5
    cvtps2dq  %6, %1
    cmpnltps  %1, %5            ; -1 where scaled value >= 2^31, else 0
    paddd     %1, %6

    mulps     %2, %5
    cvtps2dq  %6, %2
    cmpnltps  %2, %5
    paddd     %2, %6
%endmacro
339
; Load the scale factor 2^-31 into m5 (the macro arguments are not used).
%macro INT16_TO_FLOAT_INIT 6
    mova      m5, [flt2pm31]
%endmacro

; Convert the int16 samples in m0,m1 to float in m0..m3: widen them with
; INT16_TO_INT32_N, convert to float and scale by the constant in m5.
%macro INT16_TO_FLOAT_N 6
    INT16_TO_INT32_N %1,%2,%3,%4,%5,%6
    cvtdq2ps  m0, m0
    mulps     m0, m0, m5
    cvtdq2ps  m1, m1
    mulps     m1, m1, m5
    cvtdq2ps  m2, m2
    mulps     m2, m2, m5
    cvtdq2ps  m3, m3
    mulps     m3, m3, m5
%endmacro
354
; Load the scale factor 2^15 into m5 (the macro arguments are not used).
%macro FLOAT_TO_INT16_INIT 6
    mova      m5, [flt2p15]
%endmacro

; Convert the float samples in m0..m3 to int16: scale by the constant in m5,
; convert to int32 and pack with signed saturation. m0,m1 end up in m0 and
; m2,m3 in m1.
%macro FLOAT_TO_INT16_N 6
    ; first output vector
    mulps     m0, m5
    cvtps2dq  m0, m0
    mulps     m1, m5
    cvtps2dq  m1, m1
    packssdw  m0, m1
    ; second output vector, built in m1 which is free again
    mulps     m2, m5
    cvtps2dq  m1, m2
    mulps     m3, m5
    cvtps2dq  m3, m3
    packssdw  m1, m3
%endmacro
370
; Placeholder conversion/init macro: accepts up to six arguments and emits
; no code. Passed to the generator macros when no conversion or no init step
; is needed.
%macro NOP_N 0-6
%endmacro
373
; MMX versions. Each "u" variant is emitted before its "a" twin, which jumps
; to the "u" body when a pointer is misaligned.
INIT_MMX mmx
CONV     int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV     int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
CONV     int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
CONV     int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

PACK_6CH float, float, u, 2, 2, NOP_N,            NOP_N
PACK_6CH float, float, a, 2, 2, NOP_N,            NOP_N
382
; SSE2 versions. Each "u" variant is emitted before its "a" twin, which jumps
; to the "u" body when a pointer is misaligned.
INIT_XMM sse2
; integer <-> integer
CONV       int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV       int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
CONV       int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
CONV       int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

PACK_2CH   int16, int16, u, 1, 1, NOP_N,            NOP_N
PACK_2CH   int16, int16, a, 1, 1, NOP_N,            NOP_N
PACK_2CH   int32, int32, u, 2, 2, NOP_N,            NOP_N
PACK_2CH   int32, int32, a, 2, 2, NOP_N,            NOP_N
PACK_2CH   int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH   int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH   int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
PACK_2CH   int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

UNPACK_2CH int16, int16, u, 1, 1, NOP_N,            NOP_N
UNPACK_2CH int16, int16, a, 1, 1, NOP_N,            NOP_N
UNPACK_2CH int32, int32, u, 2, 2, NOP_N,            NOP_N
UNPACK_2CH int32, int32, a, 2, 2, NOP_N,            NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
UNPACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

; integer <-> float
CONV       float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV       float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV       int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV       int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV       float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV       float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV       int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
CONV       int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

PACK_2CH   float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH   float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH   int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH   int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH   float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH   float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH   int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
PACK_2CH   int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

UNPACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
433
434
; SSSE3 versions of the unpackers that read 16-bit input (these take the
; pshufb path in UNPACK_2CH).
INIT_XMM ssse3
UNPACK_2CH int16, int16, u, 1, 1, NOP_N,            NOP_N
UNPACK_2CH int16, int16, a, 1, 1, NOP_N,            NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
442
; SSE4 versions of the six-channel packer.
INIT_XMM sse4
PACK_6CH float, float, u, 2, 2, NOP_N,            NOP_N
PACK_6CH float, float, a, 2, 2, NOP_N,            NOP_N

PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
451
; AVX versions, only assembled when HAVE_AVX_EXTERNAL is set.
%if HAVE_AVX_EXTERNAL
; 128-bit: six-channel packer
INIT_XMM avx
PACK_6CH float, float, u, 2, 2, NOP_N,            NOP_N
PACK_6CH float, float, a, 2, 2, NOP_N,            NOP_N

PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

; 256-bit: int32 -> float
INIT_YMM avx
CONV     float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV     float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
%endif
466