1;******************************************************************************
2;* 32 point SSE-optimized DCT transform
3;* Copyright (c) 2010 Vitor Sessak
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "libavutil/x86/x86util.asm"
23
24SECTION_RODATA 32
25
; Multiplier table for the butterfly passes. Each row is 16 bytes (4 floats)
; and is addressed in the code as ps_cos_vec+<byte offset>; the offset of
; every row and the pass that reads it are noted on the right.
; Rows +96/+112, +128/+144 and +160/+176 are identical pairs: the SSE code
; reads only the first row of each pair, while the function assembled under
; INIT_YMM references the same offsets with full-width vector operands.
26align 32
27ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043 ; +0   pass 1
28            dd   0.553104,  0.582935,  0.622504,  0.674808 ; +16  pass 1
29            dd -10.190008, -3.407609, -2.057781, -1.484165 ; +32  pass 1
30            dd  -1.169440, -0.972568, -0.839350, -0.744536 ; +48  pass 1
31            dd   0.502419,  0.522499,  0.566944,  0.646822 ; +64  pass 2
32            dd   0.788155,  1.060678,  1.722447,  5.101149 ; +80  pass 2
33            dd   0.509796,  0.601345,  0.899976,  2.562916 ; +96  pass 3
34            dd   0.509796,  0.601345,  0.899976,  2.562916 ; +112 (copy of +96)
35            dd   1.000000,  1.000000,  1.306563,  0.541196 ; +128 pass 4
36            dd   1.000000,  1.000000,  1.306563,  0.541196 ; +144 (copy of +128)
37            dd   1.000000,  0.707107,  1.000000, -0.707107 ; +160 pass 5
38            dd   1.000000,  0.707107,  1.000000, -0.707107 ; +176 (copy of +160)
39            dd   0.707107,  0.707107,  0.707107,  0.707107 ; +192 BUTTERFLY3V
40
; Sign-bit mask: within each group of four dwords, dwords 2 and 3 are
; 0x80000000 and dwords 0 and 1 are 0. BUTTERFLY0 xors this into a vector,
; which negates the floats in the masked lanes and leaves the others alone.
; The pattern is stored twice (8 dwords, 32 bytes).
41align 32
42ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
43
; BUTTERFLY a, b, coef, tmp
;   tmp = a - b
;   b   = a + b
;   a   = tmp * coef
; a, b and tmp are vector registers; coef may be a register or a memory
; operand. All three instructions are written in three-operand form;
; NOTE(review): for the non-AVX builds this relies on the included x86util /
; x86inc macros accepting that form - confirm there.
44%macro BUTTERFLY 4
45    subps  %4, %1, %2
46    addps  %2, %2, %1
47    mulps  %1, %4, %3
48%endmacro
49
; BUTTERFLY0 a, mask, coef, tmp, shuf
;   a = (shuffle(a, shuf) + (a xor mask)) * coef
; mask is a sign-bit mask (see ps_p1p1m1m1), so the xor negates selected
; lanes of a before they are added to the shuffled copy.
; tmp is clobbered: it ends up holding the shuffled copy in the first branch
; and the pre-multiply sum in the second.
50%macro BUTTERFLY0 5
51%if cpuflag(sse2) && notcpuflag(avx)
; SSE2 without AVX: pshufd shuffles straight into tmp (no separate copy).
52    pshufd %4, %1, %5
53    xorps  %1, %2
54    addps  %1, %4
55    mulps  %1, %3
56%else
; Other builds (plain SSE, AVX): shufps with a as both sources.
57    shufps %4, %1, %1, %5
58    xorps  %1, %1, %2
59    addps  %4, %4, %1
60    mulps  %1, %4, %3
61%endif
62%endmacro
63
; BUTTERFLY2 a, mask, coef, tmp
; BUTTERFLY0 with shuffle immediate 0x1b, which reverses the order of the four
; dwords (3,2,1,0) within each 128-bit group.
64%macro BUTTERFLY2 4
65    BUTTERFLY0 %1, %2, %3, %4, 0x1b
66%endmacro
67
; BUTTERFLY3 a, mask, coef, tmp
; BUTTERFLY0 with shuffle immediate 0xb1, which swaps the dwords of each
; adjacent pair (1,0,3,2) within each 128-bit group.
68%macro BUTTERFLY3 4
69    BUTTERFLY0 %1, %2, %3, %4, 0xb1
70%endmacro
71
; BUTTERFLY3V a, b, c, d, tmp   (arguments are register NUMBERS, not names)
; Processes two register pairs with the all-0.707107 row at ps_cos_vec+192:
;   a = a + b        b = (a - b) * [ps_cos_vec+192]
;   c = c + d        d = (d - c) * [ps_cos_vec+192]
; Note that the two differences are taken in opposite directions.
; The SWAP renames registers at assembly time (macro from the included
; x86inc/x86util files): after it, "b" names the register holding a - b and
; "tmp" names the register that used to be b, which is then reused as scratch.
72%macro BUTTERFLY3V 5
73    movaps m%5, m%1
74    addps  m%1, m%2
75    subps  m%5, m%2
76    SWAP %2, %5
77    mulps  m%2, [ps_cos_vec+192]
78    movaps m%5, m%3
79    addps  m%3, m%4
80    subps  m%4, m%5
81    mulps  m%4, [ps_cos_vec+192]
82%endmacro
83
; PASS6_AND_PERMUTE
; Scalar final stage, used by the AVX function and (through the PASS6 alias)
; by the x86-32 build of DCT32_FUNC. It works in place on the 32 dwords at
; outq using only movss/addss, plus tmpd for plain dword copies.
;
; Entry contract (from the instruction stream below):
;   - the low scalars of m0, m1, m4, m5 and m6 must already hold live values:
;     each is accumulated into with addss before it is ever loaded here;
;   - m2, m3 and m7 are loaded from memory before their first use;
;   - the buffer at outq must already contain the previous pass's results.
; All of m0-m7 and tmpd are overwritten. [outq+0] is never accessed and
; [outq+124] is only read.
;
; The order of instructions is significant: most slots are read and later
; overwritten, so every load of a slot must stay ahead of the store that
; replaces it. Do not reorder without re-deriving the data flow.
84%macro PASS6_AND_PERMUTE 0
85    mov         tmpd, [outq+4]         ; entry value of dword 1, stored to dword 16 below
86    movss         m7, [outq+72]
87    addss         m7, [outq+76]
88    movss         m3, [outq+56]
89    addss         m3, [outq+60]
90    addss         m4, m3
91    movss         m2, [outq+52]
92    addss         m2, m3
93    movss         m3, [outq+104]
94    addss         m3, [outq+108]
95    addss         m1, m3
96    addss         m5, m4
97    movss [outq+ 16], m1
98    movss         m1, [outq+100]
99    addss         m1, m3
100    movss         m3, [outq+40]
101    movss [outq+ 48], m1
102    addss         m3, [outq+44]
103    movss         m1, [outq+100]
104    addss         m4, m3
105    addss         m3, m2
106    addss         m1, [outq+108]
107    movss [outq+ 40], m3
108    addss         m2, [outq+36]
109    movss         m3, [outq+8]
110    movss [outq+ 56], m2
111    addss         m3, [outq+12]
112    movss [outq+ 32], m3
113    movss         m3, [outq+80]
114    movss [outq+  8], m5
115    movss [outq+ 80], m1
116    movss         m2, [outq+52]
117    movss         m5, [outq+120]
118    addss         m5, [outq+124]
119    movss         m1, [outq+64]
120    addss         m2, [outq+60]
121    addss         m0, m5
122    addss         m5, [outq+116]
123    mov    [outq+64], tmpd             ; dword 16 = entry value of dword 1
124    addss         m6, m0
125    addss         m1, m6
126    mov         tmpd, [outq+12]        ; dword 24 = entry value of dword 3
127    mov   [outq+ 96], tmpd
128    movss [outq+  4], m1
129    movss         m1, [outq+24]
130    movss [outq+ 24], m4
131    movss         m4, [outq+88]
132    addss         m4, [outq+92]
133    addss         m3, m4
134    addss         m4, [outq+84]
135    mov         tmpd, [outq+108]       ; entry value of dword 27, stored to dword 28 below
136    addss         m1, [outq+28]
137    addss         m0, m1
138    addss         m1, m5
139    addss         m6, m3
140    addss         m3, m0
141    addss         m0, m7
142    addss         m5, [outq+20]
143    addss         m7, m1
144    movss [outq+ 12], m6
145    mov   [outq+112], tmpd             ; dword 28 = entry value of dword 27
146    movss         m6, [outq+28]
147    movss [outq+ 28], m0
148    movss         m0, [outq+36]
149    movss [outq+ 36], m7
150    addss         m1, m4
151    movss         m7, [outq+116]
152    addss         m0, m2
153    addss         m7, [outq+124]
154    movss [outq+ 72], m0
155    movss         m0, [outq+44]
156    addss         m2, m0
157    movss [outq+ 44], m1
158    movss [outq+ 88], m2
159    addss         m0, [outq+60]
160    mov         tmpd, [outq+60]        ; dword 30 = entry value of dword 15
161    mov   [outq+120], tmpd
162    movss [outq+104], m0
163    addss         m4, m5
164    addss         m5, [outq+68]
165    movss  [outq+52], m4
166    movss  [outq+60], m5
167    movss         m4, [outq+68]
168    movss         m5, [outq+20]
169    movss [outq+ 20], m3
170    addss         m5, m7
171    addss         m7, m6
172    addss         m4, m5
173    movss         m2, [outq+84]
174    addss         m2, [outq+92]
175    addss         m5, m2
176    movss [outq+ 68], m4
177    addss         m2, m7
178    movss         m4, [outq+76]
179    movss [outq+ 84], m2
180    movss [outq+ 76], m5
181    addss         m7, m4
182    addss         m6, [outq+124]
183    addss         m4, m6
184    addss         m6, [outq+92]
185    movss [outq+100], m4
186    movss [outq+108], m6
187    movss         m6, [outq+92]
188    movss  [outq+92], m7
189    addss         m6, [outq+124]
190    movss [outq+116], m6
191%endmacro
192
; AVX version of the 32-point float DCT (only assembled when
; HAVE_AVX_EXTERNAL is set). Passes 1-5 run on ymm registers; the result is
; stored to out and pass 6 is then finished by the scalar PASS6_AND_PERMUTE.
; cglobal arguments: 2 function arguments (out, in), 3 GPRs (tmp is the
; third), 8 vector registers.
; The vmovaps accesses with ymm operands to [inq+0], [inq+64], [outq] and
; [outq+96] require those addresses to be 32-byte aligned.
193INIT_YMM avx
194SECTION_TEXT
195%if HAVE_AVX_EXTERNAL
196; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
197cglobal dct32_float, 2,3,8, out, in, tmp
198    ; pass 1
    ; m4 = in[0..7]. m5 is assembled from two 16-byte halves (bytes 112..127
    ; in the low lane, 96..111 in the high lane) and each lane is reversed
    ; with 0x1b, giving in[31], in[30], ..., in[24].
199    vmovaps     m4, [inq+0]
200    vinsertf128 m5, m5, [inq+96], 1
201    vinsertf128 m5, m5, [inq+112], 0
202    vshufps     m5, m5, m5, 0x1b
203    BUTTERFLY   m4, m5, [ps_cos_vec], m6
204
    ; Same pattern for the middle of the input: m2 = in[16..23],
    ; m6 = in[15], in[14], ..., in[8].
205    vmovaps     m2, [inq+64]
206    vinsertf128 m6, m6, [inq+32], 1
207    vinsertf128 m6, m6, [inq+48], 0
208    vshufps     m6, m6, m6, 0x1b
209    BUTTERFLY   m2, m6, [ps_cos_vec+32], m0
210
211    ; pass 2
212
213    BUTTERFLY  m5, m6, [ps_cos_vec+64], m0
214    BUTTERFLY  m4, m2, [ps_cos_vec+64], m7
215
216
217    ; pass 3
    ; Regroup 128-bit lanes: 0x20 takes the low lane of each source, 0x31 the
    ; high lane of each source; the high-lane vector is then reversed per lane.
218    vperm2f128  m3, m6, m4, 0x31
219    vperm2f128  m1, m6, m4, 0x20
220    vshufps     m3, m3, m3, 0x1b
221
222    BUTTERFLY   m1, m3, [ps_cos_vec+96], m6
223
224
225    vperm2f128  m4, m5, m2, 0x20
226    vperm2f128  m5, m5, m2, 0x31
227    vshufps     m5, m5, m5, 0x1b
228
229    BUTTERFLY   m4, m5, [ps_cos_vec+96], m6
230
231    ; pass 4
    ; m6 = sign mask, m2 = coefficients, m7 = scratch for every BUTTERFLY2.
232    vmovaps m6, [ps_p1p1m1m1+0]
233    vmovaps m2, [ps_cos_vec+128]
234
235    BUTTERFLY2  m5, m6, m2, m7
236    BUTTERFLY2  m4, m6, m2, m7
237    BUTTERFLY2  m1, m6, m2, m7
238    BUTTERFLY2  m3, m6, m2, m7
239
240
241    ; pass 5
    ; Shuffle 0xcc turns the (0, 0, S, S) mask into (0, S, 0, S) per lane,
    ; where S is the sign bit.
242    vshufps m6, m6, m6, 0xcc
243    vmovaps m2, [ps_cos_vec+160]
244
245    BUTTERFLY3  m5, m6, m2, m7
246    BUTTERFLY3  m4, m6, m2, m7
247    BUTTERFLY3  m1, m6, m2, m7
248    BUTTERFLY3  m3, m6, m2, m7
249
    ; Store the pass-5 results. Before each full-width store, the high lane of
    ; the vector is also copied into the low lane of another register (m3 ->
    ; m6, m1 -> m0); together with the low lanes of m1, m4 and m5 these are
    ; the registers PASS6_AND_PERMUTE expects to be live on entry.
250    vperm2f128  m6, m3, m3, 0x31
251    vmovaps [outq], m3
252
253    vextractf128  [outq+64], m5, 1
254    vextractf128  [outq+32], m5, 0
255
256    vextractf128  [outq+80], m4, 1
257    vextractf128  [outq+48], m4, 0
258
259    vperm2f128  m0, m1, m1, 0x31
260    vmovaps [outq+96], m1
261
    ; Clear the upper ymm halves before the scalar tail and the return.
262    vzeroupper
263
264    ;    pass 6, no SIMD...
    ; INIT_XMM switches the m# names to xmm registers for the scalar tail.
265INIT_XMM
266    PASS6_AND_PERMUTE
267    RET
268%endif
269
270%if ARCH_X86_64
; x86-64: SPILL and UNSPILL are both aliases of SWAP, so "SPILL r, slot" and
; "UNSPILL r, slot" only exchange register names m<r> <-> m<slot> (slots are
; 8..15) and no memory is touched.
271%define SPILL SWAP
272%define UNSPILL SWAP
273
; PASS5 (x86-64 build)
; Renames registers so that the eight pass-4 vectors sit in m8-m15, then
; handles them as two groups of four (m8-m11 and m12-m15): each group goes
; through TRANSPOSE4x4PS, then BUTTERFLY3V, then additions between
; registers. m0 is passed as the scratch register number to both helpers.
; NOTE(review): SWAP, PERMUTE and TRANSPOSE4x4PS come from the included
; x86inc/x86util files; their exact renaming semantics are not visible here.
274%macro PASS5 0
275    nop ; FIXME code alignment
276    SWAP 5, 8
277    SWAP 4, 12
278    SWAP 6, 14
279    SWAP 7, 13
280    SWAP 0, 15
281    PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
282    TRANSPOSE4x4PS 8, 9, 10, 11, 0
283    BUTTERFLY3V    8, 9, 10, 11, 0
284    addps   m10, m11
285    TRANSPOSE4x4PS 12, 13, 14, 15, 0
286    BUTTERFLY3V    12, 13, 14, 15, 0
287    addps   m14, m15
288    addps   m12, m14
289    addps   m14, m13
290    addps   m13, m15
291%endmacro
292
; PASS6 (x86-64 build)
; Writes the final output from the eight vectors in m8-m15, in three steps:
;   1. dword 0 of m8..m14 is stored to out+0x00, 0x10, ..., 0x60, and all
;      four dwords of m15 are stored to out+0x70 (movaps); dword 1 of each
;      vector is moved to lane 0 of m0..m7 (pshuflw 0xe moves words 2,3 to
;      words 0,1). Adjacent values are summed and stored to out+0x08,
;      0x18, ..., 0x68; m7 is stored unsummed to out+0x78.
;   2. After the PERMUTE, movhlps/pshufd extract dwords 2 and 3 of each
;      vector into lane 0 of a register pair; in the %rep 7 part dword 3 is
;      also accumulated into m15.
;   3. The %rep 15 loop adds neighbouring registers and stores the sums to
;      out+4, out+12, ..., out+116 (i starts at 4 and advances by 8).
; NOTE(review): the multi-argument SWAP lines re-map register names so that
; the same textual m0/m1 address different registers on each repetition;
; this relies on x86inc's SWAP/PERMUTE semantics - confirm there.
293%macro PASS6 0
294    SWAP 9, 12
295    SWAP 11, 14
296    movss [outq+0x00], m8
297    pshuflw m0, m8, 0xe
298    movss [outq+0x10], m9
299    pshuflw m1, m9, 0xe
300    movss [outq+0x20], m10
301    pshuflw m2, m10, 0xe
302    movss [outq+0x30], m11
303    pshuflw m3, m11, 0xe
304    movss [outq+0x40], m12
305    pshuflw m4, m12, 0xe
306    movss [outq+0x50], m13
307    pshuflw m5, m13, 0xe
308    movss [outq+0x60], m14
309    pshuflw m6, m14, 0xe
310    movaps [outq+0x70], m15            ; full 16-byte store, unlike the movss above
311    pshuflw m7, m15, 0xe
312    addss   m0, m1
313    addss   m1, m2
314    movss [outq+0x08], m0
315    addss   m2, m3
316    movss [outq+0x18], m1
317    addss   m3, m4
318    movss [outq+0x28], m2
319    addss   m4, m5
320    movss [outq+0x38], m3
321    addss   m5, m6
322    movss [outq+0x48], m4
323    addss   m6, m7
324    movss [outq+0x58], m5
325    movss [outq+0x68], m6
326    movss [outq+0x78], m7
327
328    PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
    ; movhlps: low 64 bits of m0 = high 64 bits of m1 (lane 0 = dword 2).
    ; pshufd imm 3: lane 0 of m1 = its own dword 3.
329    movhlps m0, m1
330    pshufd  m1, m1, 3
331    SWAP 0, 2, 4, 6, 8, 10, 12, 14
332    SWAP 1, 3, 5, 7, 9, 11, 13, 15
333%rep 7
334    movhlps m0, m1
335    pshufd  m1, m1, 3
336    addss   m15, m1
337    SWAP 0, 2, 4, 6, 8, 10, 12, 14
338    SWAP 1, 3, 5, 7, 9, 11, 13, 15
339%endrep
340%assign i 4
341%rep 15
342    addss m0, m1
343    movss [outq+i], m0
344    SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
345    %assign i i+8
346%endrep
347%endmacro
348
349%else ; ARCH_X86_32
; x86-32: there are no m8-m15, so "slots" 8..15 live in the output buffer.
; SPILL stores register m<xmm#> to 16-byte row (mempos-8) of out, i.e. slot 8
; is [outq+0] and slot 15 is [outq+112]. The register keeps its value.
; movaps requires outq to be 16-byte aligned.
350%macro SPILL 2 ; xmm#, mempos
351    movaps [outq+(%2-8)*16], m%1
352%endmacro
; x86-32 counterpart of SPILL: UNSPILL xmm#, mempos reloads m<xmm#> from
; 16-byte row (mempos-8) of the output buffer (slot 8 = [outq+0]).
353%macro UNSPILL 2
354    movaps m%1, [outq+(%2-8)*16]
355%endmacro
356
; x86-32: the last pass is the scalar PASS6_AND_PERMUTE macro, which works on
; the output buffer that the x86-32 SPILL stores have filled.
357%define PASS6 PASS6_AND_PERMUTE
; PASS5 (x86-32 build)
; Applies BUTTERFLY3 to each of the eight pass-4 vectors and SPILLs every
; result to its slot, so that all eight 16-byte rows of out (slots 8..15) are
; written by the time the macro ends.
;   m2 = coefficients from ps_cos_vec+160
;   m3 = sign mask; on entry it holds the pass-4 mask loaded by DCT32_FUNC,
;        and shufps 0xcc turns (0, 0, S, S) into (0, S, 0, S), S = sign bit
; Vectors that were spilled during earlier passes (slots 9, 10, 11) are
; reloaded with UNSPILL first. Because the x86-32 SPILL is only a store, the
; registers still hold their values afterwards.
358%macro PASS5 0
359    movaps      m2, [ps_cos_vec+160]
360    shufps      m3, m3, 0xcc
361
362    BUTTERFLY3  m5, m3, m2, m1
363    SPILL 5, 8
364
365    UNSPILL 1, 9
366    BUTTERFLY3  m1, m3, m2, m5
367    SPILL 1, 14
368
369    BUTTERFLY3  m4, m3, m2, m5
370    SPILL 4, 12
371
372    BUTTERFLY3  m7, m3, m2, m5
373    SPILL 7, 13
374
375    UNSPILL 5, 10
376    BUTTERFLY3  m5, m3, m2, m7
377    SPILL 5, 10
378
379    UNSPILL 4, 11
380    BUTTERFLY3  m4, m3, m2, m7
381    SPILL 4, 11
382
383    BUTTERFLY3  m6, m3, m2, m7
384    SPILL 6, 9
385
386    BUTTERFLY3  m0, m3, m2, m7
387    SPILL 0, 15
388%endmacro
389%endif
390
391
392; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
; DCT32_FUNC emits one dct32_float function for the instruction set selected
; by the preceding INIT_XMM (it is instantiated for sse and sse2 at the end
; of the file). cglobal arguments: 2 function arguments (out, in), 3 GPRs
; (tmp is the third), 16 vector registers requested.
; Both in and out are accessed with movaps, so they must be 16-byte aligned.
; Passes 1-4 are written once here using m0-m7 plus SPILL/UNSPILL "slots"
; 8..15; those slots are renamed registers on x86-64 and rows of the out
; buffer on x86-32. PASS5 and PASS6 are architecture-specific macros.
; Passes 1 and 2 are interleaved (see the repeated "pass 1"/"pass 2" labels).
393%macro DCT32_FUNC 0
394cglobal dct32_float, 2, 3, 16, out, in, tmp
395    ; pass 1
396
    ; LOAD_INV loads four floats in reversed order, so each BUTTERFLY pairs
    ; in[i] with its mirror element from the other end of the input.
397    movaps      m0, [inq+0]
398    LOAD_INV    m1, [inq+112]
399    BUTTERFLY   m0, m1, [ps_cos_vec], m3
400
401    movaps      m7, [inq+64]
402    LOAD_INV    m4, [inq+48]
403    BUTTERFLY   m7, m4, [ps_cos_vec+32], m3
404
405    ; pass 2
406    movaps      m2, [ps_cos_vec+64]
407    BUTTERFLY   m1, m4, m2, m3
408    SPILL 1, 11
409    SPILL 4, 8
410
411    ; pass 1
412    movaps      m1, [inq+16]
413    LOAD_INV    m6, [inq+96]
414    BUTTERFLY   m1, m6, [ps_cos_vec+16], m3
415
416    movaps      m4, [inq+80]
417    LOAD_INV    m5, [inq+32]
418    BUTTERFLY   m4, m5, [ps_cos_vec+48], m3
419
420    ; pass 2
    ; m2 still holds ps_cos_vec+64 for the first butterfly below.
421    BUTTERFLY   m0, m7, m2, m3
422
423    movaps      m2, [ps_cos_vec+80]
424    BUTTERFLY   m6, m5, m2, m3
425
426    BUTTERFLY   m1, m4, m2, m3
427
428    ; pass 3
    ; One operand of each butterfly is reversed (shufps 0x1b) first.
429    movaps      m2, [ps_cos_vec+96]
430    shufps      m1, m1, 0x1b
431    BUTTERFLY   m0, m1, m2, m3
432    SPILL 0, 15
433    SPILL 1, 14
434
435    UNSPILL 0, 8
436    shufps      m5, m5, 0x1b
437    BUTTERFLY   m0, m5, m2, m3
438
439    UNSPILL 1, 11
440    shufps      m6, m6, 0x1b
441    BUTTERFLY   m1, m6, m2, m3
442    SPILL 1, 11
443
444    shufps      m4, m4, 0x1b
445    BUTTERFLY   m7, m4, m2, m3
446
447    ; pass 4
    ; m3 = sign mask, m2 = coefficients, m1 = scratch for every BUTTERFLY2.
448    movaps      m3, [ps_p1p1m1m1+0]
449    movaps      m2, [ps_cos_vec+128]
450
451    BUTTERFLY2  m5, m3, m2, m1
452
453    BUTTERFLY2  m0, m3, m2, m1
454    SPILL 0, 9
455
456    BUTTERFLY2  m6, m3, m2, m1
457    SPILL 6, 10
458
459    UNSPILL 0, 11
460    BUTTERFLY2  m0, m3, m2, m1
461    SPILL 0, 11
462
463    BUTTERFLY2  m4, m3, m2, m1
464
465    BUTTERFLY2  m7, m3, m2, m1
466
467    UNSPILL 6, 14
468    BUTTERFLY2  m6, m3, m2, m1
469
470    UNSPILL 0, 15
471    BUTTERFLY2  m0, m3, m2, m1
472
    ; Architecture-specific tail (see the ARCH_X86_64 / x86-32 definitions).
473    PASS5
474    PASS6
475    RET
476%endmacro
477
; LOAD_INV dst, src
; Loads four dwords from src into dst in reversed order (shuffle 0x1b gives
; 3,2,1,0). With SSE2 this is one pshufd with src as its source operand;
; with plain SSE it is a movaps followed by an in-register shufps.
; If neither cpuflag is set the macro emits nothing.
; It is defined after DCT32_FUNC but before DCT32_FUNC is first expanded.
478%macro LOAD_INV 2
479%if cpuflag(sse2)
480    pshufd      %1, %2, 0x1b
481%elif cpuflag(sse)
482    movaps      %1, %2
483    shufps      %1, %1, 0x1b
484%endif
485%endmacro
486
; Instantiate the function. The plain-SSE variant is emitted only for x86-32;
; the SSE2 variant is emitted for both architectures. The x86-64 PASS6 macro
; uses pshuflw and pshufd, which are SSE2 instructions.
487%if ARCH_X86_32
488INIT_XMM sse
489DCT32_FUNC
490%endif
491INIT_XMM sse2
492DCT32_FUNC
493