1;******************************************************************************
2;* 32 point SSE-optimized DCT transform
3;* Copyright (c) 2010 Vitor Sessak
4;*
5;* This file is part of Libav.
6;*
7;* Libav is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* Libav is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with Libav; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "x86inc.asm"
23%include "x86util.asm"
24
25SECTION_RODATA 32
26
; Multiplier table for the butterfly passes, addressed by byte offset:
;   +0, +16, +32, +48 : pass 1 (16 factors)
;   +64, +80          : pass 2 (8 factors)
;   +96, +128, +160   : passes 3, 4 and 5. Each 4-float row is stored twice in
;                       a row; the AVX code loads both copies as one 32-byte
;                       vector, the SSE code loads only the first 16 bytes.
;   +192              : 0.707107 in all four lanes, used by BUTTERFLY3V
27align 32
28ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043
29            dd   0.553104,  0.582935,  0.622504,  0.674808
30            dd -10.190008, -3.407609, -2.057781, -1.484165
31            dd  -1.169440, -0.972568, -0.839350, -0.744536
32            dd   0.502419,  0.522499,  0.566944,  0.646822
33            dd   0.788155,  1.060678,  1.722447,  5.101149
34            dd   0.509796,  0.601345,  0.899976,  2.562916
35            dd   0.509796,  0.601345,  0.899976,  2.562916
36            dd   1.000000,  1.000000,  1.306563,  0.541196
37            dd   1.000000,  1.000000,  1.306563,  0.541196
38            dd   1.000000,  0.707107,  1.000000, -0.707107
39            dd   1.000000,  0.707107,  1.000000, -0.707107
40            dd   0.707107,  0.707107,  0.707107,  0.707107
41
; Sign-bit mask (+1, +1, -1, -1 pattern, repeated for both 128-bit halves).
; XORing a float vector with it flips the sign of elements 2 and 3 of each
; half and leaves elements 0 and 1 untouched. It is passed as the mask operand
; of the BUTTERFLY0 macros.
42align 32
43ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
44
; BUTTERFLY_SSE a, b, cos, tmp  -- two-operand (legacy SSE) butterfly
;   a   (arg 1, in/out): on exit holds (a - b) * cos
;   b   (arg 2, in/out): on exit holds a + b
;   cos (arg 3, in)    : multiplier, register or memory operand
;   tmp (arg 4)        : scratch register; left holding the original a
45%macro BUTTERFLY_SSE 4
    ; keep a copy of a, because subps overwrites it before the sum is formed
46    movaps %4, %1
47    subps  %1, %2
48    addps  %2, %4
49    mulps  %1, %3
50%endmacro
51
; BUTTERFLY_AVX a, b, cos, tmp  -- three-operand (VEX) butterfly
;   a   (arg 1, in/out): on exit holds (a - b) * cos
;   b   (arg 2, in/out): on exit holds a + b
;   cos (arg 3, in)    : multiplier, register or memory operand
;   tmp (arg 4)        : scratch register; left holding a - b
; Unlike the SSE form no register copy is needed: the difference is written
; straight into tmp while a is still intact for the addition.
52%macro BUTTERFLY_AVX 4
53    vsubps  %4, %1, %2
54    vaddps  %2, %2, %1
55    vmulps  %1, %4, %3
56%endmacro
57
; BUTTERFLY0_SSE a, mask, cos, tmp, shuf  -- in-register butterfly (SSE1 only)
;   a    (arg 1, in/out): on exit holds (shuffle(a, shuf) + (a XOR mask)) * cos
;   mask (arg 2, in)    : XORed into the float bit patterns; a sign-bit mask
;                         therefore negates the selected elements
;   cos  (arg 3, in)    : multiplier
;   tmp  (arg 4)        : scratch register, clobbered
;   shuf (arg 5, in)    : shufps immediate selecting the element permutation
58%macro BUTTERFLY0_SSE 5
    ; tmp keeps the unshuffled copy that receives the sign flips
59    movaps %4, %1
60    shufps %1, %1, %5
61    xorps  %4, %2
62    addps  %1, %4
63    mulps  %1, %3
64%endmacro
65
; BUTTERFLY0_SSE2 a, mask, cos, tmp, shuf  -- same result as BUTTERFLY0_SSE
;   a    (arg 1, in/out): on exit holds (shuffle(a, shuf) + (a XOR mask)) * cos
;   mask (arg 2, in)    : XOR mask applied to the unshuffled copy
;   cos  (arg 3, in)    : multiplier
;   tmp  (arg 4)        : scratch register, clobbered (receives the shuffled copy)
;   shuf (arg 5, in)    : pshufd immediate selecting the element permutation
; pshufd shuffles into a separate destination, which saves the movaps copy
; that the SSE1 variant needs.
66%macro BUTTERFLY0_SSE2 5
67    pshufd %4, %1, %5
68    xorps  %1, %2
69    addps  %1, %4
70    mulps  %1, %3
71%endmacro
72
; BUTTERFLY0_AVX a, mask, cos, tmp, shuf  -- VEX-encoded in-register butterfly
;   a    (arg 1, in/out): on exit holds (shuffle(a, shuf) + (a XOR mask)) * cos
;   mask (arg 2, in)    : XOR mask applied to the unshuffled copy
;   cos  (arg 3, in)    : multiplier
;   tmp  (arg 4)        : scratch register, clobbered (left holding the sum
;                         before the multiplication)
;   shuf (arg 5, in)    : vshufps immediate selecting the element permutation
73%macro BUTTERFLY0_AVX 5
74    vshufps %4, %1, %1, %5
75    vxorps  %1, %1, %2
76    vaddps  %4, %4, %1
77    vmulps  %1, %4, %3
78%endmacro
79
; BUTTERFLY2 a, mask, cos, tmp
; In-register butterfly with shuffle immediate 0x1b, which reverses the order
; of the four floats (element 0 <-> 3, element 1 <-> 2).
; BUTTERFLY0 resolves to whichever of BUTTERFLY0_SSE / _SSE2 / _AVX is
; %defined at the point where this macro is expanded.
80%macro BUTTERFLY2 4
81    BUTTERFLY0 %1, %2, %3, %4, 0x1b
82%endmacro
83
; BUTTERFLY3 a, mask, cos, tmp
; In-register butterfly with shuffle immediate 0xb1, which swaps the floats
; inside each adjacent pair (element 0 <-> 1, element 2 <-> 3).
; BUTTERFLY0 resolves to whichever of BUTTERFLY0_SSE / _SSE2 / _AVX is
; %defined at the point where this macro is expanded.
84%macro BUTTERFLY3 4
85    BUTTERFLY0 %1, %2, %3, %4, 0xb1
86%endmacro
87
; BUTTERFLY3V a, b, c, d, tmp  -- two butterflies across whole registers.
; All five arguments are register NUMBERS (used as m<number>), not names.
; On exit, in terms of the register names as they stand after the macro:
;   a   (arg 1) = a + b
;   b   (arg 2) = (a - b) * 0.707107
;   c   (arg 3) = c + d
;   d   (arg 4) = (d - c) * 0.707107
;   tmp (arg 5) = scratch, clobbered
; The multiplier is the broadcast row at ps_cos_vec+192.
88%macro BUTTERFLY3V 5
89    movaps m%5, m%1
90    addps  m%1, m%2
91    subps  m%5, m%2
    ; the difference sits in the scratch register; SWAP (from the included
    ; x86inc.asm) exchanges the two register names so that from here on the
    ; second argument names the difference and the fifth names the old b
92    SWAP %2, %5
93    mulps  m%2, [ps_cos_vec+192]
94    movaps m%5, m%3
95    addps  m%3, m%4
    ; note the operand order: this computes d - c, not c - d
96    subps  m%4, m%5
97    mulps  m%4, [ps_cos_vec+192]
98%endmacro
99
; PASS6_AND_PERMUTE -- last pass of the transform, done with scalar SSE.
;
; Works in place on the 32 floats at [outq] (byte offsets 0..124), which must
; already contain the results of the previous pass. In addition the low floats
; of m0, m1, m4, m5 and m6 are read before they are written, so they must be
; live on entry; m2, m3 and m7 are loaded from memory first. All of m0-m7 and
; tmpd are clobbered.
;
; Elements that only move to a new position are copied through the integer
; register tmpd; every other output is a sum formed with addss.
; The loads, adds and stores are interleaved and many slots are read again
; after nearby slots have been overwritten, so the instruction order must not
; be changed.
100%macro PASS6_AND_PERMUTE 0
    ; old [outq+4] is parked in tmpd until it is stored to [outq+64] below
101    mov         tmpd, [outq+4]
102    movss         m7, [outq+72]
103    addss         m7, [outq+76]
104    movss         m3, [outq+56]
105    addss         m3, [outq+60]
106    addss         m4, m3
107    movss         m2, [outq+52]
108    addss         m2, m3
109    movss         m3, [outq+104]
110    addss         m3, [outq+108]
111    addss         m1, m3
112    addss         m5, m4
113    movss [outq+ 16], m1
114    movss         m1, [outq+100]
115    addss         m1, m3
116    movss         m3, [outq+40]
117    movss [outq+ 48], m1
118    addss         m3, [outq+44]
119    movss         m1, [outq+100]
120    addss         m4, m3
121    addss         m3, m2
122    addss         m1, [outq+108]
123    movss [outq+ 40], m3
124    addss         m2, [outq+36]
125    movss         m3, [outq+8]
126    movss [outq+ 56], m2
127    addss         m3, [outq+12]
128    movss [outq+ 32], m3
129    movss         m3, [outq+80]
130    movss [outq+  8], m5
131    movss [outq+ 80], m1
132    movss         m2, [outq+52]
133    movss         m5, [outq+120]
134    addss         m5, [outq+124]
135    movss         m1, [outq+64]
136    addss         m2, [outq+60]
137    addss         m0, m5
138    addss         m5, [outq+116]
139    mov    [outq+64], tmpd
140    addss         m6, m0
141    addss         m1, m6
    ; plain relocation: old [outq+12] becomes [outq+96]
142    mov         tmpd, [outq+12]
143    mov   [outq+ 96], tmpd
144    movss [outq+  4], m1
145    movss         m1, [outq+24]
146    movss [outq+ 24], m4
147    movss         m4, [outq+88]
148    addss         m4, [outq+92]
149    addss         m3, m4
150    addss         m4, [outq+84]
    ; old [outq+108] is parked in tmpd until it is stored to [outq+112] below
151    mov         tmpd, [outq+108]
152    addss         m1, [outq+28]
153    addss         m0, m1
154    addss         m1, m5
155    addss         m6, m3
156    addss         m3, m0
157    addss         m0, m7
158    addss         m5, [outq+20]
159    addss         m7, m1
160    movss [outq+ 12], m6
161    mov   [outq+112], tmpd
162    movss         m6, [outq+28]
163    movss [outq+ 28], m0
164    movss         m0, [outq+36]
165    movss [outq+ 36], m7
166    addss         m1, m4
167    movss         m7, [outq+116]
168    addss         m0, m2
169    addss         m7, [outq+124]
170    movss [outq+ 72], m0
171    movss         m0, [outq+44]
172    addss         m2, m0
173    movss [outq+ 44], m1
174    movss [outq+ 88], m2
175    addss         m0, [outq+60]
    ; plain relocation: old [outq+60] becomes [outq+120]
176    mov         tmpd, [outq+60]
177    mov   [outq+120], tmpd
178    movss [outq+104], m0
179    addss         m4, m5
180    addss         m5, [outq+68]
181    movss  [outq+52], m4
182    movss  [outq+60], m5
183    movss         m4, [outq+68]
184    movss         m5, [outq+20]
185    movss [outq+ 20], m3
186    addss         m5, m7
187    addss         m7, m6
188    addss         m4, m5
189    movss         m2, [outq+84]
190    addss         m2, [outq+92]
191    addss         m5, m2
192    movss [outq+ 68], m4
193    addss         m2, m7
194    movss         m4, [outq+76]
195    movss [outq+ 84], m2
196    movss [outq+ 76], m5
197    addss         m7, m4
198    addss         m6, [outq+124]
199    addss         m4, m6
200    addss         m6, [outq+92]
201    movss [outq+100], m4
202    movss [outq+108], m6
203    movss         m6, [outq+92]
204    movss  [outq+92], m7
205    addss         m6, [outq+124]
206    movss [outq+116], m6
207%endmacro
208
; Bind the generic butterfly names to their VEX-encoded implementations for
; the AVX function that follows. BUTTERFLY2/BUTTERFLY3 pick this up through
; BUTTERFLY0.
209%define BUTTERFLY  BUTTERFLY_AVX
210%define BUTTERFLY0 BUTTERFLY0_AVX
211
; INIT_YMM and SECTION_TEXT come from the included x86inc.asm: the first makes
; the mN register aliases refer to 256-bit ymm registers, the second switches
; to the code section.
212INIT_YMM
213SECTION_TEXT
; AVX implementation, only assembled when HAVE_AVX is defined.
; Passes 1-5 hold eight floats per ymm register; the final pass is the shared
; scalar macro PASS6_AND_PERMUTE. The aligned moves (vmovaps) require in and
; out to be 32-byte aligned.
214%ifdef HAVE_AVX
215; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
216cglobal dct32_float_avx, 2,3,8, out, in, tmp
217    ; pass 1
    ; m4 = in[0..7]; m5 = in[31..24]: the two 128-bit halves are inserted in
    ; swapped order, then vshufps 0x1b reverses the floats inside each half.
    ; m5 needs no initialisation because both halves are overwritten.
218    vmovaps     m4, [inq+0]
219    vinsertf128 m5, m5, [inq+96], 1
220    vinsertf128 m5, m5, [inq+112], 0
221    vshufps     m5, m5, m5, 0x1b
    ; m4 = (in[i] - in[31-i]) * cos, m5 = in[i] + in[31-i], i = 0..7
222    BUTTERFLY   m4, m5, [ps_cos_vec], m6
223
    ; m2 = in[16..23]; m6 = in[15..8], built the same way as m5 above
224    vmovaps     m2, [inq+64]
225    vinsertf128 m6, m6, [inq+32], 1
226    vinsertf128 m6, m6, [inq+48], 0
227    vshufps     m6, m6, m6, 0x1b
    ; m2 = (in[16+i] - in[15-i]) * cos, m6 = in[16+i] + in[15-i]; the factors
    ; at ps_cos_vec+32 are negative
228    BUTTERFLY   m2, m6, [ps_cos_vec+32], m0
229
230    ; pass 2
231
232    BUTTERFLY  m5, m6, [ps_cos_vec+64], m0
233    BUTTERFLY  m4, m2, [ps_cos_vec+64], m7
234
235
236    ; pass 3
    ; regroup 128-bit halves: m1 = low halves of m6 and m4, m3 = their high
    ; halves with the floats reversed inside each half
237    vperm2f128  m3, m6, m4, 0x31
238    vperm2f128  m1, m6, m4, 0x20
239    vshufps     m3, m3, m3, 0x1b
240
241    BUTTERFLY   m1, m3, [ps_cos_vec+96], m6
242
243
    ; same regrouping for m5 and m2
244    vperm2f128  m4, m5, m2, 0x20
245    vperm2f128  m5, m5, m2, 0x31
246    vshufps     m5, m5, m5, 0x1b
247
248    BUTTERFLY   m4, m5, [ps_cos_vec+96], m6
249
250    ; pass 4
    ; m6 = sign mask negating elements 2 and 3 of each half, m2 = multipliers
251    vmovaps m6, [ps_p1p1m1m1+0]
252    vmovaps m2, [ps_cos_vec+128]
253
254    BUTTERFLY2  m5, m6, m2, m7
255    BUTTERFLY2  m4, m6, m2, m7
256    BUTTERFLY2  m1, m6, m2, m7
257    BUTTERFLY2  m3, m6, m2, m7
258
259
260    ; pass 5
    ; shuffle 0xcc picks elements 0,3,0,3, turning the mask into one that
    ; negates elements 1 and 3 of each half
261    vshufps m6, m6, m6, 0xcc
262    vmovaps m2, [ps_cos_vec+160]
263
264    BUTTERFLY3  m5, m6, m2, m7
265    BUTTERFLY3  m4, m6, m2, m7
266    BUTTERFLY3  m1, m6, m2, m7
267    BUTTERFLY3  m3, m6, m2, m7
268
    ; Store the pass-5 results in the layout the scalar pass expects, and keep
    ; the upper halves of m3 and m1 in the low halves of m6 and m0, because
    ; PASS6_AND_PERMUTE reads m0 and m6 (as xmm) before writing them.
269    vperm2f128  m6, m3, m3, 0x31
270    vmovaps [outq], m3
271
272    vextractf128  [outq+64], m5, 1
273    vextractf128  [outq+32], m5, 0
274
275    vextractf128  [outq+80], m4, 1
276    vextractf128  [outq+48], m4, 0
277
278    vperm2f128  m0, m1, m1, 0x31
279    vmovaps [outq+96], m1
280
    ; clear the upper ymm halves before the non-VEX scalar code below
281    vzeroupper
282
283    ;    pass 6, no SIMD...
    ; switch the mN aliases back to xmm registers for the scalar macro
284INIT_XMM
285    PASS6_AND_PERMUTE
286    RET
287%endif
288
; From here on the generic butterfly names expand to the legacy-SSE forms.
; BUTTERFLY0 is redefined once more further down, just before the sse2
; instantiation.
289%define BUTTERFLY  BUTTERFLY_SSE
290%define BUTTERFLY0 BUTTERFLY0_SSE
291
; On x86-64, SPILL and UNSPILL are both plain SWAPs: with 16 xmm registers
; available a "spill slot" is simply one of m8-m15, and spilling only renames
; registers at assembly time instead of touching memory.
292%ifdef ARCH_X86_64
293%define SPILL SWAP
294%define UNSPILL SWAP
295
; PASS5 (x86-64 variant) -- fifth pass, performed entirely in registers.
; The first five SWAPs move the pass-4 results that are still in m0-m7 into
; m8-m15, next to the values that SPILL already placed there.
; NOTE(review): the transposes appear to rearrange the data so that the
; butterflies which the x86-32 variant performs inside one register
; (BUTTERFLY3) can be done between whole registers (BUTTERFLY3V); the addps
; lines after each BUTTERFLY3V appear to be additions belonging to the final
; pass, done early in vector form -- confirm against the x86-32 path.
; PERMUTE and TRANSPOSE4x4PS are provided by the included x86inc.asm /
; x86util.asm.
296%macro PASS5 0
297    nop ; FIXME code alignment
298    SWAP 5, 8
299    SWAP 4, 12
300    SWAP 6, 14
301    SWAP 7, 13
302    SWAP 0, 15
303    PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
    ; first group of four registers, with m0 as scratch
304    TRANSPOSE4x4PS 8, 9, 10, 11, 0
305    BUTTERFLY3V    8, 9, 10, 11, 0
306    addps   m10, m11
    ; second group of four registers, with m0 as scratch
307    TRANSPOSE4x4PS 12, 13, 14, 15, 0
308    BUTTERFLY3V    12, 13, 14, 15, 0
309    addps   m14, m15
310    addps   m12, m14
311    addps   m14, m13
312    addps   m13, m15
313%endmacro
314
; PASS6 (x86-64 variant) -- final pass and output stores, reading m8-m15 as
; left by the x86-64 PASS5.
; Uses pshuflw, an SSE2 instruction, even in the function assembled as
; dct32_float_sse; this macro is only assembled when ARCH_X86_64 is defined.
; Three stages:
;   1. element 0 of m8..m14 is stored at out+0x00, +0x10, ..., +0x60 and all
;      of m15 at out+0x70; element 1 of each is copied into m0..m7;
;   2. sums of neighbouring element-1 values are stored at out+0x08, +0x18,
;      ..., +0x68 and element 1 of m15 at out+0x78;
;   3. the upper two elements are extracted and summed pairwise into the
;      remaining slots out+4, out+12, ..., out+116.
315%macro PASS6 0
316    SWAP 9, 12
317    SWAP 11, 14
    ; pshuflw with 0xe moves float 1 of the source into float 0 of the
    ; destination (the upper two floats are copied unchanged)
318    movss [outq+0x00], m8
319    pshuflw m0, m8, 0xe
320    movss [outq+0x10], m9
321    pshuflw m1, m9, 0xe
322    movss [outq+0x20], m10
323    pshuflw m2, m10, 0xe
324    movss [outq+0x30], m11
325    pshuflw m3, m11, 0xe
326    movss [outq+0x40], m12
327    pshuflw m4, m12, 0xe
328    movss [outq+0x50], m13
329    pshuflw m5, m13, 0xe
330    movss [outq+0x60], m14
331    pshuflw m6, m14, 0xe
    ; full 16-byte store: this also writes out+0x74..0x7f; out+0x78 is
    ; overwritten below, out+0x7c is not written again in this macro
332    movaps [outq+0x70], m15
333    pshuflw m7, m15, 0xe
    ; chain of neighbour sums, interleaved with the stores
334    addss   m0, m1
335    addss   m1, m2
336    movss [outq+0x08], m0
337    addss   m2, m3
338    movss [outq+0x18], m1
339    addss   m3, m4
340    movss [outq+0x28], m2
341    addss   m4, m5
342    movss [outq+0x38], m3
343    addss   m5, m6
344    movss [outq+0x48], m4
345    addss   m6, m7
346    movss [outq+0x58], m5
347    movss [outq+0x68], m6
348    movss [outq+0x78], m7
349
    ; Register renaming only (no instructions emitted by PERMUTE/SWAP): the
    ; SWAP pairs rotate the register names so the same two-instruction body
    ; can be applied to each source register in turn.
350    PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
    ; movhlps brings float 2 down to float 0 of the destination; pshufd with
    ; 3 brings float 3 down to float 0
351    movhlps m0, m1
352    pshufd  m1, m1, 3
353    SWAP 0, 2, 4, 6, 8, 10, 12, 14
354    SWAP 1, 3, 5, 7, 9, 11, 13, 15
355%rep 7
356    movhlps m0, m1
357    pshufd  m1, m1, 3
358    addss   m15, m1
359    SWAP 0, 2, 4, 6, 8, 10, 12, 14
360    SWAP 1, 3, 5, 7, 9, 11, 13, 15
361%endrep
    ; 15 pairwise sums stored at byte offsets 4, 12, ..., 116; the SWAP
    ; rotates all sixteen names by one so m0/m1 name the next pair each time
362%assign i 4
363%rep 15
364    addss m0, m1
365    movss [outq+i], m0
366    SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
367    %assign i i+8
368%endrep
369%endmacro
370
; x86-32 has too few xmm registers, so values are spilled to memory. The
; output buffer itself is the scratch area: slot numbers 8..15 map to the
; eight 16-byte rows out+0, out+16, ..., out+112. The numbering matches the
; register numbers m8..m15 that the x86-64 build uses for the same slots.
371%else ; ARCH_X86_32
; SPILL reg, slot: store m<reg> into spill slot <slot> (aligned 16-byte store)
372%macro SPILL 2 ; xmm#, mempos
373    movaps [outq+(%2-8)*16], m%1
374%endmacro
; UNSPILL reg, slot: reload m<reg> from spill slot <slot>
375%macro UNSPILL 2
376    movaps m%1, [outq+(%2-8)*16]
377%endmacro
378
; x86-32: the final pass is the shared scalar macro that is also used by the
; AVX function.
379%define PASS6 PASS6_AND_PERMUTE
; PASS5 (x86-32 variant) -- fifth pass: one BUTTERFLY3 per 4-float row, with
; every result written to its slot in the output buffer through SPILL.
; On entry m3 must still hold the pass-4 sign mask; m2 is reloaded here.
; The registers are not reloaded at the end, so the values in m0-m7 remain
; live for PASS6.
380%macro PASS5 0
381    movaps      m2, [ps_cos_vec+160]
    ; shuffle 0xcc picks elements 0,3,0,3, turning the mask into one that
    ; negates elements 1 and 3
382    shufps      m3, m3, 0xcc
383
384    BUTTERFLY3  m5, m3, m2, m1
385    SPILL 5, 8
386
387    UNSPILL 1, 9
388    BUTTERFLY3  m1, m3, m2, m5
389    SPILL 1, 14
390
391    BUTTERFLY3  m4, m3, m2, m5
392    SPILL 4, 12
393
394    BUTTERFLY3  m7, m3, m2, m5
395    SPILL 7, 13
396
397    UNSPILL 5, 10
398    BUTTERFLY3  m5, m3, m2, m7
399    SPILL 5, 10
400
401    UNSPILL 4, 11
402    BUTTERFLY3  m4, m3, m2, m7
403    SPILL 4, 11
404
405    BUTTERFLY3  m6, m3, m2, m7
406    SPILL 6, 9
407
408    BUTTERFLY3  m0, m3, m2, m7
409    SPILL 0, 15
410%endmacro
411%endif
412
413
; DCT32_FUNC suffix -- emits the function dct32_float_<suffix>.
; The body is written against the generic names LOAD_INV, BUTTERFLY,
; BUTTERFLY0 (through BUTTERFLY2/3), SPILL, UNSPILL, PASS5 and PASS6. Whatever
; those are defined as at the point of instantiation decides the instruction
; set and the x86-32 / x86-64 register strategy.
; The aligned moves (movaps) require in and out to be 16-byte aligned.
; Pass 1 and pass 2 are interleaved: the first half of the input goes through
; both passes and is spilled before the second half is loaded.
414INIT_XMM
415%macro DCT32_FUNC 1
416; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
417cglobal dct32_float_%1, 2,3,16, out, in, tmp
418    ; pass 1
419
    ; LOAD_INV loads four floats in reversed order, pairing in[i] with in[31-i]
420    movaps      m0, [inq+0]
421    LOAD_INV    m1, [inq+112]
422    BUTTERFLY   m0, m1, [ps_cos_vec], m3
423
424    movaps      m7, [inq+64]
425    LOAD_INV    m4, [inq+48]
426    BUTTERFLY   m7, m4, [ps_cos_vec+32], m3
427
428    ; pass 2
429    movaps      m2, [ps_cos_vec+64]
430    BUTTERFLY   m1, m4, m2, m3
431    SPILL 1, 11
432    SPILL 4, 8
433
434    ; pass 1
435    movaps      m1, [inq+16]
436    LOAD_INV    m6, [inq+96]
437    BUTTERFLY   m1, m6, [ps_cos_vec+16], m3
438
439    movaps      m4, [inq+80]
440    LOAD_INV    m5, [inq+32]
441    BUTTERFLY   m4, m5, [ps_cos_vec+48], m3
442
443    ; pass 2
444    BUTTERFLY   m0, m7, m2, m3
445
446    movaps      m2, [ps_cos_vec+80]
447    BUTTERFLY   m6, m5, m2, m3
448
449    BUTTERFLY   m1, m4, m2, m3
450
451    ; pass 3
    ; one operand of each butterfly is reversed (shuffle 0x1b) beforehand
452    movaps      m2, [ps_cos_vec+96]
453    shufps      m1, m1, 0x1b
454    BUTTERFLY   m0, m1, m2, m3
455    SPILL 0, 15
456    SPILL 1, 14
457
458    UNSPILL 0, 8
459    shufps      m5, m5, 0x1b
460    BUTTERFLY   m0, m5, m2, m3
461
462    UNSPILL 1, 11
463    shufps      m6, m6, 0x1b
464    BUTTERFLY   m1, m6, m2, m3
465    SPILL 1, 11
466
467    shufps      m4, m4, 0x1b
468    BUTTERFLY   m7, m4, m2, m3
469
470    ; pass 4
    ; m3 = sign mask negating elements 2 and 3, m2 = multipliers, m1 = scratch;
    ; m3 is left intact because PASS5 (x86-32) derives its own mask from it
471    movaps      m3, [ps_p1p1m1m1+0]
472    movaps      m2, [ps_cos_vec+128]
473
474    BUTTERFLY2  m5, m3, m2, m1
475
476    BUTTERFLY2  m0, m3, m2, m1
477    SPILL 0, 9
478
479    BUTTERFLY2  m6, m3, m2, m1
480    SPILL 6, 10
481
482    UNSPILL 0, 11
483    BUTTERFLY2  m0, m3, m2, m1
484    SPILL 0, 11
485
486    BUTTERFLY2  m4, m3, m2, m1
487
488    BUTTERFLY2  m7, m3, m2, m1
489
490    UNSPILL 6, 14
491    BUTTERFLY2  m6, m3, m2, m1
492
493    UNSPILL 0, 15
494    BUTTERFLY2  m0, m3, m2, m1
495
    ; passes 5 and 6 differ between x86-32 and x86-64, see the PASS5 / PASS6
    ; definitions
496    PASS5
497    PASS6
498    RET
499%endmacro
500
; LOAD_INV_SSE dst, src -- load four floats from src into dst in reversed
; order (shuffle 0x1b), using SSE1 instructions only.
;   dst (arg 1): destination register
;   src (arg 2): source operand (an aligned memory operand at the call sites)
501%macro LOAD_INV_SSE 2
502    movaps      %1, %2
503    shufps      %1, %1, 0x1b
504%endmacro
505
; Instantiate dct32_float_sse. BUTTERFLY and BUTTERFLY0 are still bound to
; their _SSE variants at this point.
506%define LOAD_INV LOAD_INV_SSE
507DCT32_FUNC sse
508
; LOAD_INV_SSE2 dst, src -- same result as LOAD_INV_SSE, but done by a single
; pshufd that loads and reverses in one instruction.
;   dst (arg 1): destination register
;   src (arg 2): source operand (an aligned memory operand at the call sites)
509%macro LOAD_INV_SSE2 2
510    pshufd      %1, %2, 0x1b
511%endmacro
512
; Instantiate dct32_float_sse2: same function body, with the reversed load and
; the in-register butterfly switched to their SSE2 forms. BUTTERFLY stays
; bound to BUTTERFLY_SSE.
513%define LOAD_INV LOAD_INV_SSE2
514%define BUTTERFLY0 BUTTERFLY0_SSE2
515DCT32_FUNC sse2
516