1;******************************************************************************
2;* 36 point SSE-optimized IMDCT transform
3;* Copyright (c) 2011 Vitor Sessak
4;*
5;* This file is part of Libav.
6;*
7;* Libav is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* Libav is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with Libav; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "libavutil/x86/x86inc.asm"
23%include "libavutil/x86/x86util.asm"
24
SECTION_RODATA

align 16

; --- lane masks for andps: ~0 keeps a lane, 0 clears it ---------------------
ps_mask:     dd 0, ~0, ~0, ~0
ps_mask2:    dd 0, ~0,  0, ~0
ps_mask3:    dd 0,  0,  0, ~0
ps_mask4:    dd 0, ~0,  0,  0

; --- per-lane multipliers used by imdct36_float while populating tmp[] ------
ps_val1:     dd          -0.5,          -0.5, -0.8660254038, -0.8660254038
ps_val2:     dd           1.0,           1.0,  0.8660254038,  0.8660254038
ps_val3:     dd  0.1736481777,  0.1736481777,  0.3420201433,  0.3420201433
ps_val4:     dd -0.7660444431, -0.7660444431,  0.8660254038,  0.8660254038
ps_val5:     dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
ps_val6:     dd           0.5,           0.5, -0.6427876097, -0.6427876097
ps_val7:     dd           1.0,           1.0, -0.6427876097, -0.6427876097

; --- sign-bit masks for xorps: 0x80000000 flips the sign of that lane -------
ps_p1p1m1m1: dd 0,          0, 0x80000000, 0x80000000
ps_p1m1p1m1: dd 0, 0x80000000,          0, 0x80000000

; --- butterfly scale factors, one 16-byte row per BUTTERF offset ------------
; Used by the non-SSE3 path of BUTTERF; row 4 (offset 64) is also read
; directly by imdct36_float in every variant.
ps_cosh:       dd 1.0, 0.50190991877167369479,  1.0,  5.73685662283492756461
               dd 1.0, 0.51763809020504152469,  1.0,  1.93185165257813657349
               dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
               dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
               dd 1.0, 0.70710678118654752439,  0.0,  0.0

; Same table with the odd lanes of rows 0-3 negated, used by the SSE3 path of
; BUTTERF, where addsubps subtracts in the even lanes and adds in the odd ones.
ps_cosh_sse3:  dd 1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
               dd 1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
               dd 1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
               dd 1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
               dd 1.0,  0.70710678118654752439,  0.0,  0.0

; --- constants broadcast to all four lanes ----------------------------------
; four_imdct36_float addresses entry n as [costabs + 16*n].
costabs:  times 4 dd  0.98480773    ;  0
          times 4 dd  0.93969262    ;  1
          times 4 dd  0.86602539    ;  2
          times 4 dd -0.76604444    ;  3
          times 4 dd -0.64278764    ;  4
          times 4 dd  0.50000000    ;  5
          times 4 dd -0.50000000    ;  6
          times 4 dd -0.34202015    ;  7
          times 4 dd -0.17364818    ;  8
          times 4 dd  0.50190992    ;  9
          times 4 dd  0.51763808    ; 10
          times 4 dd  0.55168896    ; 11
          times 4 dd  0.61038726    ; 12
          times 4 dd  0.70710677    ; 13
          times 4 dd  0.87172341    ; 14
          times 4 dd  1.18310082    ; 15
          times 4 dd  1.93185163    ; 16
          times 4 dd  5.73685646    ; 17

; imdct36_float writes consecutive output floats 4*SBLIMIT bytes apart.
%define SBLIMIT 32

SECTION_TEXT
77
; PSHUFD dst, src, imm8
; dst = the four dwords of src permuted by imm8.
; Plain pshufd is only emitted for SSE2-and-later builds that are not AVX;
; every other build gets shufps with src supplied as both inputs.
%macro PSHUFD 3
%if notcpuflag(sse2) || cpuflag(avx)
    shufps %1, %2, %2, %3
%else
    pshufd %1, %2, %3
%endif
%endmacro
85
; BUILDINVHIGHLOW dst, x, y
;   in : x = {x1,x2,x3,x4}, y = {y1,y2,y3,y4}
;   out: dst = {x3,x4,y1,y2}
; AVX builds do it with one shufps; the other builds assemble the result from
; two half-register moves.
%macro BUILDINVHIGHLOW 3
%if notcpuflag(avx)
    movlhps %1, %3              ; dst high half = y low half
    movhlps %1, %2              ; dst low half  = x high half
%else
    shufps %1, %2, %3, 0x4e
%endif
%endmacro
96
; ROTLEFT dst, x, y
;   in : x = {x1,x2,x3,x4}, y = {y1,y2,y3,y4}
;   out: dst = {x4,y1,y2,y3}
; SSSE3 and later take the 16 bytes starting 12 bytes into the x:y pair with
; palignr; earlier CPUs build {x3,x4,y1,y2} first and shuffle it with y.
%macro ROTLEFT 3
%if notcpuflag(ssse3)
    BUILDINVHIGHLOW %1, %2, %3
    shufps  %1, %1, %3, 0x99
%else
    palignr  %1, %3, %2, 12
%endif
%endmacro
107
; INVERTHL dst, src
; dst = src with its two 64-bit halves exchanged: {s3,s4,s1,s2}.
%macro INVERTHL 2
%if notcpuflag(sse2)
    movhlps %1, %2              ; dst low half  = src high half
    movlhps %1, %2              ; dst high half = src low half
%else
    PSHUFD  %1, %2, 0x4e
%endif
%endmacro
116
; BUTTERF v, scratch, tab_offset
; Two butterfly stages on the four lanes of v, scaled in between by the
; 16-byte table row at byte offset tab_offset. scratch is clobbered.
;   stage 1: v = {v0+v2, v1+v3, v0-v2, v1-v3}
;   scale  : w = v * row
;   stage 2: v = {w0+w1, w0-w1, w2+w3, w2-w3}
; The SSE3 path reads ps_cosh_sse3, whose odd lanes carry the opposite sign,
; so that addsubps yields the same stage-2 result as the xorps/addps pair.
%macro BUTTERF 3
    INVERTHL %2, %1                     ; scratch = {v2,v3,v0,v1}
    xorps    %1, [ps_p1p1m1m1]          ; v = {v0,v1,-v2,-v3}
    addps    %1, %2
%if notcpuflag(sse3)
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xb1               ; scratch = {w1,w0,w3,w2}
    xorps    %1, [ps_p1m1p1m1]          ; negate the odd lanes
    addps    %1, %2
%else
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xb1               ; scratch = adjacent lanes swapped
    addsubps %1, %1, %2
%endif
%endmacro
132
; STORE v, scratch, addr, stride
; Scatter the four floats of v to addr, addr+stride, addr+2*stride and
; addr+3*stride (lane k goes to addr + k*stride).
; Clobbers scratch; v is left with its adjacent lanes swapped.
%macro STORE 4
    movss   [%3], %1                    ; lane 0
    movhlps %2, %1
    movss   [%3 + 2*%4], %2             ; lane 2
    shufps  %1, %1, 0xb1                ; bring lanes 1 and 3 to even slots
    movss   [%3 + %4], %1               ; lane 1
    movhlps %2, %1
    movss   [%3 + 3*%4], %2             ; lane 3
%endmacro
142
; LOAD dst, scratch, addr, stride
; Gather one float from each of addr, addr+stride, addr+2*stride and
; addr+3*stride into the four lanes of dst (lane k comes from addr + k*stride).
; Each access fetches 8 bytes; only the first float of each is kept.
; Clobbers scratch.
%macro LOAD 4
    movlps  %2, [%3 + 2*%4]
    movhps  %2, [%3 + 3*%4]
    movlps  %1, [%3]
    movhps  %1, [%3 + %4]
    shufps  %1, %2, 0x88                ; keep lanes 0 and 2 of each register
%endmacro
150
; LOADA64 dst, addr
; Load 16 bytes from addr without requiring 16-byte alignment: two 8-byte
; half loads before AVX, a single unaligned move with AVX.
; NOTE(review): the name presumably means "64-bit aligned" - confirm.
%macro LOADA64 2
%if notcpuflag(avx)
    movlps  %1, [%2]
    movhps  %1, [%2 + 8]
%else
    movu    %1, [%2]
%endif
%endmacro
159
; DEFINE_IMDCT
; Emits imdct36_float(out, buf, in, win) for the currently selected CPU flags.
;   in  : 18 floats (four 16-byte loads plus one 8-byte load at offset 64)
;   win : floats 0..19 scale the values that are summed with buf and written
;         to out; floats 20..39 scale the values written back to buf
;   buf : 18 floats spaced 16 bytes apart; read first, then overwritten
;   out : 18 floats spaced 4*SBLIMIT bytes apart
; Uses m0-m7, plus m8 as a temporary on x86-64.
%macro DEFINE_IMDCT 0
cglobal imdct36_float, 4,4,9, out, buf, in, win

    ; for(i=17;i>=1;i--) in[i] += in[i-1];
    LOADA64 m0, inq
    LOADA64 m1, inq + 16

    ROTLEFT m5, m0, m1

    PSHUFD  m6, m0, 0x93
    andps   m6, [ps_mask]
    addps   m0, m6

    LOADA64 m2, inq + 32

    ROTLEFT m7, m1, m2

    addps   m1, m5
    LOADA64 m3, inq + 48

    ROTLEFT m5, m2, m3

    xorps   m4, m4
    movlps  m4, [inq+64]            ; m4 = {in[16], in[17], 0, 0}
    BUILDINVHIGHLOW m6, m3, m4
    shufps  m6, m4, 0xa9

    addps   m4, m6
    addps   m2, m7
    addps   m3, m5

    ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
    movlhps m5, m0
    andps   m5, [ps_mask3]

    BUILDINVHIGHLOW m7, m0, m1
    andps   m7, [ps_mask2]

    addps   m0, m5

    BUILDINVHIGHLOW m6, m1, m2
    andps   m6, [ps_mask2]

    addps   m1, m7

    BUILDINVHIGHLOW m7, m2, m3
    andps   m7, [ps_mask2]

    addps   m2, m6

    movhlps m6, m3
    andps   m6, [ps_mask4]

    addps   m3, m7
    addps   m4, m6

    ; Populate tmp[]
    movlhps m6, m1, m5              ; low half of m1, high half zeroed
    subps   m6, m4

    subps   m5, m0, m3

    ; x86-64 parks m0-m3 in m8; x86-32 recomputes it further down
%ifdef ARCH_X86_64
    SWAP    m5, m8
%endif

    mulps   m7, m2, [ps_val1]

%ifndef ARCH_X86_64
    mulps   m5, [ps_val2]
%else
    mulps   m5, m8, [ps_val2]
%endif
    addps   m7, m5

    mulps   m5, m6, [ps_val1]
    subps   m7, m5

%ifndef ARCH_X86_64
    subps   m5, m0, m3
%else
    SWAP    m5, m8
%endif

    subps   m5, m6
    addps   m5, m2

    shufps  m6, m4, m3, 0xe4
    subps   m6, m2
    mulps   m6, [ps_val3]

    addps   m4, m1
    mulps   m4, [ps_val4]

    shufps  m1, m0, 0xe4
    addps   m1, m2
    mulps   m1, [ps_val5]

    mulps   m3, [ps_val6]
    mulps   m0, [ps_val7]
    addps   m0, m3

    xorps   m2, m1, [ps_p1p1m1m1]
    subps   m2, m4
    addps   m2, m0

    addps   m3, m4, m0
    subps   m3, m6
    xorps   m3, [ps_p1p1m1m1]

    shufps  m0, m4, 0xe4
    subps   m0, m1
    addps   m0, m6

    BUILDINVHIGHLOW m4, m2, m3
    shufps  m3, m2, 0x4e

    ; we have tmp = {SWAPLH(m0), SWAPLH(m7), m3, m4, m5}

    BUTTERF m0, m1, 0
    BUTTERF m7, m2, 16
    BUTTERF m3, m6, 32
    BUTTERF m4, m1, 48

    ; final butterfly on the two live lanes of m5 (row 4 of ps_cosh)
    mulps   m5, [ps_cosh + 64]
    PSHUFD  m1, m5, 0xe1
    xorps   m5, [ps_p1m1p1m1]
    addps   m5, m1

    ; permutes:
    ; m0    0  1  2  3     =>     2  6 10 14   m1
    ; m7    4  5  6  7     =>     3  7 11 15   m2
    ; m3    8  9 10 11     =>    17 13  9  5   m3
    ; m4   12 13 14 15     =>    16 12  8  4   m5
    ; m5   16 17 xx xx     =>     0  1 xx xx   m0

    unpckhps m1, m0, m7
    unpckhps m6, m3, m4
    movhlps  m2, m6, m1
    movlhps  m1, m6

    unpcklps m5, m4
    unpcklps m3, m7
    movhlps  m4, m3, m5
    movlhps  m5, m3
    SWAP m4, m3
    ; permutation done

    ; out[16], out[17] = windowed value + buf[16], buf[17]
    PSHUFD   m6, m2, 0xb1
    movss    m4, [bufq + 4*68]
    movss    m7, [bufq + 4*64]
    unpcklps m7, m4
    mulps    m6, [winq + 16*4]
    addps    m6, m7
    movss    [outq + 64*SBLIMIT], m6
    shufps   m6, m6, 0xb1
    movss    [outq + 68*SBLIMIT], m6

    ; out[4..7]
    mulps   m6, m3, [winq + 4*4]
    LOAD    m4, m7, bufq + 4*16, 16
    addps   m6, m4
    STORE   m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT

    ; out[8..11]
    shufps  m4, m0, m3, 0xb5
    mulps   m4, [winq + 8*4]
    LOAD    m7, m6, bufq + 4*32, 16
    addps   m4, m7
    STORE   m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT

    ; out[12..15]
    shufps  m3, m2, 0xb1
    mulps   m3, [winq + 12*4]
    LOAD    m7, m6, bufq + 4*48, 16
    addps   m3, m7
    STORE   m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT

    ; out[0..3]
    mulps   m2, [winq]
    LOAD    m6, m7, bufq, 16
    addps   m2, m6
    STORE   m2, m7, outq, 4*SBLIMIT

    ; every old buf value has been consumed; write the new ones
    mulps   m4, m1, [winq + 20*4]
    STORE   m4, m7, bufq, 16

    mulps   m3, m5, [winq + 24*4]
    STORE   m3, m7, bufq + 4*16, 16

    shufps  m0, m5, 0xb0
    mulps   m0, [winq + 28*4]
    STORE   m0, m7, bufq + 4*32, 16

    shufps  m5, m1, 0xb1
    mulps   m5, [winq + 32*4]
    STORE   m5, m7, bufq + 4*48, 16

    shufps  m1, m1, 0xb1
    mulps   m1, [winq + 36*4]
    movss   [bufq + 4*64], m1
    shufps  m1, m1, 0xb1
    movss   [bufq + 4*68], m1
    RET
%endmacro
361
; Emit one imdct36_float per supported instruction set.
%macro IMDCT36_VARIANT 1 ; cpu flag name
INIT_XMM %1
DEFINE_IMDCT
%endmacro

IMDCT36_VARIANT sse
IMDCT36_VARIANT sse2
IMDCT36_VARIANT sse3
IMDCT36_VARIANT ssse3
IMDCT36_VARIANT avx
376
INIT_XMM sse

; Register spilling helpers for four_imdct36_float.
;   SPILL   r, slot : park register m<r> in spill slot <slot> (8..15)
;   UNSPILL r, slot : bring slot <slot> back into m<r>
;   SPILLED(slot)   : operand that names the parked value
%ifndef ARCH_X86_64
; No x86-64: slots are 16-byte memory cells in the tmp argument, starting at
; byte offset 32*4.
%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
%macro SPILL 2 ; xmm#, mempos
    movaps SPILLED(%2), m%1
%endmacro
%macro UNSPILL 2
    movaps m%1, SPILLED(%2)
%endmacro
%else
; x86-64: slots are simply the registers m8..m15, reached by renaming.
%define SPILL SWAP
%define UNSPILL SWAP
%define SPILLED(x) m %+ x
%endif
392
; DEFINE_FOUR_IMDCT
; Emits four_imdct36_float(out, buf, in, win, tmp) for the currently selected
; CPU flags. It handles four input blocks at once, one block per SIMD lane.
;   in  : four blocks of 18 floats, 72 bytes apart. Blocks 0 and 2 are read
;         with aligned loads (mova), blocks 1 and 3 with unaligned ones (movu).
;   win : vectors at byte offsets 4*(4k) scale the values that are summed with
;         buf and written to out; vectors at 4*(80+4k) scale the values
;         written back to buf (k = 0..17)
;   buf : 18 vectors at byte offsets 4*(4k); read first, then overwritten
;   out : 18 vectors at byte offsets 128*k
;   tmp : scratch, accessed with aligned moves. Byte offsets 0..127 hold eight
;         intermediate vectors; on x86-32 offsets 128..255 additionally hold
;         the SPILL slots 8..15.
;
; 16 XMM registers are declared on purpose: on x86-64 SPILL/UNSPILL/SPILLED
; are aliases for SWAP with m8..m15, so xmm8-xmm15 really are written. Win64
; treats xmm6-xmm15 as callee-saved and x86inc only preserves the registers
; announced here, so declaring 8 would corrupt the caller's xmm8-xmm15.
%macro DEFINE_FOUR_IMDCT 0
cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
    ; in[16..17] of the four blocks: m0 = in[16], m5 = in[17]
    movlps  m0, [inq+64]
    movhps  m0, [inq+64 +   72]
    movlps  m3, [inq+64 + 2*72]
    movhps  m3, [inq+64 + 3*72]

    shufps  m5, m0, m3, 0xdd
    shufps  m0, m0, m3, 0x88

    ; in[12..15] of the four blocks
    ; (TRANSPOSE4x4PS comes from x86util; presumably it leaves one coefficient
    ; index per register, one block per lane - confirm against x86util.asm)
    mova     m1, [inq+48]
    movu     m6, [inq+48 +   72]
    mova     m7, [inq+48 + 2*72]
    movu     m3, [inq+48 + 3*72]

    TRANSPOSE4x4PS 1, 6, 7, 3, 4

    addps   m4, m6, m7
    mova    [tmpq+4*28], m4

    addps    m7, m3
    addps    m6, m1
    addps    m3, m0
    addps    m0, m5
    addps    m0, m7
    addps    m7, m6
    mova    [tmpq+4*12], m7
    SPILL   3, 12

    ; in[8..11] of the four blocks
    mova     m4, [inq+32]
    movu     m5, [inq+32 +   72]
    mova     m2, [inq+32 + 2*72]
    movu     m7, [inq+32 + 3*72]

    TRANSPOSE4x4PS 4, 5, 2, 7, 3

    addps   m1, m7
    SPILL   1, 11

    addps   m3, m5, m2
    SPILL   3, 13

    addps    m7, m2
    addps    m5, m4
    addps    m6, m7
    mova    [tmpq], m6
    addps   m7, m5
    mova    [tmpq+4*16], m7

    ; in[4..7] of the four blocks
    mova    m2, [inq+16]
    movu    m7, [inq+16 +   72]
    mova    m1, [inq+16 + 2*72]
    movu    m6, [inq+16 + 3*72]

    TRANSPOSE4x4PS 2, 7, 1, 6, 3

    addps   m4, m6
    addps   m6, m1
    addps   m1, m7
    addps   m7, m2
    addps   m5, m6
    SPILL   5, 15
    addps   m6, m7
    mulps   m6, [costabs + 16*2]
    mova    [tmpq+4*8], m6
    SPILL   1, 10
    SPILL   0, 14

    ; in[0..3] of the four blocks
    mova    m1, [inq]
    movu    m6, [inq +   72]
    mova    m3, [inq + 2*72]
    movu    m5, [inq + 3*72]

    TRANSPOSE4x4PS 1, 6, 3, 5, 0

    addps    m2, m5
    addps    m5, m3
    addps    m7, m5
    addps    m3, m6
    addps    m6, m1
    SPILL    7, 8
    addps    m5, m6
    SPILL    6, 9

    ; combine the partial sums with the costabs factors; intermediate results
    ; are parked in tmp[] and in the spill slots
    addps    m6, m4, SPILLED(12)
    subps    m6, m2
    UNSPILL  7, 11
    SPILL    5, 11
    subps    m5, m1, m7
    mulps    m7, [costabs + 16*5]
    addps    m7, m1
    mulps    m0, m6, [costabs + 16*6]
    addps    m0, m5
    mova     [tmpq+4*24], m0
    addps    m6, m5
    mova     [tmpq+4*4], m6
    addps    m6, m4, m2
    mulps    m6, [costabs + 16*1]
    subps    m4, SPILLED(12)
    mulps    m4, [costabs + 16*8]
    addps    m2, SPILLED(12)
    mulps    m2, [costabs + 16*3]
    subps    m5, m7, m6
    subps    m5, m2
    addps    m6, m7
    addps    m6, m4
    addps    m7, m2
    subps    m7, m4
    mova     [tmpq+4*20], m7
    mova     m2, [tmpq+4*28]
    mova     [tmpq+4*28], m5
    UNSPILL  7, 13
    subps    m5, m7, m2
    mulps    m5, [costabs + 16*7]
    UNSPILL  1, 10
    mulps    m1, [costabs + 16*2]
    addps    m4, m3, m2
    mulps    m4, [costabs + 16*4]
    addps    m2, m7
    addps    m7, m3
    mulps    m7, [costabs]
    subps    m3, m2
    mulps    m3, [costabs + 16*2]
    addps    m2, m7, m5
    addps    m2, m1
    SPILL    2, 10
    addps    m7, m4
    subps    m7, m1
    SPILL    7, 12
    subps    m5, m4
    subps    m5, m1
    UNSPILL  0, 14
    SPILL    5, 13
    addps    m1, m0, SPILLED(15)
    subps    m1, SPILLED(8)
    mova     m4, [costabs + 16*5]
    mulps    m4, [tmpq]
    UNSPILL  2, 9
    addps    m4, m2
    subps    m2, [tmpq]
    mulps    m5, m1, [costabs + 16*6]
    addps    m5, m2
    SPILL    5, 9
    addps    m2, m1
    SPILL    2, 14
    UNSPILL  5, 15
    subps    m7, m5, m0
    addps    m5, SPILLED(8)
    mulps    m5, [costabs + 16*1]
    mulps    m7, [costabs + 16*8]
    addps    m0, SPILLED(8)
    mulps    m0, [costabs + 16*3]
    subps    m2, m4, m5
    subps    m2, m0
    SPILL    2, 15
    addps    m5, m4
    addps    m5, m7
    addps    m4, m0
    subps    m4, m7
    SPILL    4, 8
    mova     m7, [tmpq+4*16]
    mova     m2, [tmpq+4*12]
    addps    m0, m7, m2
    subps    m0, SPILLED(11)
    mulps    m0, [costabs + 16*2]
    addps    m4, m7, SPILLED(11)
    mulps    m4, [costabs]
    subps    m7, m2
    mulps    m7, [costabs + 16*7]
    addps    m2, SPILLED(11)
    mulps    m2, [costabs + 16*4]
    addps    m1, m7, [tmpq+4*8]
    addps    m1, m4
    addps    m4, m2
    subps    m4, [tmpq+4*8]
    SPILL    4, 11
    subps    m7, m2
    subps    m7, [tmpq+4*8]

    ; Output stage. Each group below takes a pair of intermediate values,
    ; scales their sum and difference by two costabs entries, and produces
    ; four results:
    ;   out vector k = value * win[4k]      + buf[4k]   (two vectors)
    ;   buf vector k = value * win[80 + 4k]             (two vectors)

    ; costabs 9 / 17  ->  out/buf vectors 9, 8, 17, 0
    addps    m4, m6, SPILLED(10)
    subps    m6, SPILLED(10)
    addps    m2, m5, m1
    mulps    m2, [costabs + 16*9]
    subps    m5, m1
    mulps    m5, [costabs + 16*17]
    subps    m1, m4, m2
    addps    m4, m2
    mulps    m2, m1, [winq+4*36]
    addps    m2, [bufq+4*36]
    mova     [outq+1152], m2
    mulps    m1, [winq+4*32]
    addps    m1, [bufq+4*32]
    mova     [outq+1024], m1
    mulps    m1, m4, [winq+4*116]
    mova     [bufq+4*36], m1
    mulps    m4, [winq+4*112]
    mova     [bufq+4*32], m4
    addps    m2, m6, m5
    subps    m6, m5
    mulps    m1, m6, [winq+4*68]
    addps    m1, [bufq+4*68]
    mova     [outq+2176], m1
    mulps    m6, [winq]
    addps    m6, [bufq]
    mova     [outq], m6
    mulps    m1, m2, [winq+4*148]
    mova     [bufq+4*68], m1
    mulps    m2, [winq+4*80]
    mova     [bufq], m2

    ; costabs 10 / 16  ->  out/buf vectors 10, 7, 16, 1
    addps    m5, m3, [tmpq+4*24]
    mova     m2, [tmpq+4*24]
    subps    m2, m3
    mova     m1, SPILLED(9)
    subps    m1, m0
    mulps    m1, [costabs + 16*10]
    addps    m0, SPILLED(9)
    mulps    m0, [costabs + 16*16]
    addps    m6, m5, m1
    subps    m5, m1
    mulps    m3, m5, [winq+4*40]
    addps    m3, [bufq+4*40]
    mova     [outq+1280], m3
    mulps    m5, [winq+4*28]
    addps    m5, [bufq+4*28]
    mova     [outq+896], m5
    mulps    m1, m6, [winq+4*120]
    mova     [bufq+4*40], m1
    mulps    m6, [winq+4*108]
    mova     [bufq+4*28], m6
    addps    m1, m2, m0
    subps    m2, m0
    mulps    m5, m2, [winq+4*64]
    addps    m5, [bufq+4*64]
    mova     [outq+2048], m5
    mulps    m2, [winq+4*4]
    addps    m2, [bufq+4*4]
    mova     [outq+128], m2
    mulps    m0, m1, [winq+4*144]
    mova     [bufq+4*64], m0
    mulps    m1, [winq+4*84]
    mova     [bufq+4*4], m1

    ; costabs 11 / 15  ->  out/buf vectors 11, 6, 15, 2
    mova     m1, [tmpq+4*28]
    mova     m5, m1
    addps    m1, SPILLED(13)
    subps    m5, SPILLED(13)
    UNSPILL  3, 15
    addps    m2, m7, m3
    mulps    m2, [costabs + 16*11]
    subps    m3, m7
    mulps    m3, [costabs + 16*15]
    addps    m0, m2, m1
    subps    m1, m2
    SWAP     m0, m2
    mulps    m6, m1, [winq+4*44]
    addps    m6, [bufq+4*44]
    mova     [outq+1408], m6
    mulps    m1, [winq+4*24]
    addps    m1, [bufq+4*24]
    mova     [outq+768], m1
    mulps    m0, m2, [winq+4*124]
    mova     [bufq+4*44], m0
    mulps    m2, [winq+4*104]
    mova     [bufq+4*24], m2
    addps    m0, m5, m3
    subps    m5, m3
    mulps    m1, m5, [winq+4*60]
    addps    m1, [bufq+4*60]
    mova     [outq+1920], m1
    mulps    m5, [winq+4*8]
    addps    m5, [bufq+4*8]
    mova     [outq+256], m5
    mulps    m1, m0, [winq+4*140]
    mova     [bufq+4*60], m1
    mulps    m0, [winq+4*88]
    mova     [bufq+4*8], m0

    ; costabs 12 / 14  ->  out/buf vectors 12, 5, 14, 3
    mova     m1, [tmpq+4*20]
    addps    m1, SPILLED(12)
    mova     m2, [tmpq+4*20]
    subps    m2, SPILLED(12)
    UNSPILL  7, 8
    subps    m0, m7, SPILLED(11)
    addps    m7, SPILLED(11)
    mulps    m4, m7, [costabs + 16*12]
    mulps    m0, [costabs + 16*14]
    addps    m5, m1, m4
    subps    m1, m4
    mulps    m7, m1, [winq+4*48]
    addps    m7, [bufq+4*48]
    mova     [outq+1536], m7
    mulps    m1, [winq+4*20]
    addps    m1, [bufq+4*20]
    mova     [outq+640], m1
    mulps    m1, m5, [winq+4*128]
    mova     [bufq+4*48], m1
    mulps    m5, [winq+4*100]
    mova     [bufq+4*20], m5
    addps    m6, m2, m0
    subps    m2, m0
    mulps    m1, m2, [winq+4*56]
    addps    m1, [bufq+4*56]
    mova     [outq+1792], m1
    mulps    m2, [winq+4*12]
    addps    m2, [bufq+4*12]
    mova     [outq+384], m2
    mulps    m0, m6, [winq+4*136]
    mova     [bufq+4*56], m0
    mulps    m6, [winq+4*92]
    mova     [bufq+4*12], m6

    ; costabs 13  ->  out/buf vectors 13, 4
    UNSPILL  0, 14
    mulps    m0, [costabs + 16*13]
    mova     m3, [tmpq+4*4]
    addps    m2, m0, m3
    subps    m3, m0
    mulps    m0, m3, [winq+4*52]
    addps    m0, [bufq+4*52]
    mova     [outq+1664], m0
    mulps    m3, [winq+4*16]
    addps    m3, [bufq+4*16]
    mova     [outq+512], m3
    mulps    m0, m2, [winq+4*132]
    mova     [bufq+4*52], m0
    mulps    m2, [winq+4*96]
    mova     [bufq+4*16], m2
    RET
%endmacro
716
; Emit one four_imdct36_float per supported instruction set.
%macro FOUR_IMDCT36_VARIANT 1 ; cpu flag name
INIT_XMM %1
DEFINE_FOUR_IMDCT
%endmacro

FOUR_IMDCT36_VARIANT sse
FOUR_IMDCT36_VARIANT avx
722