;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)
pd_32:        times 4 dd 32

SECTION .text

;-----------------------------------------------------------------------------
; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
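; STORE_DIFFx2 coefs0, coefs1, tmp, zero, dst, stride
; Scale two rows of four dword coefficients by >>6, add them to the pixels
; at [dst] and [dst+stride], clip to [0, pw_pixel_max] and store back.
; %4 must contain zero (lower clip bound).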
%macro STORE_DIFFx2 6
    psrad       %1, 6
    psrad       %2, 6
    packssdw    %1, %2
    movq        %3, [%5]
    movhps      %3, [%5+%6]
    paddsw      %1, %3
    CLIPW       %1, %4, [pw_pixel_max]
    movq      [%5], %1
    movhps [%5+%6], %1
%endmacro

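; STORE_DIFF16 coefs0, coefs1, zero, max, dst
; Same idea for a single 8-pixel row at [dst], with both clip bounds
; passed in registers.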
%macro STORE_DIFF16 5
    psrad       %1, 6
    psrad       %2, 6
    packssdw    %1, %2
    paddsw      %1, [%5]
    CLIPW       %1, %3, %4
    mova      [%5], %1
%endmacro

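; IDCT4_ADD_10: full 4x4 inverse transform (1-D pass, transpose, +32
; rounding, second 1-D pass); clears the coefficient block and adds the
; result to dst.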
;dst, in, stride
%macro IDCT4_ADD_10 3
    mova  m0, [%2+ 0]
    mova  m1, [%2+16]
    mova  m2, [%2+32]
    mova  m3, [%2+48]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor  m5, m5
    mova [%2+ 0], m5
    mova [%2+16], m5
    mova [%2+32], m5
    mova [%2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, %1, %3
    lea   %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, %1, %3
%endmacro

%macro IDCT_ADD_10 0
cglobal h264_idct_add_10, 3,3
    IDCT4_ADD_10 r0, r1, r2
    RET
%endmacro

INIT_XMM sse2
IDCT_ADD_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
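; add4x4_idct: local helper shared by the add16/add16intra/add8 functions.
; Expects r0 = dst base, r2 = coefficient block, r3 = stride and
; r5 = offset of this 4x4 block within dst; clears the coefficients.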
%macro ADD4x4IDCT 0
add4x4_idct %+ SUFFIX:
    add   r5, r0
    mova  m0, [r2+ 0]
    mova  m1, [r2+16]
    mova  m2, [r2+32]
    mova  m3, [r2+48]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor  m5, m5
    mova  [r2+ 0], m5
    mova  [r2+16], m5
    mova  [r2+32], m5
    mova  [r2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, r5, r3
    lea   r5, [r5+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, r5, r3
    ret
%endmacro

INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
%endif

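; ADD16_OP block, scan8_index
; Handle one 4x4 block of idct_add16: skip it if its nnzc entry is zero,
; otherwise add its IDCT to dst at block_offset[block].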
%macro ADD16_OP 2
    cmp          byte [r4+%2], 0
    jz .skipblock%1
    mov         r5d, [r1+%1*4]
    call add4x4_idct %+ SUFFIX
.skipblock%1:
%if %1<15
    add          r2, 64
%endif
%endmacro

%macro IDCT_ADD16_10 0
cglobal h264_idct_add16_10, 5,6
    ADD16_OP 0, 4+1*8
    ADD16_OP 1, 5+1*8
    ADD16_OP 2, 4+2*8
    ADD16_OP 3, 5+2*8
    ADD16_OP 4, 6+1*8
    ADD16_OP 5, 7+1*8
    ADD16_OP 6, 6+2*8
    ADD16_OP 7, 7+2*8
    ADD16_OP 8, 4+3*8
    ADD16_OP 9, 5+3*8
    ADD16_OP 10, 4+4*8
    ADD16_OP 11, 5+4*8
    ADD16_OP 12, 6+3*8
    ADD16_OP 13, 7+3*8
    ADD16_OP 14, 6+4*8
    ADD16_OP 15, 7+4*8
    REP_RET
%endmacro

INIT_XMM sse2
IDCT_ADD16_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
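; IDCT_DC_ADD_OP_10 dst, stride, 3*stride
; Add the splatted DC value(s) held in m0 to four rows of pixels,
; clipping to [0, m6]; the caller loads m6 with pw_pixel_max.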
%macro IDCT_DC_ADD_OP_10 3
    pxor      m5, m5
%if avx_enabled
    paddw     m1, m0, [%1+0   ]
    paddw     m2, m0, [%1+%2  ]
    paddw     m3, m0, [%1+%2*2]
    paddw     m4, m0, [%1+%3  ]
%else
    mova      m1, [%1+0   ]
    mova      m2, [%1+%2  ]
    mova      m3, [%1+%2*2]
    mova      m4, [%1+%3  ]
    paddw     m1, m0
    paddw     m2, m0
    paddw     m3, m0
    paddw     m4, m0
%endif
    CLIPW     m1, m5, m6
    CLIPW     m2, m5, m6
    CLIPW     m3, m5, m6
    CLIPW     m4, m5, m6
    mova [%1+0   ], m1
    mova [%1+%2  ], m2
    mova [%1+%2*2], m3
    mova [%1+%3  ], m4
%endmacro

INIT_MMX mmxext
cglobal h264_idct_dc_add_10,3,3
    movd      m0, [r1]
    mov dword [r1], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    lea       r1, [r2*3]
    pshufw    m0, m0, 0
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET

;-----------------------------------------------------------------------------
; void ff_h264_idct8_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_DC_ADD 0
cglobal h264_idct8_dc_add_10,3,4,7
    movd      m0, [r1]
    mov dword[r1], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    lea       r1, [r2*3]
    SPLATW    m0, m0, 0
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    lea       r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
%endmacro

INIT_XMM sse2
IDCT8_DC_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
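; AC block
; Out-of-line path for a pair of 4x4 blocks that have AC coefficients:
; run the full IDCT on blocks %1 and %1+1, then jump back.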
%macro AC 1
.ac%1:
    mov  r5d, [r1+(%1+0)*4]
    call add4x4_idct %+ SUFFIX
    mov  r5d, [r1+(%1+1)*4]
    add  r2, 64
    call add4x4_idct %+ SUFFIX
    add  r2, 64
    jmp .skipadd%1
%endmacro

%assign last_block 16
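; ADD16_OP_INTRA block, scan8_index
; Handle two horizontally adjacent 4x4 blocks: full IDCT if their nnzc
; entries are nonzero, a DC-only add if either DC coefficient is nonzero,
; nothing otherwise.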
%macro ADD16_OP_INTRA 2
    cmp      word [r4+%2], 0
    jnz .ac%1
    mov      r5d, [r2+ 0]
    or       r5d, [r2+64]
    jz .skipblock%1
    mov      r5d, [r1+(%1+0)*4]
    call idct_dc_add %+ SUFFIX
.skipblock%1:
%if %1<last_block-2
    add       r2, 128
%endif
.skipadd%1:
%endmacro

%macro IDCT_ADD16INTRA_10 0
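; idct_dc_add: DC-only add for two adjacent 4x4 blocks. Expects r5 = dst
; offset of the first block and r2 = its coefficients; loads and clears
; both DC values, rounds them and adds one DC to each half of the
; 8-pixel-wide rows.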
idct_dc_add %+ SUFFIX:
    add       r5, r0
    movq      m0, [r2+ 0]
    movhps    m0, [r2+64]
    mov dword [r2+ 0], 0
    mov dword [r2+64], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    pshufhw   m0, m0, 0
    pshuflw   m0, m0, 0
    lea       r6, [r3*3]
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r5, r3, r6
    ret

cglobal h264_idct_add16intra_10,5,7,8
    ADD16_OP_INTRA 0, 4+1*8
    ADD16_OP_INTRA 2, 4+2*8
    ADD16_OP_INTRA 4, 6+1*8
    ADD16_OP_INTRA 6, 6+2*8
    ADD16_OP_INTRA 8, 4+3*8
    ADD16_OP_INTRA 10, 4+4*8
    ADD16_OP_INTRA 12, 6+3*8
    ADD16_OP_INTRA 14, 6+4*8
    REP_RET
    AC 8
    AC 10
    AC 12
    AC 14
    AC 0
    AC 2
    AC 4
    AC 6
%endmacro

INIT_XMM sse2
IDCT_ADD16INTRA_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16INTRA_10
%endif

%assign last_block 36
;-----------------------------------------------------------------------------
; void ff_h264_idct_add8_10(pixel **dst, const int *block_offset,
;                           int16_t *block, int stride,
;                           const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
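; Chroma version: dst is a pixel **dst array with one pointer per plane;
; blocks 16-19 are added to the first plane and blocks 32-35 to the
; second, with r2 advanced past the 16 luma blocks first.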
%macro IDCT_ADD8 0
cglobal h264_idct_add8_10,5,8,7
%if ARCH_X86_64
    mov      r7, r0
%endif
    add      r2, 1024
    mov      r0, [r0]
    ADD16_OP_INTRA 16, 4+ 6*8
    ADD16_OP_INTRA 18, 4+ 7*8
    add      r2, 1024-128*2
%if ARCH_X86_64
    mov      r0, [r7+gprsize]
%else
    mov      r0, r0m
    mov      r0, [r0+gprsize]
%endif
    ADD16_OP_INTRA 32, 4+11*8
    ADD16_OP_INTRA 34, 4+12*8
    REP_RET
    AC 16
    AC 18
    AC 32
    AC 34

%endmacro ; IDCT_ADD8

INIT_XMM sse2
IDCT_ADD8
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
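; IDCT8_1D row0, row4
; One 1-D pass of the 8-point IDCT on dword coefficients. Rows 1,2,3 and
; 5,6,7 are expected in m1-m3/m5-m7, rows 0 and 4 are read from the two
; memory operands; the transformed rows end up in m0-m7.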
%macro IDCT8_1D 2
    SWAP      0, 1
    psrad     m4, m5, 1
    psrad     m1, m0, 1
    paddd     m4, m5
    paddd     m1, m0
    paddd     m4, m7
    paddd     m1, m5
    psubd     m4, m0
    paddd     m1, m3

    psubd     m0, m3
    psubd     m5, m3
    paddd     m0, m7
    psubd     m5, m7
    psrad     m3, 1
    psrad     m7, 1
    psubd     m0, m3
    psubd     m5, m7

    SWAP      1, 7
    psrad     m1, m7, 2
    psrad     m3, m4, 2
    paddd     m3, m0
    psrad     m0, 2
    paddd     m1, m5
    psrad     m5, 2
    psubd     m0, m4
    psubd     m7, m5

    SWAP      5, 6
    psrad     m4, m2, 1
    psrad     m6, m5, 1
    psubd     m4, m5
    paddd     m6, m2

    mova      m2, %1
    mova      m5, %2
    SUMSUB_BA d, 5, 2
    SUMSUB_BA d, 6, 5
    SUMSUB_BA d, 4, 2
    SUMSUB_BA d, 7, 6
    SUMSUB_BA d, 0, 4
    SUMSUB_BA d, 3, 2
    SUMSUB_BA d, 1, 5
    SWAP      7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112*2]
    mova         m6, [%1+ 96*2]
    mova         m5, [%1+ 80*2]
    mova         m3, [%1+ 48*2]
    mova         m2, [%1+ 32*2]
    mova         m1, [%1+ 16*2]
    IDCT8_1D   [%1], [%1+ 64*2]
%endmacro

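; The 8x8 transform works on two 4-column halves of the block:
; IDCT8_ADD_SSE_START runs the row pass on one half and transposes it;
; on x86_32 the result is spilled to a scratch buffer and
; IDCT8_ADD_SSE_END later runs the column pass from there, while on
; x86_64 most rows stay in registers (see h264_idct8_add1_10 below).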
; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_SSE_START 2
    IDCT8_1D_FULL %1
%if ARCH_X86_64
    TRANSPOSE4x4D  0,1,2,3,8
    mova    [%2    ], m0
    TRANSPOSE4x4D  4,5,6,7,8
    mova    [%2+8*2], m4
%else
    mova         [%1], m7
    TRANSPOSE4x4D   0,1,2,3,7
    mova           m7, [%1]
    mova    [%2     ], m0
    mova    [%2+16*2], m1
    mova    [%2+32*2], m2
    mova    [%2+48*2], m3
    TRANSPOSE4x4D   4,5,6,7,3
    mova    [%2+ 8*2], m4
    mova    [%2+24*2], m5
    mova    [%2+40*2], m6
    mova    [%2+56*2], m7
%endif
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE_END 3
    IDCT8_1D_FULL %2
    mova  [%2     ], m6
    mova  [%2+16*2], m7

    pxor         m7, m7
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m6, m7, %1, %3
    mova         m0, [%2     ]
    mova         m1, [%2+16*2]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m5, m6, m7, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
%endmacro

%macro IDCT8_ADD 0
cglobal h264_idct8_add_10, 3,4,16
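; on UNIX64 execution falls through to h264_idct8_add1_10 below;
; other ABIs align the stack first and call it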
%if UNIX64 == 0
    %assign pad 16-gprsize-(stack_offset&15)
    sub  rsp, pad
    call h264_idct8_add1_10 %+ SUFFIX
    add  rsp, pad
    RET
%endif

ALIGN 16
; TODO: does not need to use stack
h264_idct8_add1_10 %+ SUFFIX:
%assign pad 256+16-gprsize
    sub          rsp, pad
    add   dword [r1], 32

%if ARCH_X86_64
    IDCT8_ADD_SSE_START r1, rsp
    SWAP 1,  9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 5, 13
    SWAP 6, 14
    SWAP 7, 15
    IDCT8_ADD_SSE_START r1+16, rsp+128
    PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
    IDCT8_1D [rsp], [rsp+128]
    SWAP 0,  8
    SWAP 1,  9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 4, 12
    SWAP 5, 13
    SWAP 6, 14
    SWAP 7, 15
    IDCT8_1D [rsp+16], [rsp+144]
    psrad         m8, 6
    psrad         m0, 6
    packssdw      m8, m0
    paddsw        m8, [r0]
    pxor          m0, m0
    mova    [r1+  0], m0
    mova    [r1+ 16], m0
    mova    [r1+ 32], m0
    mova    [r1+ 48], m0
    mova    [r1+ 64], m0
    mova    [r1+ 80], m0
    mova    [r1+ 96], m0
    mova    [r1+112], m0
    mova    [r1+128], m0
    mova    [r1+144], m0
    mova    [r1+160], m0
    mova    [r1+176], m0
    mova    [r1+192], m0
    mova    [r1+208], m0
    mova    [r1+224], m0
    mova    [r1+240], m0
    CLIPW         m8, m0, [pw_pixel_max]
    mova        [r0], m8
    mova          m8, [pw_pixel_max]
    STORE_DIFF16  m9, m1, m0, m8, r0+r2
    lea           r0, [r0+r2*2]
    STORE_DIFF16 m10, m2, m0, m8, r0
    STORE_DIFF16 m11, m3, m0, m8, r0+r2
    lea           r0, [r0+r2*2]
    STORE_DIFF16 m12, m4, m0, m8, r0
    STORE_DIFF16 m13, m5, m0, m8, r0+r2
    lea           r0, [r0+r2*2]
    STORE_DIFF16 m14, m6, m0, m8, r0
    STORE_DIFF16 m15, m7, m0, m8, r0+r2
%else
    IDCT8_ADD_SSE_START r1,    rsp
    IDCT8_ADD_SSE_START r1+16, rsp+128
    lea           r3, [r0+8]
    IDCT8_ADD_SSE_END r0, rsp,    r2
    IDCT8_ADD_SSE_END r3, rsp+16, r2
    mova    [r1+  0], m7
    mova    [r1+ 16], m7
    mova    [r1+ 32], m7
    mova    [r1+ 48], m7
    mova    [r1+ 64], m7
    mova    [r1+ 80], m7
    mova    [r1+ 96], m7
    mova    [r1+112], m7
    mova    [r1+128], m7
    mova    [r1+144], m7
    mova    [r1+160], m7
    mova    [r1+176], m7
    mova    [r1+192], m7
    mova    [r1+208], m7
    mova    [r1+224], m7
    mova    [r1+240], m7
%endif ; ARCH_X86_64

    add          rsp, pad
    ret
%endmacro

INIT_XMM sse2
IDCT8_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
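; IDCT8_ADD4_OP block, scan8_index
; Skip the 8x8 block if its nnzc entry is zero, otherwise point r0 at
; dst + block_offset[block] and call the shared 8x8 routine.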
%macro IDCT8_ADD4_OP 2
    cmp       byte [r4+%2], 0
    jz .skipblock%1
    mov      r0d, [r6+%1*4]
    add       r0, r5
    call h264_idct8_add1_10 %+ SUFFIX
.skipblock%1:
%if %1<12
    add       r1, 256
%endif
%endmacro

%macro IDCT8_ADD4 0
cglobal h264_idct8_add4_10, 0,7,16
    %assign pad 16-gprsize-(stack_offset&15)
    SUB      rsp, pad
    mov       r5, r0mp
    mov       r6, r1mp
    mov       r1, r2mp
    mov      r2d, r3m
    movifnidn r4, r4mp
    IDCT8_ADD4_OP  0, 4+1*8
    IDCT8_ADD4_OP  4, 6+1*8
    IDCT8_ADD4_OP  8, 4+3*8
    IDCT8_ADD4_OP 12, 6+3*8
    ADD       rsp, pad
    RET
%endmacro ; IDCT8_ADD4

INIT_XMM sse2
IDCT8_ADD4
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD4
%endif