;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

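; Constants for the 10-bit path: pixel values are clipped to
; [0, (1<<10)-1] = [0, 1023], and a bias of 32 = 1<<(6-1) is added ahead
; of the final >>6 so the downscale rounds to nearest.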
pw_pixel_max: times 8 dw ((1 << 10)-1)
pd_32:        times 4 dd 32
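; scan8 gives, for each 4x4 block index in decode order, its byte offset
; into the decoder's non_zero_count cache, which is laid out as rows of
; 8 entries (hence the x + y*8 form of each value).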
scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8

%ifdef PIC
%define scan8 r11
%else
%define scan8 scan8_mem
%endif

SECTION .text

;-----------------------------------------------------------------------------
; void h264_idct_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
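; STORE_DIFFx2 finishes two rows of a 4x4 block: shift eight 32-bit
; residuals right by 6, pack to words, add the two 4-pixel dst rows at
; %5 and %5+%6 (loaded movq/movhps into %3), clip against %4 (zero) and
; pw_pixel_max, and store the rows back.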
%macro STORE_DIFFx2 6
    psrad       %1, 6
    psrad       %2, 6
    packssdw    %1, %2
    movq        %3, [%5]
    movhps      %3, [%5+%6]
    paddsw      %1, %3
    CLIPW       %1, %4, [pw_pixel_max]
    movq      [%5], %1
    movhps [%5+%6], %1
%endmacro
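; STORE_DIFF16 does the same for one contiguous 8-pixel row: the dst row
; at %5 is mova-aligned, and the clip bounds come in as %3 (zero) and
; %4 (pixel max).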
%macro STORE_DIFF16 5
    psrad       %1, 6
    psrad       %2, 6
    packssdw    %1, %2
    paddsw      %1, [%5]
    CLIPW       %1, %3, %4
    mova      [%5], %1
%endmacro

; IDCT4_ADD_10: %1=dst, %2=coefficients, %3=stride
; Two 1D IDCT passes with a transpose in between; the +32 bias is added
; to the first row between the passes (it carries into every output), so
; the >>6 in STORE_DIFFx2 rounds to nearest.
%macro IDCT4_ADD_10 3
    mova  m0, [%2+ 0]
    mova  m1, [%2+16]
    mova  m2, [%2+32]
    mova  m3, [%2+48]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor  m5, m5
    STORE_DIFFx2 m0, m1, m4, m5, %1, %3
    lea   %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, %1, %3
%endmacro

%macro IDCT_ADD_10 1
cglobal h264_idct_add_10_%1, 3,3
    IDCT4_ADD_10 r0, r1, r2
    RET
%endmacro

INIT_XMM
IDCT_ADD_10 sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT_ADD_10 avx
%endif

;-----------------------------------------------------------------------------
; void h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
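; add4x4_idct_%1 is a local helper with a bespoke convention: r5 carries
; the block offset and is added to the dst base in r0, r2 points at the
; coefficients, and r3 is the stride. It is reached via call and returns
; with a bare ret, leaving the caller's loop registers untouched.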
%macro ADD4x4IDCT 1
add4x4_idct_%1:
    add   r5, r0
    mova  m0, [r2+ 0]
    mova  m1, [r2+16]
    mova  m2, [r2+32]
    mova  m3, [r2+48]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor  m5, m5
    STORE_DIFFx2 m0, m1, m4, m5, r5, r3
    lea   r5, [r5+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, r5, r3
    ret
%endmacro

INIT_XMM
ALIGN 16
ADD4x4IDCT sse2
%ifdef HAVE_AVX
INIT_AVX
ALIGN 16
ADD4x4IDCT avx
%endif

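; ADD16_OP: skip 4x4 block %2 entirely when its non-zero-count entry at
; scan8 position %3 is zero; otherwise load the block's dst offset from
; block_offset (r1) and run the shared 4x4 IDCT. r2 steps 64 bytes per
; block: 16 dctcoefs of 4 bytes each.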
%macro ADD16_OP 3
    cmp          byte [r4+%3], 0
    jz .skipblock%2
    mov         r5d, [r1+%2*4]
    call add4x4_idct_%1
.skipblock%2:
%if %2<15
    add          r2, 64
%endif
%endmacro

%macro IDCT_ADD16_10 1
cglobal h264_idct_add16_10_%1, 5,6
    ADD16_OP %1, 0, 4+1*8
    ADD16_OP %1, 1, 5+1*8
    ADD16_OP %1, 2, 4+2*8
    ADD16_OP %1, 3, 5+2*8
    ADD16_OP %1, 4, 6+1*8
    ADD16_OP %1, 5, 7+1*8
    ADD16_OP %1, 6, 6+2*8
    ADD16_OP %1, 7, 7+2*8
    ADD16_OP %1, 8, 4+3*8
    ADD16_OP %1, 9, 5+3*8
    ADD16_OP %1, 10, 4+4*8
    ADD16_OP %1, 11, 5+4*8
    ADD16_OP %1, 12, 6+3*8
    ADD16_OP %1, 13, 7+3*8
    ADD16_OP %1, 14, 6+4*8
    ADD16_OP %1, 15, 7+4*8
    REP_RET
%endmacro

INIT_XMM
IDCT_ADD16_10 sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT_ADD16_10 avx
%endif

;-----------------------------------------------------------------------------
; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
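; IDCT_DC_ADD_OP_10 adds the splatted DC value in m0 to four rows of
; pixels and clips each row to the [m5, m6] = [0, pixel_max] range; the
; AVX variant uses three-operand paddw to drop the four plain movas.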
%macro IDCT_DC_ADD_OP_10 3
    pxor      m5, m5
%if avx_enabled
    paddw     m1, m0, [%1+0   ]
    paddw     m2, m0, [%1+%2  ]
    paddw     m3, m0, [%1+%2*2]
    paddw     m4, m0, [%1+%3  ]
%else
    mova      m1, [%1+0   ]
    mova      m2, [%1+%2  ]
    mova      m3, [%1+%2*2]
    mova      m4, [%1+%3  ]
    paddw     m1, m0
    paddw     m2, m0
    paddw     m3, m0
    paddw     m4, m0
%endif
    CLIPW     m1, m5, m6
    CLIPW     m2, m5, m6
    CLIPW     m3, m5, m6
    CLIPW     m4, m5, m6
    mova [%1+0   ], m1
    mova [%1+%2  ], m2
    mova [%1+%2*2], m3
    mova [%1+%3  ], m4
%endmacro

INIT_MMX
cglobal h264_idct_dc_add_10_mmx2,3,3
    movd      m0, [r1]
    paddd     m0, [pd_32]
    psrad     m0, 6
    lea       r1, [r2*3]
    pshufw    m0, m0, 0
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET

;-----------------------------------------------------------------------------
; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_DC_ADD 1
cglobal h264_idct8_dc_add_10_%1,3,3,7
    mov      r1d, [r1]
    add       r1, 32
    sar       r1, 6
    movd      m0, r1d
    lea       r1, [r2*3]
    SPLATW    m0, m0, 0
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    lea       r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
%endmacro

INIT_XMM
IDCT8_DC_ADD sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT8_DC_ADD avx
%endif

;-----------------------------------------------------------------------------
; void h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
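; The intra path tests the nnzc entries of two adjacent 4x4 blocks as a
; single word. Any AC coefficient sends the pair to the .ac%2 tail below
; (see the AC macro), which runs two full IDCTs; if only the DCs at [r2]
; and [r2+64] are non-zero, one idct_dc_add call covers both blocks.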
%macro AC 2
.ac%2:
    mov  r5d, [r1+(%2+0)*4]
    call add4x4_idct_%1
    mov  r5d, [r1+(%2+1)*4]
    add  r2, 64
    call add4x4_idct_%1
    add  r2, 64
    jmp .skipadd%2
%endmacro
255
256%assign last_block 16
257%macro ADD16_OP_INTRA 3
258    cmp      word [r4+%3], 0
259    jnz .ac%2
260    mov      r5d, [r2+ 0]
261    or       r5d, [r2+64]
262    jz .skipblock%2
263    mov      r5d, [r1+(%2+0)*4]
264    call idct_dc_add_%1
265.skipblock%2:
266%if %2<last_block-2
267    add       r2, 128
268%endif
269.skipadd%2:
270%endmacro
271
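; idct_dc_add_%1 handles the DC of two horizontally adjacent 4x4 blocks
; in one pass: both DC values are packed into m0 (movq + movhps), rounded
; and shifted, then pshuflw/pshufhw splat each one across its own half of
; the register before the 8-pixel-wide add.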
%macro IDCT_ADD16INTRA_10 1
idct_dc_add_%1:
    add       r5, r0
    movq      m0, [r2+ 0]
    movhps    m0, [r2+64]
    paddd     m0, [pd_32]
    psrad     m0, 6
    pshufhw   m0, m0, 0
    pshuflw   m0, m0, 0
    lea       r6, [r3*3]
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r5, r3, r6
    ret

cglobal h264_idct_add16intra_10_%1,5,7,8
    ADD16_OP_INTRA %1, 0, 4+1*8
    ADD16_OP_INTRA %1, 2, 4+2*8
    ADD16_OP_INTRA %1, 4, 6+1*8
    ADD16_OP_INTRA %1, 6, 6+2*8
    ADD16_OP_INTRA %1, 8, 4+3*8
    ADD16_OP_INTRA %1, 10, 4+4*8
    ADD16_OP_INTRA %1, 12, 6+3*8
    ADD16_OP_INTRA %1, 14, 6+4*8
    REP_RET
    AC %1, 8
    AC %1, 10
    AC %1, 12
    AC %1, 14
    AC %1, 0
    AC %1, 2
    AC %1, 4
    AC %1, 6
%endmacro

INIT_XMM
IDCT_ADD16INTRA_10 sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT_ADD16INTRA_10 avx
%endif

%assign last_block 36
;-----------------------------------------------------------------------------
; void h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
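; Chroma: dst is an array of pointers, one per plane. The coefficient
; pointer starts 1024 bytes in (the 16 luma blocks of 64 bytes each come
; first), and the second plane's dst is reloaded from [r0+gprsize]
; halfway through.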
%macro IDCT_ADD8 1
cglobal h264_idct_add8_10_%1,5,7
%ifdef ARCH_X86_64
    mov r10, r0
%endif
    add      r2, 1024
    mov      r0, [r0]
    ADD16_OP_INTRA %1, 16, 4+ 6*8
    ADD16_OP_INTRA %1, 18, 4+ 7*8
    add      r2, 1024-128*2
%ifdef ARCH_X86_64
    mov      r0, [r10+gprsize]
%else
    mov      r0, r0m
    mov      r0, [r0+gprsize]
%endif
    ADD16_OP_INTRA %1, 32, 4+11*8
    ADD16_OP_INTRA %1, 34, 4+12*8
    REP_RET
    AC %1, 16
    AC %1, 18
    AC %1, 32
    AC %1, 34

%endmacro ; IDCT_ADD8

INIT_XMM
IDCT_ADD8 sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT_ADD8 avx
%endif

;-----------------------------------------------------------------------------
; void h264_idct8_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
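; IDCT8_1D: one 8-point IDCT pass over dwords. The odd half is computed
; first; rows 0 and 4 arrive as the memory operands %1 and %2 and are
; only loaded at the end, since the pass needs all eight addressable
; vector registers for the odd-part butterflies.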
%macro IDCT8_1D 2
    SWAP      0, 1
    psrad     m4, m5, 1
    psrad     m1, m0, 1
    paddd     m4, m5
    paddd     m1, m0
    paddd     m4, m7
    paddd     m1, m5
    psubd     m4, m0
    paddd     m1, m3

    psubd     m0, m3
    psubd     m5, m3
    paddd     m0, m7
    psubd     m5, m7
    psrad     m3, 1
    psrad     m7, 1
    psubd     m0, m3
    psubd     m5, m7

    SWAP      1, 7
    psrad     m1, m7, 2
    psrad     m3, m4, 2
    paddd     m3, m0
    psrad     m0, 2
    paddd     m1, m5
    psrad     m5, 2
    psubd     m0, m4
    psubd     m7, m5

    SWAP      5, 6
    psrad     m4, m2, 1
    psrad     m6, m5, 1
    psubd     m4, m5
    paddd     m6, m2

    mova      m2, %1
    mova      m5, %2
    SUMSUB_BA d, 5, 2
    SUMSUB_BA d, 6, 5
    SUMSUB_BA d, 4, 2
    SUMSUB_BA d, 7, 6
    SUMSUB_BA d, 0, 4
    SUMSUB_BA d, 3, 2
    SUMSUB_BA d, 1, 5
    SWAP      7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112*2]
    mova         m6, [%1+ 96*2]
    mova         m5, [%1+ 80*2]
    mova         m3, [%1+ 48*2]
    mova         m2, [%1+ 32*2]
    mova         m1, [%1+ 16*2]
    IDCT8_1D   [%1], [%1+ 64*2]
%endmacro
; %1=int32_t *block, %2=int32_t *dstblock (dctcoef is 32 bits at 10-bit depth)
%macro IDCT8_ADD_SSE_START 2
    IDCT8_1D_FULL %1
%ifdef ARCH_X86_64
    TRANSPOSE4x4D  0,1,2,3,8
    mova    [%2    ], m0
    TRANSPOSE4x4D  4,5,6,7,8
    mova    [%2+8*2], m4
%else
    mova         [%1], m7
    TRANSPOSE4x4D   0,1,2,3,7
    mova           m7, [%1]
    mova    [%2     ], m0
    mova    [%2+16*2], m1
    mova    [%2+32*2], m2
    mova    [%2+48*2], m3
    TRANSPOSE4x4D   4,5,6,7,3
    mova    [%2+ 8*2], m4
    mova    [%2+24*2], m5
    mova    [%2+40*2], m6
    mova    [%2+56*2], m7
%endif
%endmacro

; %1=pixel *dst (uint16_t at 10-bit depth), %2=int32_t *block, %3=int stride
%macro IDCT8_ADD_SSE_END 3
    IDCT8_1D_FULL %2
    mova  [%2     ], m6
    mova  [%2+16*2], m7

    pxor         m7, m7
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m6, m7, %1, %3
    mova         m0, [%2     ]
    mova         m1, [%2+16*2]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m5, m6, m7, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
%endmacro

%macro IDCT8_ADD 1
cglobal h264_idct8_add_10_%1, 3,4,16
%ifndef UNIX64
    %assign pad 16-gprsize-(stack_offset&15)
    sub  rsp, pad
    call h264_idct8_add1_10_%1
    add  rsp, pad
    RET
%endif
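; The worker below spills rows through a stack buffer with aligned movas,
; so it needs rsp 16-byte aligned. On UNIX64 the incoming alignment is
; known and execution falls straight through; other ABIs go through the
; wrapper above, which pads rsp before the call.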

ALIGN 16
; TODO: does not need to use stack
h264_idct8_add1_10_%1:
%assign pad 256+16-gprsize
    sub          rsp, pad
    add   dword [r1], 32

%ifdef ARCH_X86_64
    IDCT8_ADD_SSE_START r1, rsp
    SWAP 1,  9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 5, 13
    SWAP 6, 14
    SWAP 7, 15
    IDCT8_ADD_SSE_START r1+16, rsp+128
    PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
    IDCT8_1D [rsp], [rsp+128]
    SWAP 0,  8
    SWAP 1,  9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 4, 12
    SWAP 5, 13
    SWAP 6, 14
    SWAP 7, 15
    IDCT8_1D [rsp+16], [rsp+144]
    psrad         m8, 6
    psrad         m0, 6
    packssdw      m8, m0
    paddsw        m8, [r0]
    pxor          m0, m0
    CLIPW         m8, m0, [pw_pixel_max]
    mova        [r0], m8
    mova          m8, [pw_pixel_max]
    STORE_DIFF16  m9, m1, m0, m8, r0+r2
    lea           r0, [r0+r2*2]
    STORE_DIFF16 m10, m2, m0, m8, r0
    STORE_DIFF16 m11, m3, m0, m8, r0+r2
    lea           r0, [r0+r2*2]
    STORE_DIFF16 m12, m4, m0, m8, r0
    STORE_DIFF16 m13, m5, m0, m8, r0+r2
    lea           r0, [r0+r2*2]
    STORE_DIFF16 m14, m6, m0, m8, r0
    STORE_DIFF16 m15, m7, m0, m8, r0+r2
%else
    IDCT8_ADD_SSE_START r1,    rsp
    IDCT8_ADD_SSE_START r1+16, rsp+128
    lea           r3, [r0+8]
    IDCT8_ADD_SSE_END r0, rsp,    r2
    IDCT8_ADD_SSE_END r3, rsp+16, r2
%endif ; ARCH_X86_64

    add          rsp, pad
    ret
%endmacro

INIT_XMM
IDCT8_ADD sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT8_ADD avx
%endif

;-----------------------------------------------------------------------------
; void h264_idct8_add4(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
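; IDCT8_ADD4_OP: the nnzc entry gates each 8x8 block, and r1 advances
; 256 bytes per block (64 dctcoefs of 4 bytes). The call target is the
; aligned h264_idct8_add1_10 worker, so the prologue below must leave
; r0/r1/r2 laid out exactly as that worker expects.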
%macro IDCT8_ADD4_OP 3
    cmp       byte [r4+%3], 0
    jz .skipblock%2
    mov      r0d, [r6+%2*4]
    add       r0, r5
    call h264_idct8_add1_10_%1
.skipblock%2:
%if %2<12
    add       r1, 256
%endif
%endmacro

%macro IDCT8_ADD4 1
cglobal h264_idct8_add4_10_%1, 0,7,16
    %assign pad 16-gprsize-(stack_offset&15)
    SUB      rsp, pad
    mov       r5, r0mp
    mov       r6, r1mp
    mov       r1, r2mp
    mov      r2d, r3m
    movifnidn r4, r4mp
    IDCT8_ADD4_OP %1,  0, 4+1*8
    IDCT8_ADD4_OP %1,  4, 6+1*8
    IDCT8_ADD4_OP %1,  8, 4+3*8
    IDCT8_ADD4_OP %1, 12, 6+3*8
    ADD       rsp, pad
    RET
%endmacro ; IDCT8_ADD4

INIT_XMM
IDCT8_ADD4 sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT8_ADD4 avx
%endif