;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)

SECTION .text

cextern pw_2
cextern pw_3
cextern pw_4

; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
    psubusw %5, %2, %1
    psubusw %4, %1, %2
    por     %4, %5
    psubw   %4, %3
%endmacro

; out: %4 = |%1-%2|<%3
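; clobbers: %5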
%macro DIFF_LT   5
    psubusw %4, %2, %1
    psubusw %5, %1, %2
    por     %5, %4 ; |%1-%2|
    pxor    %4, %4
    psubw   %5, %3 ; |%1-%2|-%3
    pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
%endmacro

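; in:  %3=alpha, %4=beta (GPRs)
; out: %1=splatted alpha, %2=splatted beta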
%macro LOAD_AB 4
    movd       %1, %3
    movd       %2, %4
    SPLATW     %1, %1
    SPLATW     %2, %2
%endmacro

; in:  %2=tc reg
; out: %1=splatted tc
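; (each tc byte is duplicated and shifted right by 6, i.e. scaled by 4
; for 10-bit; negative tc0 entries stay negative)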
%macro LOAD_TC 2
    movd        %1, [%2]
    punpcklbw   %1, %1
%if mmsize == 8
    pshufw      %1, %1, 0
%else
    pshuflw     %1, %1, 01010000b
    pshufd      %1, %1, 01010000b
%endif
    psraw       %1, 6
%endmacro

; in: %1=p1, %2=p0, %3=q0, %4=q1
;     %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask
%macro LOAD_MASK 9
    ABS_SUB     %2, %3, %5, %8, %7 ; |p0-q0| - alpha
    ABS_SUB     %1, %2, %6, %9, %7 ; |p1-p0| - beta
    pand        %8, %9
    ABS_SUB     %3, %4, %6, %9, %7 ; |q1-q0| - beta
    pxor        %7, %7
    pand        %8, %9
    pcmpgtw     %7, %8
%endmacro

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask&tc, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
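; p0' = clip(p0+d), q0' = clip(q0-d), both clipped to [0,pw_pixel_max],
; where d = clip3(-tc, tc, ((q0-p0)*4 + (p1-q1) + 4) >> 3)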
%macro DEBLOCK_P0_Q0 7
    psubw   %3, %4
    pxor    %7, %7
    paddw   %3, [pw_4]
    psubw   %7, %5
    psubw   %6, %2, %1
    psllw   %6, 2
    paddw   %3, %6
    psraw   %3, 3
    mova    %6, [pw_pixel_max]
    CLIPW   %3, %7, %5
    pxor    %7, %7
    paddw   %1, %3
    psubw   %2, %3
    CLIPW   %1, %7, %6
    CLIPW   %2, %7, %6
%endmacro

; in: %1=x2 (p2/q2), %2=x1 (p1/q1), %3=p0, %4=q0, %5=mask&tc, %6=tmp
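; out: %1 = x1 + clip3(-tc, tc, ((x2 + ((p0+q0+1)>>1)) >> 1) - x1)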
%macro LUMA_Q1 6
    pavgw       %6, %3, %4      ; (p0+q0+1)>>1
    paddw       %1, %6
    pxor        %6, %6
    psraw       %1, 1
    psubw       %6, %5
    psubw       %1, %2
    CLIPW       %1, %6, %5
    paddw       %1, %2
%endmacro

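; filter p1/q1 (called once per side; m5 holds p2 or q2)
; in:  %1=p0/q0, %2=p1/q1, m5=p2/q2 (same side as %1), m1=p0, m2=q0,
;      m7=mask, tcm=tc, bm=beta
; out: m5=p1'/q1', %3=(|p2-p0|<beta) mask, used later to adjust tc
; clobbers: m4, m6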
%macro LUMA_DEBLOCK_ONE 3
    DIFF_LT     m5, %1, bm, m4, m6
    pxor        m6, m6
    mova        %3, m4
    pcmpgtw     m6, tcm
    pand        m4, tcm
    pandn       m6, m7
    pand        m4, m6
    LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro

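; write back the 4 middle pixels (p1 p0 q0 q1) of each transposed row
; in: m0-m3=rows, %1=pix+3*stride (xmm only), %2=3*stride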
%macro LUMA_H_STORE 2
%if mmsize == 8
    movq        [r0-4], m0
    movq        [r0+r1-4], m1
    movq        [r0+r1*2-4], m2
    movq        [r0+%2-4], m3
%else
    movq        [r0-4], m0
    movhps      [r0+r1-4], m0
    movq        [r0+r1*2-4], m1
    movhps      [%1-4], m1
    movq        [%1+r1-4], m2
    movhps      [%1+r1*2-4], m2
    movq        [%1+%2-4], m3
    movhps      [%1+r1*4-4], m3
%endif
%endmacro

%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_10(uint16_t *pix, int stride, int alpha, int beta,
;                           int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
    %assign pad 5*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define am  [rsp+mmsize*3]
    %define bm  [rsp+mmsize*4]
    SUB        rsp, pad
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB     m4, m5, r2d, r3d
    mov         r3, 32/mmsize
    mov         r2, r0
    sub         r0, r1
    mova        am, m4
    sub         r0, r1
    mova        bm, m5
    sub         r0, r1
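; r0 = pix-3*stride, r2 = pix, r3 = loop counter, r4 = tc0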
.loop:
    mova        m0, [r0+r1]    ; p1
    mova        m1, [r0+r1*2]  ; p0
    mova        m2, [r2]       ; q0
    mova        m3, [r2+r1]    ; q1

    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC     m6, r4
    mova       tcm, m6

    mova        m5, [r0]       ; p2
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova   [r0+r1], m5         ; p1'

    mova        m5, [r2+r1*2]  ; q2
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova   [r2+r1], m5         ; q1'

    pxor        m5, m5
    mova        m6, tcm
    pcmpgtw     m5, tcm        ; m5 = (tc < 0)
    psubw       m6, ms1        ; tc++ if |p2-p0| < beta
    pandn       m5, m7         ; mask &= (tc >= 0)
    psubw       m6, ms2        ; tc++ if |q2-q0| < beta
    pand        m5, m6         ; m5 = tc & mask
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova [r0+r1*2], m1         ; p0'
    mova      [r2], m2         ; q0'

    add         r0, mmsize
    add         r2, mmsize
    add         r4, mmsize/8
    dec         r3
    jg .loop
    ADD         rsp, pad
    RET

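;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_10(uint16_t *pix, int stride, int alpha, int beta,
;                           int8_t *tc0)
;-----------------------------------------------------------------------------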
cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
    %assign pad 7*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define p1m [rsp+mmsize*3]
    %define p2m [rsp+mmsize*4]
    %define am  [rsp+mmsize*5]
    %define bm  [rsp+mmsize*6]
    SUB        rsp, pad
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB     m4, m5, r2d, r3d
    mov         r3, r1
    mova        am, m4
    add         r3, r1
    mov         r5, 32/mmsize
    mova        bm, m5
    add         r3, r1
%if mmsize == 16
    mov         r2, r0
    add         r2, r3
%endif
.loop:
%if mmsize == 8
    movq        m2, [r0-8]     ; y q2 q1 q0
    movq        m7, [r0+0]
    movq        m5, [r0+r1-8]
    movq        m3, [r0+r1+0]
    movq        m0, [r0+r1*2-8]
    movq        m6, [r0+r1*2+0]
    movq        m1, [r0+r3-8]
    TRANSPOSE4x4W 2, 5, 0, 1, 4
    SWAP         2, 7
    movq        m7, [r0+r3]
    TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
    movu        m5, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
    movu        m0, [r0+r1-8]
    movu        m2, [r0+r1*2-8]
    movu        m3, [r2-8]
    TRANSPOSE4x4W 5, 0, 2, 3, 6
    mova       tcm, m3

    movu        m4, [r2+r1-8]
    movu        m1, [r2+r1*2-8]
    movu        m3, [r2+r3-8]
    movu        m7, [r2+r1*4-8]
    TRANSPOSE4x4W 4, 1, 3, 7, 6

    mova        m6, tcm
    punpcklqdq  m6, m7
    punpckhqdq  m5, m4
    SBUTTERFLY qdq, 0, 1, 7
    SBUTTERFLY qdq, 2, 3, 7
%endif

    mova       p2m, m6
    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC     m6, r4
    mova       tcm, m6

    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova       p1m, m5

    mova        m5, p2m
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova       p2m, m5

    pxor        m5, m5
    mova        m6, tcm
    pcmpgtw     m5, tcm
    psubw       m6, ms1
    pandn       m5, m7
    psubw       m6, ms2
    pand        m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova        m0, p1m
    mova        m3, p2m
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r2, r3

    add         r4, mmsize/8
    lea         r0, [r0+r1*(mmsize/2)]
    lea         r2, [r2+r1*(mmsize/2)]
    dec         r5
    jg .loop
    ADD        rsp, pad
    RET
%endmacro

%if ARCH_X86_64
; in:  m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
;      m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
    LOAD_MASK   m0, m1, m2, m3, m12, m13, m7, m4, m6
    LOAD_TC     m6, r4
    DIFF_LT     m8, m1, m13, m10, m4
    DIFF_LT     m9, m2, m13, m11, m4
    pand        m6, m7

    mova       m14, m6
    pxor        m4, m4
    pcmpgtw     m6, m4
    pand        m6, m14

    mova        m5, m10
    pand        m5, m6
    LUMA_Q1 m8, m0, m1, m2, m5, m4

    mova        m5, m11
    pand        m5, m6
    LUMA_Q1 m9, m3, m1, m2, m5, m4

    pxor        m4, m4
    psubw       m6, m10
    pcmpgtw     m4, m14
    pandn       m4, m7
    psubw       m6, m11
    pand        m4, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6

    SWAP         0, 8
    SWAP         3, 9
%endmacro

%macro DEBLOCK_LUMA_64 0
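;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_10(uint16_t *pix, int stride, int alpha, int beta,
;                           int8_t *tc0)
;-----------------------------------------------------------------------------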
cglobal deblock_v_luma_10, 5,5,15
    %define p2 m8
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define q2 m9
    %define mask0 m7
    %define mask1 m10
    %define mask2 m11
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB    m12, m13, r2d, r3d
    mov         r2, r0
    sub         r0, r1
    sub         r0, r1
    sub         r0, r1
    mov         r3, 2
.loop:
    mova        p2, [r0]
    mova        p1, [r0+r1]
    mova        p0, [r0+r1*2]
    mova        q0, [r2]
    mova        q1, [r2+r1]
    mova        q2, [r2+r1*2]
    DEBLOCK_LUMA_INTER_SSE2
    mova   [r0+r1], p1
    mova [r0+r1*2], p0
    mova      [r2], q0
    mova   [r2+r1], q1
    add         r0, mmsize
    add         r2, mmsize
    add         r4, 2
    dec         r3
    jg .loop
    REP_RET

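;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_10(uint16_t *pix, int stride, int alpha, int beta,
;                           int8_t *tc0)
;-----------------------------------------------------------------------------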
cglobal deblock_h_luma_10, 5,7,15
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB    m12, m13, r2d, r3d
    mov         r2, r1
    add         r2, r1
    add         r2, r1
    mov         r5, r0
    add         r5, r2
    mov         r6, 2
.loop:
    movu        m8, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
    movu        m0, [r0+r1-8]
    movu        m2, [r0+r1*2-8]
    movu        m9, [r5-8]
    movu        m5, [r5+r1-8]
    movu        m1, [r5+r1*2-8]
    movu        m3, [r5+r2-8]
    movu        m7, [r5+r1*4-8]

    TRANSPOSE4x4W 8, 0, 2, 9, 10
    TRANSPOSE4x4W 5, 1, 3, 7, 10

    punpckhqdq  m8, m5
    SBUTTERFLY qdq, 0, 1, 10
    SBUTTERFLY qdq, 2, 3, 10
    punpcklqdq  m9, m7

    DEBLOCK_LUMA_INTER_SSE2

    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r5, r2
    add         r4, 2
    lea         r0, [r0+r1*8]
    lea         r5, [r5+r1*8]
    dec         r6
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_64
%endif
%endif

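; copy %2 to %1: SWAP if %1 is a register, mova if it is a memory operand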
%macro SWAPMOVA 2
%ifid %1
    SWAP %1, %2
%else
    mova %1, %2
%endif
%endmacro

; in: t0-t2: tmp registers
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%if ARCH_X86_64
    paddw     t0, %3, %2
    mova      t2, %4
    paddw     t2, %3
%else
    mova      t0, %3
    mova      t2, %4
    paddw     t0, %2
    paddw     t2, %3
%endif
    paddw     t0, %1
    paddw     t2, t2
    paddw     t0, %5
    paddw     t2, %9
    paddw     t0, %9    ; (p2 + p1 + p0 + q0 + 2)
    paddw     t2, t0    ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)

    psrlw     t2, 3
    psrlw     t1, t0, 2
    psubw     t2, %3
    psubw     t1, %2
    pand      t2, %8
    pand      t1, %8
    paddw     t2, %3
    paddw     t1, %2
    SWAPMOVA %11, t1

    psubw     t1, t0, %3
    paddw     t0, t0
    psubw     t1, %5
    psubw     t0, %3
    paddw     t1, %6
    paddw     t1, %2
    paddw     t0, %6
    psrlw     t1, 2     ; (2*p1 + p0 + q1 + 2)/4
    psrlw     t0, 3     ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3

    pxor      t0, t1
    pxor      t1, %1
    pand      t0, %8
    pand      t1, %7
    pxor      t0, t1
    pxor      t0, %1
    SWAPMOVA %10, t0
    SWAPMOVA %12, t2
%endmacro

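; %1 = number of stack slots to reserve; t0-t3 map to m4-m7 and
; t4 through t(%1+3) become mmsize-wide stack slots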
%macro LUMA_INTRA_INIT 1
    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
    %assign i 4
%rep %1
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
    %assign i i+1
%endrep
    SUB    rsp, pad
%endmacro

; in: %1-%3=tmp, %4=p2, %5=q2
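; out: %1=mask1p, %2=mask0, %3=mask1q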
%macro LUMA_INTRA_INTER 5
    LOAD_AB t0, t1, r2d, r3d
    mova    %1, t0
    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%if ARCH_X86_64
    mova    %2, t0        ; mask0
    psrlw   t3, %1, 2
%else
    mova    t3, %1
    mova    %2, t0        ; mask0
    psrlw   t3, 2
%endif
    paddw   t3, [pw_2]    ; alpha/4+2
    DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
    pand    t2, %2
    mova    t3, %5        ; q2
    mova    %1, t2        ; mask1
    DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
    pand    t2, %1
    mova    t3, %4        ; p2
    mova    %3, t2        ; mask1q
    DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
    pand    t2, %1
    mova    %1, t2        ; mask1p
%endmacro

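; load the 8 pixels around a vertical edge and transpose them
; out: m0=p1, m1=p0, m2=q0, m3=q1, t4=p3, t5=p2, t6=q2, t7=q3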
%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r4-8]
    TRANSPOSE4x4W 4, 5, 0, 1, 2
    mova    t4, t0        ; p3
    mova    t5, t1        ; p2

    movu    m2, [r0]
    movu    m3, [r0+r1]
    movu    t0, [r0+r1*2]
    movu    t1, [r0+r4]
    TRANSPOSE4x4W 2, 3, 4, 5, 6
    mova    t6, t0        ; q2
    mova    t7, t1        ; q3
%else
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r5-8]
    movu    m2, [r4-8]
    movu    m3, [r4+r1-8]
    movu    t2, [r4+r1*2-8]
    movu    t3, [r4+r5-8]
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
    mova    t4, t0        ; p3
    mova    t5, t1        ; p2
    mova    t6, t2        ; q2
    mova    t7, t3        ; q3
%endif
%endmacro

; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
    TRANSPOSE4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r4-8], m%4
    movq       m%1, %8
    TRANSPOSE4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r4], m%1
%else
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r5-8], m%4
    movhps     [r4-8], m%1
    movhps     [r4+r1-8], m%2
    movhps     [r4+r1*2-8], m%3
    movhps     [r4+r5-8], m%4
%ifnum %8
    SWAP       %1, %8
%else
    mova       m%1, %8
%endif
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r5], m%1
    movhps     [r4], m%5
    movhps     [r4+r1], m%6
    movhps     [r4+r1*2], m%7
    movhps     [r4+r5], m%1
%endif
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 0
cglobal deblock_v_luma_intra_10, 4,7,16
    %define t0 m1
    %define t1 m2
    %define t2 m4
    %define p2 m8
    %define p1 m9
    %define p0 m10
    %define q0 m11
    %define q1 m12
    %define q2 m13
    %define aa m5
    %define bb m14
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    neg     r4
    add     r4, r0     ; pix-4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl    r2d, 2
    shl    r3d, 2
    LOAD_AB aa, bb, r2d, r3d
.loop:
    mova    p2, [r4+r1]
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
    mova    q2, [r0+2*r1]

    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
    mova    t2, aa
    psrlw   t2, 2
    paddw   t2, m0 ; alpha/4+2
    DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
    DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,16
    %define t0 m15
    %define t1 m14
    %define t2 m2
    %define q3 m5
    %define q2 m8
    %define q1 m9
    %define q0 m10
    %define p0 m11
    %define p1 m12
    %define p2 m13
    %define p3 m4
    %define spill [rsp]
    %assign pad 24-(stack_offset&15)
    SUB     rsp, pad
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl    r2d, 2
    shl    r3d, 2
.loop:
    movu    q3, [r0-8]
    movu    q2, [r0+r1-8]
    movu    q1, [r0+r1*2-8]
    movu    q0, [r0+r5-8]
    movu    p0, [r4-8]
    movu    p1, [r4+r1-8]
    movu    p2, [r4+r1*2-8]
    movu    p3, [r4+r5-8]
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1

    LOAD_AB m1, m2, r2d, r3d
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
    psrlw   m1, 2
    paddw   m1, m0 ; alpha/4+2
    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1

    mova spill, q3
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    mova    m7, spill

    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14

    lea     r0, [r0+r1*8]
    lea     r4, [r4+r1*8]
    dec     r6
    jg .loop
    ADD    rsp, pad
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64
%endif

%endif

%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 3
    lea     r4, [r1*4]
    lea     r5, [r1*3]
    neg     r4
    add     r4, r0
    mov     r6, 32/mmsize
    shl    r2d, 2
    shl    r3d, 2
.loop:
    mova    m0, [r4+r1*2] ; p1
    mova    m1, [r4+r5]   ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova    t3, [r0+r1*2] ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    ADD    rsp, pad
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 8
%if mmsize == 8
    lea     r4, [r1*3]
    mov     r5, 32/mmsize
%else
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
    mov     r6, 32/mmsize
%endif
    shl    r2d, 2
    shl    r3d, 2
.loop:
    LUMA_H_INTRA_LOAD
    LUMA_INTRA_INTER t8, t9, t10, t5, t6

    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    mova    t3, t6     ; q2
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5

    mova    m2, t4
    mova    m0, t11
    mova    m1, t5
    mova    m3, t8
    mova    m6, t6

    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7

    lea     r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
    dec     r5
%else
    lea     r4, [r4+r1*(mmsize/2)]
    dec     r6
%endif
    jg .loop
    ADD    rsp, pad
    RET
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
%endif

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
    mova    %6, [pw_2]
    paddw   %6, %3
    paddw   %6, %4
    paddw   %7, %6, %2
    paddw   %6, %1
    paddw   %6, %3
    paddw   %7, %4
    psraw   %6, 2
    psraw   %7, 2
    psubw   %6, %1
    psubw   %7, %2
    pand    %6, %5
    pand    %7, %5
    paddw   %1, %6
    paddw   %2, %7
%endmacro

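; in: r0=pix-2*stride, %1=pix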
%macro CHROMA_V_LOAD 1
    mova        m0, [r0]    ; p1
    mova        m1, [r0+r1] ; p0
    mova        m2, [%1]    ; q0
    mova        m3, [%1+r1] ; q1
%endmacro

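; store p0' and q0' (r0 = pix-2*stride)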
%macro CHROMA_V_STORE 0
    mova [r0+1*r1], m1
    mova [r0+2*r1], m2
%endmacro

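; in:  %2=pointer to 4x tc0
; out: %1=tc scaled by 4 for 10-bit, one value per 2 pixels
;      (negative tc0 entries stay negative)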
%macro CHROMA_V_LOAD_TC 2
    movd        %1, [%2]
    punpcklbw   %1, %1
    punpcklwd   %1, %1
    psraw       %1, 6
%endmacro

%macro DEBLOCK_CHROMA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
;                             int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
    mov         r5, r0
    sub         r0, r1
    sub         r0, r1
    shl        r2d, 2
    shl        r3d, 2
%if mmsize < 16
    mov         r6, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r5
    LOAD_AB     m4, m5, r2d, r3d
    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor        m4, m4
    CHROMA_V_LOAD_TC m6, r4
    psubw       m6, [pw_3]
    pmaxsw      m6, m4
    pand        m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add         r0, mmsize
    add         r5, mmsize
    add         r4, mmsize/4
    dec         r6
    jg .loop
    REP_RET
%else
    RET
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra_10(uint16_t *pix, int stride, int alpha,
;                                   int beta)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
    mov         r4, r0
    sub         r0, r1
    sub         r0, r1
    shl        r2d, 2
    shl        r3d, 2
%if mmsize < 16
    mov         r5, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r4
    LOAD_AB     m4, m5, r2d, r3d
    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add         r0, mmsize
    add         r4, mmsize
    dec         r5
    jg .loop
    REP_RET
%else
    RET
%endif
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_CHROMA
%endif