;*****************************************************************************
;* SSE2-optimized HEVC deblocking code
;*****************************************************************************
;* Copyright (C) 2013 VTT
;*
;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)
pw_m1:        times 8 dw -1
pw_m2:        times 8 dw -2
pd_1:         times 4 dd  1

cextern pw_4
cextern pw_8

SECTION .text
INIT_XMM sse2

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
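; (with base3 = base + 3*stride and stride3 = 3*stride, as all callers below
; pass, PASS8ROWS covers rows 0..7: [base3+stride3] = base + 6*stride and
; [base3+stride*4] = base + 7*stride)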

; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8B_LOAD 8
    movd             m0, %1
    movd             m2, %2
    movd             m1, %3
    movd             m3, %4

    punpcklbw        m0, m2
    punpcklbw        m1, m3
    punpcklwd        m0, m1

    movd             m4, %5
    movd             m6, %6
    movd             m5, %7
    movd             m3, %8

    punpcklbw        m4, m6
    punpcklbw        m5, m3
    punpcklwd        m4, m5

    punpckhdq        m2, m0, m4
    punpckldq        m0, m4

    pxor             m5, m5
    punpckhbw        m1, m0, m5
    punpcklbw        m0, m5
    punpckhbw        m3, m2, m5
    punpcklbw        m2, m5
%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    packuswb         m0, m0
    packuswb         m1, m1
    packuswb         m2, m2
    packuswb         m3, m3

    punpcklbw        m0, m1
    punpcklbw        m2, m3

    punpckhwd        m6, m0, m2
    punpcklwd        m0, m2

    movd             %1, m0
    pshufd           m0, m0, 0x39
    movd             %2, m0
    pshufd           m0, m0, 0x39
    movd             %3, m0
    pshufd           m0, m0, 0x39
    movd             %4, m0

    movd             %5, m6
    pshufd           m6, m6, 0x39
    movd             %6, m6
    pshufd           m6, m6, 0x39
    movd             %7, m6
    pshufd           m6, m6, 0x39
    movd             %8, m6
%endmacro

; in: 8 rows of 4 words in %1..%8
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8W_LOAD 8
    movq             m0, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklwd        m0, m2
    punpcklwd        m1, m3
    punpckhdq        m2, m0, m1
    punpckldq        m0, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq             m3, %8

    punpcklwd        m4, m6
    punpcklwd        m5, m3
    punpckhdq        m6, m4, m5
    punpckldq        m4, m5

    punpckhqdq       m1, m0, m4
    punpcklqdq       m0, m4
    punpckhqdq       m3, m2, m6
    punpcklqdq       m2, m6
%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 words in %1..%8
%macro TRANSPOSE8x4W_STORE 8
    pxor             m5, m5; zeros reg
    CLIPW            m0, m5, [pw_pixel_max]
    CLIPW            m1, m5, [pw_pixel_max]
    CLIPW            m2, m5, [pw_pixel_max]
    CLIPW            m3, m5, [pw_pixel_max]

    punpckhwd        m4, m0, m1
    punpcklwd        m0, m1
    punpckhwd        m5, m2, m3
    punpcklwd        m2, m3
    punpckhdq        m6, m0, m2
    punpckldq        m0, m2

    movq             %1, m0
    movhps           %2, m0
    movq             %3, m6
    movhps           %4, m6

    punpckhdq        m6, m4, m5
    punpckldq        m4, m5

    movq             %5, m4
    movhps           %6, m4
    movq             %7, m6
    movhps           %8, m6
%endmacro

; in: 8 rows of 8 bytes in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8B_LOAD 8
    movq             m7, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklbw        m7, m2
    punpcklbw        m1, m3
    punpcklwd        m3, m7, m1
    punpckhwd        m7, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq            m15, %8

    punpcklbw        m4, m6
    punpcklbw        m5, m15
    punpcklwd        m9, m4, m5
    punpckhwd        m4, m5

    punpckldq        m1, m3, m9;  0, 1
    punpckhdq        m3, m9;  2, 3

    punpckldq        m5, m7, m4;  4, 5
    punpckhdq        m7, m4;  6, 7

    pxor            m13, m13

    punpcklbw        m0, m1, m13; 0 in 16 bit
    punpckhbw        m1, m13; 1 in 16 bit

    punpcklbw        m2, m3, m13; 2
    punpckhbw        m3, m13; 3

    punpcklbw        m4, m5, m13; 4
    punpckhbw        m5, m13; 5

    punpcklbw        m6, m7, m13; 6
    punpckhbw        m7, m13; 7
%endmacro

; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 bytes in %1..%8
%macro TRANSPOSE8x8B_STORE 8
    packuswb         m0, m0
    packuswb         m1, m1
    packuswb         m2, m2
    packuswb         m3, m3
    packuswb         m4, m4
    packuswb         m5, m5
    packuswb         m6, m6
    packuswb         m7, m7

    punpcklbw        m0, m1
    punpcklbw        m2, m3

    punpckhwd        m8, m0, m2
    punpcklwd        m0, m2

    punpcklbw        m4, m5
    punpcklbw        m6, m7

    punpckhwd        m9, m4, m6
    punpcklwd        m4, m6

    punpckhdq       m10, m0, m4; 2, 3
    punpckldq        m0, m4;   0, 1

    punpckldq       m11, m8, m9;  4, 5
    punpckhdq        m8, m9;   6, 7
    movq             %1, m0
    movhps           %2, m0
    movq             %3, m10
    movhps           %4, m10
    movq             %5, m11
    movhps           %6, m11
    movq             %7, m8
    movhps           %8, m8
%endmacro

; in: 8 rows of 8 words in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8W_LOAD 8
    movdqu           m0, %1
    movdqu           m1, %2
    movdqu           m2, %3
    movdqu           m3, %4
    movdqu           m4, %5
    movdqu           m5, %6
    movdqu           m6, %7
    movdqu           m7, %8
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8
%endmacro

; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 words in %1..%8
%macro TRANSPOSE8x8W_STORE 8
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8

    pxor             m8, m8
    CLIPW            m0, m8, [pw_pixel_max]
    CLIPW            m1, m8, [pw_pixel_max]
    CLIPW            m2, m8, [pw_pixel_max]
    CLIPW            m3, m8, [pw_pixel_max]
    CLIPW            m4, m8, [pw_pixel_max]
    CLIPW            m5, m8, [pw_pixel_max]
    CLIPW            m6, m8, [pw_pixel_max]
    CLIPW            m7, m8, [pw_pixel_max]

    movdqu           %1, m0
    movdqu           %2, m1
    movdqu           %3, m2
    movdqu           %4, m3
    movdqu           %5, m4
    movdqu           %6, m5
    movdqu           %7, m6
    movdqu           %8, m7
%endmacro

; in: %2 clobbered
; out: %1
; mask in m11
; clobbers m10
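; i.e. %1 = (%2 & m11) | (%1 & ~m11)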
%macro MASKED_COPY 2
    pand             %2, m11 ; and mask
    pandn           m10, m11, %1; and -mask
    por              %2, m10
    mova             %1, %2
%endmacro

; in: %2 clobbered
; out: %1
; mask in %3, will be clobbered
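; i.e. %1 = (%2 & %3) | (%1 & ~%3)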
%macro MASKED_COPY2 3
    pand             %2, %3 ; and mask
    pandn            %3, %1; and -mask
    por              %2, %3
    mova             %1, %2
%endmacro

ALIGN 16
; input in m0 ... m3 and tcs in tc (r2). Output in m1 and m2
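; computes the HEVC chroma weak filter:
;   delta0 = av_clip((((q0 - p0) << 2) + p1 - q1 + 4) >> 3, -tc, tc)
;   p0' = p0 + delta0, q0' = q0 - delta0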
%macro CHROMA_DEBLOCK_BODY 1
    psubw            m4, m2, m1; q0 - p0
    psubw            m5, m0, m3; p1 - q1
    psllw            m4, 2; << 2
    paddw            m5, m4

    ;tc calculations
    movd             m6, [tcq]; tc0
    punpcklwd        m6, m6
    movd             m4, [tcq+4]; tc1
    punpcklwd        m4, m4
    shufps           m6, m4, 0; tc0, tc1
    pmullw           m4, m6, [pw_m1]; -tc0, -tc1
    ;end tc calculations

    paddw            m5, [pw_4]; +4
    psraw            m5, 3; >> 3

    psllw            m4, %1-8; << (BIT_DEPTH - 8)
    psllw            m6, %1-8; << (BIT_DEPTH - 8)
    pmaxsw           m5, m4
    pminsw           m5, m6
    paddw            m1, m5; p0 + delta0
    psubw            m2, m5; q0 - delta0
%endmacro

; input in m0 ... m7, betas in r2, tcs in r3. Output in m1 ... m6
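; per-column activity: dp = |p2 - 2*p1 + p0|, dq = |q2 - 2*q1 + q0|;
; each 4-pixel segment is filtered only if d0 + d3 < beta, where
; d0 = dp0 + dq0 and d3 = dp3 + dq3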
%macro LUMA_DEBLOCK_BODY 2
    psllw            m9, m2, 1; *2
    psubw           m10, m1, m9
    paddw           m10, m3
    ABS1            m10, m11; 0dp0, 0dp3, 1dp0, 1dp3

    psllw            m9, m5, 1; *2
    psubw           m11, m6, m9
    paddw           m11, m4
    ABS1            m11, m13; 0dq0, 0dq3, 1dq0, 1dq3

    ;beta calculations
    mov            r11d, [betaq]; 32-bit load, beta is an int*
    shl             r11, %1 - 8
    movd            m13, r11d; beta0
    add           betaq, 4
    punpcklwd       m13, m13
    mov            r12d, [betaq]
    shl             r12, %1 - 8
    movd            m14, r12d; beta1
    punpcklwd       m14, m14
    shufps          m13, m14, 0; beta0, beta1
    ;end beta calculations

    paddw            m9, m10, m11;   0d0, 0d3, 1d0, 1d3

    pshufhw         m14, m9, 0x0f ;0b00001111;  0d3 0d3 0d0 0d0 in high
    pshuflw         m14, m14, 0x0f ;0b00001111;  1d3 1d3 1d0 1d0 in low

    pshufhw          m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
    pshuflw          m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3

    paddw           m14, m9; 0d0+0d3, 1d0+1d3

    ;compare
    pcmpgtw         m15, m13, m14; beta0, beta1
    movmskps        r13, m15; filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3), 1d0 + 1d3 < beta1 (bit 0 or 1)
    cmp             r13, 0
    je .bypassluma

    ;weak / strong decision compare to beta_2
    psraw           m15, m13, 2;   beta >> 2
    psllw            m8, m9, 1;
    pcmpgtw         m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
    movmskps        r14, m15;
    ;end weak / strong decision

    ; weak filter nd_p/q calculation
    pshufd           m8, m10, 0x31
    psrld            m8, 16
    paddw            m8, m10
    movd            r7d, m8
    and              r7, 0xffff; 1dp0 + 1dp3
    pshufd           m8, m8, 0x4E
    movd            r8d, m8
    and              r8, 0xffff; 0dp0 + 0dp3

    pshufd           m8, m11, 0x31
    psrld            m8, 16
    paddw            m8, m11
    movd            r9d, m8
    and              r9, 0xffff; 1dq0 + 1dq3
    pshufd           m8, m8, 0x4E
    movd           r10d, m8
    and             r10, 0xffff; 0dq0 + 0dq3
    ; end calc for weak filter

    ; filtering mask
    mov              r2, r13
    shr              r2, 3
    movd            m15, r2d
    and             r13, 1
    movd            m11, r13d
    shufps          m11, m15, 0
    shl              r2, 1
    or              r13, r2

    pcmpeqd         m11, [pd_1]; filtering mask

    ;decide between strong and weak filtering
    ;tc25 calculations
    mov             r2d, [tcq]
    shl              r2, %1 - 8
    movd             m8, r2d; tc0
    add             tcq, 4
    mov             r3d, [tcq]
    shl              r3, %1 - 8
    movd             m9, r3d; tc1
    add             r2d, r3d; tc0 + tc1
    jz .bypassluma
    punpcklwd        m8, m8
    punpcklwd        m9, m9
    shufps           m8, m9, 0; tc0, tc1
    mova             m9, m8
    psllw            m8, 2; tc << 2
    pavgw            m8, m9; tc25 = ((tc * 5 + 1) >> 1)
    ;end tc25 calculations

    ;----beta_3 comparison-----
    psubw           m12, m0, m3;      p3 - p0
    ABS1            m12, m14; abs(p3 - p0)

    psubw           m15, m7, m4;      q3 - q0
    ABS1            m15, m14; abs(q3 - q0)

    paddw           m12, m15; abs(p3 - p0) + abs(q3 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    psraw           m13, 3; beta >> 3
    pcmpgtw         m13, m12;
    movmskps         r2, m13;
    and             r14, r2; strong mask, beta_2 and beta_3 comparisons
    ;----beta_3 comparison end-----
    ;----tc25 comparison---
    psubw           m12, m3, m4;      p0 - q0
    ABS1            m12, m14; abs(p0 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    pcmpgtw          m8, m12; tc25 comparisons
    movmskps         r2, m8;
    and             r14, r2; strong mask, beta_2, beta_3 and tc25 comparisons
    ;----tc25 comparison end---
    mov              r2, r14;
    shr              r2, 1;
    and             r14, r2; strong mask, bits 2 and 0

    pmullw          m14, m9, [pw_m2]; -tc * 2
    psllw            m9, 1;  tc * 2

    and             r14, 5; 0b101
    mov              r2, r14; strong mask
    shr             r14, 2;
    movd            m12, r14d; store to xmm for mask generation
    shl             r14, 1
    and              r2, 1
    movd            m10, r2d; store to xmm for mask generation
    or              r14, r2; final strong mask, bits 1 and 0
    jz .weakfilter

    shufps          m10, m12, 0
    pcmpeqd         m10, [pd_1]; strong mask

    mova            m13, [pw_4]; 4 in every cell
    pand            m11, m10; combine filtering mask and strong mask
    paddw           m12, m2, m3;          p1 +   p0
    paddw           m12, m4;          p1 +   p0 +   q0
    mova            m10, m12; copy
    psllw           m12, 1;         2*p1 + 2*p0 + 2*q0
    paddw           m12, m1;   p2 + 2*p1 + 2*p0 + 2*q0
    paddw           m12, m5;   p2 + 2*p1 + 2*p0 + 2*q0 + q1
    paddw           m12, m13;  p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
    psraw           m12, 3;  ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
    psubw           m12, m3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0
    pmaxsw          m12, m14
    pminsw          m12, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m12, m3; p0'

    paddw           m15, m1, m10; p2 + p1 + p0 + q0
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13; p2 + p1 + p0 + q0 + 2
    psraw           m15, 2;  (p2 + p1 + p0 + q0 + 2) >> 2
    psubw           m15, m2; ((p2 + p1 + p0 + q0 + 2) >> 2) - p1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m2; p1'

    paddw            m8, m1, m0;     p3 +   p2
    psllw            m8, 1;    2*p3 + 2*p2
    paddw            m8, m1;   2*p3 + 3*p2
    paddw            m8, m10;  2*p3 + 3*p2 + p1 + p0 + q0
    psllw           m13, 1; 4 in every cell
    paddw            m8, m13;  2*p3 + 3*p2 + p1 + p0 + q0 + 4
    psraw            m8, 3;   (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
    psubw            m8, m1; ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m1; p2'
    MASKED_COPY      m1, m8

    paddw            m8, m3, m4;         p0 +   q0
    paddw            m8, m5;         p0 +   q0 +   q1
    psllw            m8, 1;        2*p0 + 2*q0 + 2*q1
    paddw            m8, m2;  p1 + 2*p0 + 2*q0 + 2*q1
    paddw            m8, m6;  p1 + 2*p0 + 2*q0 + 2*q1 + q2
    paddw            m8, m13; p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
    psraw            m8, 3;  (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3
    psubw            m8, m4;
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m4; q0'
    MASKED_COPY      m2, m15

    paddw           m15, m3, m4;   p0 + q0
    paddw           m15, m5;   p0 + q0 + q1
    mova            m10, m15;
    paddw           m15, m6;   p0 + q0 + q1 + q2
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13;  p0 + q0 + q1 + q2 + 2
    psraw           m15, 2;   (p0 + q0 + q1 + q2 + 2) >> 2
    psubw           m15, m5; ((p0 + q0 + q1 + q2 + 2) >> 2) - q1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m5; q1'

    paddw           m13, m7;      q3 + 2
    paddw           m13, m6;      q3 +  q2 + 2
    psllw           m13, 1;     2*q3 + 2*q2 + 4
    paddw           m13, m6;    2*q3 + 3*q2 + 4
    paddw           m13, m10;   2*q3 + 3*q2 + q1 + q0 + p0 + 4
    psraw           m13, 3;    (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
    psubw           m13, m6;  ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
    pmaxsw          m13, m14
    pminsw          m13, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m13, m6; q2'

    MASKED_COPY      m6, m13
    MASKED_COPY      m5, m15
    MASKED_COPY      m4, m8
    MASKED_COPY      m3, m12

.weakfilter:
    not             r14; strong mask -> weak mask
    and             r14, r13; final weak filtering mask, bits 0 and 1
    jz .store

    ; weak filtering mask
    mov              r2, r14
    shr              r2, 1
    movd            m12, r2d
    and             r14, 1
    movd            m11, r14d
    shufps          m11, m12, 0
    pcmpeqd         m11, [pd_1]; filtering mask

    mov             r13, r11; beta0
    shr             r13, 1;
    add             r11, r13
    shr             r11, 3; (beta0 + (beta0 >> 1)) >> 3

    mov             r13, r12; beta1
    shr             r13, 1;
    add             r12, r13
    shr             r12, 3; (beta1 + (beta1 >> 1)) >> 3

    mova            m13, [pw_8]
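    ; delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4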
    psubw           m12, m4, m3 ; q0 - p0
    psllw           m10, m12, 3; 8 * (q0 - p0)
    paddw           m12, m10 ; 9 * (q0 - p0)

    psubw           m10, m5, m2 ; q1 - p1
    psllw            m8, m10, 1; 2 * (q1 - p1)
    paddw           m10, m8; 3 * (q1 - p1)
    psubw           m12, m10; 9 * (q0 - p0) - 3 * (q1 - p1)
    paddw           m12, m13; + 8
    psraw           m12, 4; >> 4, delta0
    PABSW           m13, m12; abs(delta0)

    psllw           m10, m9, 2; 8 * tc
    paddw           m10, m9; 10 * tc
    pcmpgtw         m10, m13
    pand            m11, m10

    psraw            m9, 1;   tc * 2 -> tc
    psraw           m14, 1; -tc * 2 -> -tc

    pmaxsw          m12, m14
    pminsw          m12, m9;  av_clip(delta0, -tc, tc)

    psraw            m9, 1;   tc -> tc / 2
    pmullw          m14, m9, [pw_m1]; -tc / 2

    pavgw           m15, m1, m3;   (p2 + p0 + 1) >> 1
    psubw           m15, m2;  ((p2 + p0 + 1) >> 1) - p1
    paddw           m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
    psraw           m15, 1;   (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip(deltap1, -tc/2, tc/2)
    paddw           m15, m2; p1'

    ;beta calculations
    movd            m10, r11d; beta0
    punpcklwd       m10, m10
    movd            m13, r12d; beta1
    punpcklwd       m13, m13
    shufps          m10, m13, 0; betax0, betax1

    movd            m13, r7d; 1dp0 + 1dp3
    movd             m8, r8d; 0dp0 + 0dp3
    punpcklwd        m8, m8
    punpcklwd       m13, m13
    shufps          m13, m8, 0;
    pcmpgtw          m8, m10, m13
    pand             m8, m11
    ;end beta calculations
    MASKED_COPY2     m2, m15, m8; write p1'

    pavgw            m8, m6, m4;   (q2 + q0 + 1) >> 1
    psubw            m8, m5;  ((q2 + q0 + 1) >> 1) - q1
    psubw            m8, m12; (((q2 + q0 + 1) >> 1) - q1 - delta0)
    psraw            m8, 1;   (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip(deltaq1, -tc/2, tc/2)
    paddw            m8, m5; q1'

    movd            m13, r9d;
    movd            m15, r10d;
    punpcklwd       m15, m15
    punpcklwd       m13, m13
    shufps          m13, m15, 0; dq0 + dq3

    pcmpgtw         m10, m13; compare to ((beta+(beta>>1))>>3)
    pand            m10, m11
    MASKED_COPY2     m5, m8, m10; write q1'

    paddw           m15, m3, m12 ; p0 + delta0
    MASKED_COPY      m3, m15

    psubw            m8, m4, m12 ; q0 - delta0
    MASKED_COPY      m4, m8
%endmacro

INIT_XMM sse2
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q)
;-----------------------------------------------------------------------------
cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 2
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8B_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 8
    TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    RET

cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 4
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 10
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q)
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
    mov           pix0q, pixq
    sub           pix0q, strideq
    sub           pix0q, strideq
    movq             m0, [pix0q];         p1
    movq             m1, [pix0q+strideq]; p0
    movq             m2, [pixq];          q0
    movq             m3, [pixq+strideq];  q1
    pxor             m5, m5; zeros reg
    punpcklbw        m0, m5
    punpcklbw        m1, m5
    punpcklbw        m2, m5
    punpcklbw        m3, m5
    CHROMA_DEBLOCK_BODY 8
    packuswb         m1, m1 ; p0' packed in bytes on low quadword
    packuswb         m2, m2 ; q0' packed in bytes on low quadword
    movq [pix0q+strideq], m1
    movq         [pixq], m2
    RET

cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
    mov           pix0q, pixq
    sub           pix0q, strideq
    sub           pix0q, strideq
    movu             m0, [pix0q];         p1
    movu             m1, [pix0q+strideq]; p0
    movu             m2, [pixq];          q0
    movu             m3, [pixq+strideq];  q1
    CHROMA_DEBLOCK_BODY 10
    pxor             m5, m5; zeros reg
    CLIPW            m1, m5, [pw_pixel_max]
    CLIPW            m2, m5, [pw_pixel_max]
    movu [pix0q+strideq], m1
    movu         [pixq], m2
    RET

%if ARCH_X86_64
%macro LOOP_FILTER_LUMA 0
;-----------------------------------------------------------------------------
;    void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int *_beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_v_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc
    sub            pixq, 4
    lea              r5, [3*strideq]
    mov              r6, pixq
    add            pixq, r5
    TRANSPOSE8x8B_LOAD  PASS8ROWS(r6, pixq, strideq, r5)
    LUMA_DEBLOCK_BODY 8, v
.store:
    TRANSPOSE8x8B_STORE PASS8ROWS(r6, pixq, strideq, r5)
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc
    sub            pixq, 8
    lea              r5, [3*strideq]
    mov              r6, pixq
    add            pixq, r5
    TRANSPOSE8x8W_LOAD  PASS8ROWS(r6, pixq, strideq, r5)
    LUMA_DEBLOCK_BODY 10, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(r6, pixq, strideq, r5)
.bypassluma:
    RET

;-----------------------------------------------------------------------------
;    void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int *_beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
    lea     src3strideq, [3*strideq]
    mov           pix0q, pixq
    sub           pix0q, src3strideq
    sub           pix0q, strideq
    movdqu           m0, [pix0q];              p3
    movdqu           m1, [pix0q+strideq];      p2
    movdqu           m2, [pix0q+2*strideq];    p1
    movdqu           m3, [pix0q+src3strideq];  p0
    movdqu           m4, [pixq];               q0
    movdqu           m5, [pixq+strideq];       q1
    movdqu           m6, [pixq+2*strideq];     q2
    movdqu           m7, [pixq+src3strideq];   q3
    pxor             m8, m8
    punpcklbw        m0, m8
    punpcklbw        m1, m8
    punpcklbw        m2, m8
    punpcklbw        m3, m8
    punpcklbw        m4, m8
    punpcklbw        m5, m8
    punpcklbw        m6, m8
    punpcklbw        m7, m8
    LUMA_DEBLOCK_BODY 8, h
.store:
    packuswb         m1, m1; p2
    packuswb         m2, m2; p1
    packuswb         m3, m3; p0
    packuswb         m4, m4; q0
    packuswb         m5, m5; q1
    packuswb         m6, m6; q2
    movq     [pix0q+strideq], m1;  p2
    movq   [pix0q+2*strideq], m2;  p1
    movq [pix0q+src3strideq], m3;  p0
    movq              [pixq], m4;  q0
    movq      [pixq+strideq], m5;  q1
    movq    [pixq+2*strideq], m6;  q2
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
    lea     src3strideq, [3*strideq]
    mov           pix0q, pixq
    sub           pix0q, src3strideq
    sub           pix0q, strideq
    movdqu           m0, [pix0q];              p3
    movdqu           m1, [pix0q+strideq];      p2
    movdqu           m2, [pix0q+2*strideq];    p1
    movdqu           m3, [pix0q+src3strideq];  p0
    movdqu           m4, [pixq];               q0
    movdqu           m5, [pixq+strideq];       q1
    movdqu           m6, [pixq+2*strideq];     q2
    movdqu           m7, [pixq+src3strideq];   q3
    LUMA_DEBLOCK_BODY 10, h
.store:
    pxor             m8, m8; zeros reg
    CLIPW            m1, m8, [pw_pixel_max]
    CLIPW            m2, m8, [pw_pixel_max]
    CLIPW            m3, m8, [pw_pixel_max]
    CLIPW            m4, m8, [pw_pixel_max]
    CLIPW            m5, m8, [pw_pixel_max]
    CLIPW            m6, m8, [pw_pixel_max]
    movdqu     [pix0q+strideq], m1;  p2
    movdqu   [pix0q+2*strideq], m2;  p1
    movdqu [pix0q+src3strideq], m3;  p0
    movdqu              [pixq], m4;  q0
    movdqu      [pixq+strideq], m5;  q1
    movdqu    [pixq+2*strideq], m6;  q2
.bypassluma:
    RET
%endmacro

INIT_XMM sse2
LOOP_FILTER_LUMA
INIT_XMM ssse3
LOOP_FILTER_LUMA
%endif