;*****************************************************************************
;* MMX/SSE2-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*****************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_00: times 16 db 0x00
pb_01: times 16 db 0x01
pb_03: times 16 db 0x03
pb_a1: times 16 db 0xa1

SECTION .text

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
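; (callers pass base3 = base+3*stride and stride3 = 3*stride, so the eight
;  operands cover base+0*stride .. base+7*stride; the split is needed because
;  x86 addressing can only scale an index by 1, 2, 4 or 8)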

; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 bytes in m0..m3
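; (three rounds of interleaves: punpcklbw pairs up byte rows, punpcklwd/
;  punpckhwd pairs up those pairs, and punpckldq/punpckhdq finally merges the
;  two 4-row halves into the transposed 8-byte rows)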
%macro TRANSPOSE4x8_LOAD 8
    movd       m0, %1
    movd       m2, %2
    movd       m1, %3
    movd       m3, %4
    punpcklbw  m0, m2
    punpcklbw  m1, m3
    movq       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

    movd       m4, %5
    movd       m6, %6
    movd       m5, %7
    movd       m7, %8
    punpcklbw  m4, m6
    punpcklbw  m5, m7
    movq       m6, m4
    punpcklwd  m4, m5
    punpckhwd  m6, m5

    movq       m1, m0
    movq       m3, m2
    punpckldq  m0, m4
    punpckhdq  m1, m4
    punpckldq  m2, m6
    punpckhdq  m3, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4_STORE 8
    movq       m4, m0
    movq       m5, m1
    movq       m6, m2
    punpckhdq  m4, m4
    punpckhdq  m5, m5
    punpckhdq  m6, m6

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    movq       m1, m0
    punpcklwd  m0, m2
    punpckhwd  m1, m2
    movd       %1, m0
    punpckhdq  m0, m0
    movd       %2, m0
    movd       %3, m1
    punpckhdq  m1, m1
    movd       %4, m1

    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    movq       m5, m4
    punpcklwd  m4, m6
    punpckhwd  m5, m6
    movd       %5, m4
    punpckhdq  m4, m4
    movd       %6, m4
    movd       %7, m5
    punpckhdq  m5, m5
    movd       %8, m5
%endmacro

%macro SBUTTERFLY 4
    movq       %4, %2
    punpckl%1  %2, %3
    punpckh%1  %4, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, m0, m1, m7
    SBUTTERFLY bw, m2, m3, m1
    SBUTTERFLY bw, m4, m5, m3
    movq  [%9+0x10], m1
    SBUTTERFLY bw, m6, %8, m5
    SBUTTERFLY wd, m0, m2, m1
    SBUTTERFLY wd, m4, m6, m2
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY wd, m7, [%9+0x10], m6
    SBUTTERFLY wd, m3, m5, m4
    SBUTTERFLY dq, m7, m3, m0
    SBUTTERFLY dq, m1, m2, m5
    punpckldq m6, m4
    movq  [%9+0x10], m1
    movq  [%9+0x20], m5
    movq  [%9+0x30], m7
    movq  [%9+0x40], m0
    movq  [%9+0x50], m6
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, m0, m1, m7
    SBUTTERFLY bw, m2, m3, m1
    SBUTTERFLY bw, m4, m5, m3
    SBUTTERFLY bw, m6, %8, m5
    movq  %9,  m3
    SBUTTERFLY wd, m0, m2, m3
    SBUTTERFLY wd, m4, m6, m2
    SBUTTERFLY wd, m7, m1, m6
    movq  %11, m2
    movq  m2,  %9
    SBUTTERFLY wd, m2, m5, m1
    SBUTTERFLY dq, m0, m4, m5
    SBUTTERFLY dq, m7, m2, m4
    movq  %9,  m0
    movq  %10, m5
    movq  %13, m7
    movq  %14, m4
    SBUTTERFLY dq, m3, %11, m0
    SBUTTERFLY dq, m6, m1, m5
    movq  %11, m3
    movq  %12, m0
    movq  %15, m6
    movq  %16, m5
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
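; (per-byte sketch: with unsigned saturation, sat(a-b) | sat(b-a) == |a-b|,
;  and the final psubusb leaves a nonzero byte exactly where |a-b| > %3;
;  LOAD_MASK ORs three of these and converts the result into a 0x00/0xff mask
;  with a single pcmpeqb against zero)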
%macro DIFF_GT 5
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
    por     %4, %5
    psubusb %4, %3
%endmacro

; out: %4 = 0xff where |%1-%2| <= %3, 0 where |%1-%2| > %3
;      (the 0xff polarity marks the pixels the callers go on to filter)
; clobbers: %5
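; (sketch: each saturated difference is reduced by %3, so the two results can
;  only compare equal -- both zero -- when neither a-b nor b-a exceeds %3;
;  pcmpeqb therefore yields the byte mask directly, no inversion needed)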
%macro DIFF_GT2 5
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro

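; broadcast the low word of %1 to all lanes; the SSE2 form needs the extra
; punpcklqdq because pshuflw only shuffles the low 64 bits of an xmm register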
%macro SPLATW 1
%ifidn m0, xmm0
    pshuflw  %1, %1, 0
    punpcklqdq %1, %1
%else
    pshufw   %1, %1, 0
%endif
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
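; (m7 ends up 0xff exactly where the standard edge test of the loop filter
;  passes: |p0-q0| <= alpha-1 && |p1-p0| <= beta-1 && |q1-q0| <= beta-1,
;  i.e. the per-sample filterSamplesFlag)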
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4
    SPLATW   m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
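; (for reference, the normal-filter update this computes is, per pixel:
;    delta = clip3( (((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc )
;    p0'   = clip_uint8(p0 + delta),  q0' = clip_uint8(q0 - delta)
;  the code below builds a biased form of delta out of byte averages, splits
;  it into its positive and negative parts around the 0xa1 bias, clamps each
;  part to tc (m7), and applies them with saturating adds/subs)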
%macro DEBLOCK_P0_Q0 0
    mova    m5, m1
    pxor    m5, m2           ; p0^q0
    pand    m5, [pb_01 GLOBAL] ; (p0^q0)&1
    pcmpeqb m4, m4
    pxor    m3, m4
    pavgb   m3, m0           ; (p1 - q1 + 256)>>1
    pavgb   m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pxor    m4, m1
    pavgb   m4, m2           ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    paddusb m3, m4           ; d+128+33
    mova    m6, [pb_a1 GLOBAL]
    psubusb m6, m3
    psubusb m3, [pb_a1 GLOBAL]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro

; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
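; (this is the p1'/q1' update of the normal filter: the candidate value
;  (p2 + ((p0+q0+1)>>1)) >> 1 is built from two pavgb's plus a parity fix via
;  pb_01, then clamped to [p1-tc0, p1+tc0] with pmaxub/pminub; a zero tc0 byte
;  collapses the clamp range, which is how the caller leaves pixels untouched)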
%macro LUMA_Q1 6
    mova    %6, m1
    pavgb   %6, m2
    pavgb   %2, %6             ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
    psubusb %2, %6             ; (p2+((p0+q0+1)>>1))>>1
    mova    %6, %1
    psubusb %6, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
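; (r0 = pix points at the q0 row: the function reads the six rows
;  pix-3*stride .. pix+2*stride and may rewrite p1, p0, q0 and q1.  tc0 holds
;  four per-4-column limits, expanded below to one byte per pixel; an entry of
;  -1 marks its group as "do not filter" and is folded into the mask)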
INIT_XMM
cglobal x264_deblock_v_luma_sse2, 5,5,10
    movd    m8, [r4] ; tc0
    lea     r4, [r1*3]
    dec     r2d        ; alpha-1
    neg     r4
    dec     r3d        ; beta-1
    add     r4, r0     ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9

    movdqa  m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    mova    m7, m8
    psubb   m7, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa  m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
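; (the "h" variant handles samples laid out along the rows, pix-4 .. pix+3 in
;  each of 16 rows: that 16x8 block is transposed into a stack buffer, the
;  vertical-layout filter above is run on it with stride 16, and only the four
;  middle columns it can modify -- p1, p0, q0, q1 -- are transposed back)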
INIT_MMX
cglobal x264_deblock_h_luma_sse2, 5,7
    movsxd r10, r1d
    lea    r11, [r10+r10*2]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r11]
%ifdef WIN64
    sub    rsp, 0x98
    %define pix_tmp rsp+0x30
%else
    sub    rsp, 0x68
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp
    lea    r6, [r6+r10*8]
    lea    r5, [r5+r10*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    r1d, 0x10
%ifdef WIN64
    mov    [rsp+0x20], r4
%endif
    call   x264_deblock_v_luma_sse2

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    add    r6, 2
    add    r5, 2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)

    shl    r10, 3
    sub    r6,  r10
    sub    r5,  r10
    shr    r10, 3
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)

%ifdef WIN64
    add    rsp, 0x98
%else
    add    rsp, 0x68
%endif
    RET

%else

%macro DEBLOCK_LUMA 3
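; %1 = cpu suffix, %2 = name infix (v = 16 pixels wide, v8 = 8 pixels wide),
; %3 = vector register width in bytes, used to size the stack slots that hold
; the spilled tc and mask values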
;-----------------------------------------------------------------------------
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_%1, 5,5
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride
    %assign pad 2*%3+12-(stack_offset&15)
    SUB     esp, pad

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4mp
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova   [esp+%3], m4 ; tc
    pcmpeqb m3, m3
    pcmpgtb m4, m3
    pand    m4, m7
    mova   [esp], m4 ; mask

    mova    m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m4
    pand    m4, [esp+%3] ; tc
    mova    m7, m4
    psubb   m7, m6
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    mova    m5, [esp] ; mask
    pand    m6, m5
    mova    m5, [esp+%3] ; tc
    pand    m5, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    ADD     esp, pad
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_%1, 0,5
    mov    r0, r0mp
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
    %assign pad 0x78-(stack_offset&15)
    SUB    esp, pad
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   x264_deblock_%2_luma_%1
%ifidn %2, v8
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   x264_deblock_%2_luma_%1
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2
    lea    r1, [r0+r4]

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    ADD    esp, pad
    RET
%endmacro ; DEBLOCK_LUMA

INIT_XMM
DEBLOCK_LUMA sse2, v, 16

%endif ; ARCH


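; Strong (intra, bS=4) filter for the p side of the edge; LUMA_INTRA_SWAP_PQ
; reuses it for the q side.  Where mask1p is set it produces the spec values
; quoted in the comments below:
;   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
;   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
; where only mask0 is set, p0 falls back to (2*p1 + p0 + q1 + 2) >> 2 and
; p1/p2 are written back unchanged.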
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_00
    pxor  t2, t0
    pand  t2, mpb_01
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_01
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_00
    pxor  t3, t1
    pand  t3, mpb_01
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    mova  t3, p0
    mova  t2, p0
    pxor  t3, q1
    pavgb t2, q1
    pand  t3, mpb_01
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4

    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    mova  t2, t1
    pavgb t1, p2
    paddb t2, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_00
    pxor  t2, t1
    pand  t2, mpb_01
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro

%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 2
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%ifdef ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
    %define mask1q [rsp-24]
    %define mpb_00 m14
    %define mpb_01 m15
%else
    %define spill(x) [esp+16*x+((stack_offset+4)&15)]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_00 [pb_00 GLOBAL]
    %define mpb_01 [pb_01 GLOBAL]
%endif

;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
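; (intra edges carry no tc0: filtering is gated only by alpha/beta, and the
;  stronger filter (LUMA_INTRA_P012) is selected per pixel where
;  |p0-q0| < (alpha>>2)+2, the "alpha/4+1" threshold computed with pavgb below)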
cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
%ifndef ARCH_X86_64
    sub     esp, 0x60
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%ifdef ARCH_X86_64
    pxor    mpb_00, mpb_00
    mova    mpb_01, [pb_01 GLOBAL]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_00
    pavgb   t5, mpb_01 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_00 GLOBAL]
    pavgb   m4, [pb_01 GLOBAL] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
%ifndef ARCH_X86_64
    add     esp, 0x60
%endif
    RET

INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_luma_intra_%1, 4,7
    movsxd r10, r1d
    lea    r11, [r10*3]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r11]
    sub    rsp, 0x88
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r10*8]
    lea    r5, [r5+r10*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    mov    r1,  0x10
    call   x264_deblock_v_luma_intra_%1

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r5, [r6+r11]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    shl    r10, 3
    sub    r6,  r10
    sub    r5,  r10
    shr    r10, 3
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    add    rsp, 0x88
    RET
%else
cglobal x264_deblock_h_luma_intra_%1, 2,4
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
%assign pad 0x8c-(stack_offset&15)
    SUB    rsp, pad
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   x264_deblock_%2_luma_intra_%1
%ifidn %2, v8
    add    dword [rsp], 8 ; pix_tmp+0x48
    call   x264_deblock_%2_luma_intra_%1
%endif
    ADD    esp, 16

    mov    r1,  r1m
    mov    r0,  r0mp
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    ADD    rsp, pad
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8
%endif