;*****************************************************************************
;* MMX/SSE2-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*****************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_00: times 16 db 0x00
pb_01: times 16 db 0x01
pb_03: times 16 db 0x03
pb_a1: times 16 db 0xa1

SECTION .text

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
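; (base3 and stride3 are assumed to be base+3*stride and 3*stride, as at every
; call site below; the last two operands then expand to base+6*stride and
; base+7*stride.)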

; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 8
    movd       m0, %1
    movd       m2, %2
    movd       m1, %3
    movd       m3, %4
    punpcklbw  m0, m2
    punpcklbw  m1, m3
    movq       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

    movd       m4, %5
    movd       m6, %6
    movd       m5, %7
    movd       m7, %8
    punpcklbw  m4, m6
    punpcklbw  m5, m7
    movq       m6, m4
    punpcklwd  m4, m5
    punpckhwd  m6, m5

    movq       m1, m0
    movq       m3, m2
    punpckldq  m0, m4
    punpckhdq  m1, m4
    punpckldq  m2, m6
    punpckhdq  m3, m6
%endmacro
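
; The transposes here are built purely from interleaves: punpcklbw merges two
; rows byte-by-byte, punpckl/hwd merge those results word-by-word, and
; punpckl/hdq finish dword-by-dword.  E.g. for rows a0..a3 and b0..b3,
; punpcklbw yields a0 b0 a1 b1 a2 b2 a3 b3 -- one step of the transposition.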

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4_STORE 8
    movq       m4, m0
    movq       m5, m1
    movq       m6, m2
    punpckhdq  m4, m4
    punpckhdq  m5, m5
    punpckhdq  m6, m6

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    movq       m1, m0
    punpcklwd  m0, m2
    punpckhwd  m1, m2
    movd       %1, m0
    punpckhdq  m0, m0
    movd       %2, m0
    movd       %3, m1
    punpckhdq  m1, m1
    movd       %4, m1

    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    movq       m5, m4
    punpcklwd  m4, m6
    punpckhwd  m5, m6
    movd       %5, m4
    punpckhdq  m4, m4
    movd       %6, m4
    movd       %7, m5
    punpckhdq  m5, m5
    movd       %8, m5
%endmacro

%macro SBUTTERFLY 4
    movq       %4, %2
    punpckl%1  %2, %3
    punpckh%1  %4, %3
%endmacro
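
; SBUTTERFLY size, a, b, tmp: after the macro, a holds the low half and tmp the
; high half of the element-wise interleave of (a, b) at the given granularity
; (bw, wd or dq); b is left untouched.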

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, m0, m1, m7
    SBUTTERFLY bw, m2, m3, m1
    SBUTTERFLY bw, m4, m5, m3
    movq  [%9+0x10], m1
    SBUTTERFLY bw, m6, %8, m5
    SBUTTERFLY wd, m0, m2, m1
    SBUTTERFLY wd, m4, m6, m2
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY wd, m7, [%9+0x10], m6
    SBUTTERFLY wd, m3, m5, m4
    SBUTTERFLY dq, m7, m3, m0
    SBUTTERFLY dq, m1, m2, m5
    punpckldq m6, m4
    movq  [%9+0x10], m1
    movq  [%9+0x20], m5
    movq  [%9+0x30], m7
    movq  [%9+0x40], m0
    movq  [%9+0x50], m6
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, m0, m1, m7
    SBUTTERFLY bw, m2, m3, m1
    SBUTTERFLY bw, m4, m5, m3
    SBUTTERFLY bw, m6, %8, m5
    movq  %9,  m3
    SBUTTERFLY wd, m0, m2, m3
    SBUTTERFLY wd, m4, m6, m2
    SBUTTERFLY wd, m7, m1, m6
    movq  %11, m2
    movq  m2,  %9
    SBUTTERFLY wd, m2, m5, m1
    SBUTTERFLY dq, m0, m4, m5
    SBUTTERFLY dq, m7, m2, m4
    movq  %9,  m0
    movq  %10, m5
    movq  %13, m7
    movq  %14, m4
    SBUTTERFLY dq, m3, %11, m0
    SBUTTERFLY dq, m6, m1, m5
    movq  %11, m3
    movq  %12, m0
    movq  %15, m6
    movq  %16, m5
%endmacro

; out: %4 = max(|%1-%2|-%3, 0), i.e. nonzero iff |%1-%2| > %3
; clobbers: %5
%macro DIFF_GT 5
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
    por     %4, %5
    psubusb %4, %3
%endmacro
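
; MMX/SSE2 have no per-byte unsigned compare, so the threshold test is built
; from saturating subtractions; per byte, roughly (illustrative):
;   absdiff = sat(a-b) | sat(b-a)      ; one of the two terms is always zero
;   out     = sat(absdiff - thresh)    ; nonzero iff absdiff > thresh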

; out: %4 = 0xff where |%1-%2| <= %3, 0x00 where |%1-%2| > %3
; clobbers: %5
%macro DIFF_GT2 5
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro
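
; Same saturating-subtraction trick as DIFF_GT, but the threshold is subtracted
; from both one-sided differences first: both results are zero exactly when
; |%1-%2| <= %3, so pcmpeqb turns that into a full 0x00/0xff byte mask usable
; with pand/psubb below.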

%macro SPLATW 1
%ifidn m0, xmm0
    pshuflw  %1, %1, 0
    punpcklqdq %1, %1
%else
    pshufw   %1, %1, 0
%endif
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4
    SPLATW   m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6
%endmacro
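
; Net effect, per pixel (illustrative C):
;   mask = (|p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta) ? 0xff : 0;
; which is the standard H.264 edge-filtering condition.  Each DIFF_GT result is
; nonzero where its test fails, so OR-ing them and comparing against zero
; selects exactly the pixels where all three tests pass.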

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    mova    m5, m1
    pxor    m5, m2           ; p0^q0
    pand    m5, [pb_01 GLOBAL] ; (p0^q0)&1
    pcmpeqb m4, m4
    pxor    m3, m4
    pavgb   m3, m0           ; (p1 - q1 + 256)>>1
    pavgb   m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pxor    m4, m1
    pavgb   m4, m2           ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    paddusb m3, m4           ; d+128+33
    mova    m6, [pb_a1 GLOBAL]
    psubusb m6, m3
    psubusb m3, [pb_a1 GLOBAL]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro
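
; For reference, the normal-strength (bS<4) p0/q0 update in the H.264 spec is,
; per pixel (illustrative C, not literally what is computed above):
;   delta = clip3( -tc, tc, ((q0-p0)*4 + (p1-q1) + 4) >> 3 );
;   p0'   = clip_uint8( p0 + delta );
;   q0'   = clip_uint8( q0 - delta );
; The pavgb sequence approximates this in 8 bits: m3 and m6 end up holding the
; positive and negative parts of delta, each clamped to tc by pminub, and are
; applied with saturating adds and subtracts.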

; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    mova    %6, m1
    pavgb   %6, m2
    pavgb   %2, %6             ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
    psubusb %2, %6             ; (p2+((p0+q0+1)>>1))>>1
    mova    %6, %1
    psubusb %6, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro
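
; pavgb rounds upwards, so avg(q2, avg(p0,q0)) can come out one too high; the
; pxor/pand against pb_01 extracts the low bit lost to that rounding and
; psubusb removes it, giving the exact floor (q2 + ((p0+q0+1)>>1)) >> 1.
; pmaxub/pminub then clip the result into [q1-tc0, q1+tc0] without branches.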

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_XMM
cglobal x264_deblock_v_luma_sse2
    movd    m8, [r4] ; tc0
    lea     r4, [r1*3]
    dec     r2d        ; alpha-1
    neg     r4
    dec     r3d        ; beta-1
    add     r4, r0     ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9

    movdqa  m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    mova    m7, m8
    psubb   m7, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa  m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    ret

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_sse2
    movsxd r10, esi
    lea    r11, [r10+r10*2]
    lea    rax, [r0-4]
    lea    r9,  [r0-4+r11]
    sub    rsp, 0x68
    %define pix_tmp rsp

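    ; Strategy: transpose the pixels around the vertical edge into a temp
    ; buffer with stride 16, run the vertical filter on it, then transpose the
    ; 4 modified rows (p1..q1) back into the frame.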
    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(rax, r9, r10, r11), pix_tmp
    lea    rax, [rax+r10*8]
    lea    r9,  [r9 +r10*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(rax, r9, r10, r11), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    esi, 0x10
    call   x264_deblock_v_luma_sse2

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    add    rax, 2
    add    r9,  2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE  PASS8ROWS(rax, r9, r10, r11)

    shl    r10, 3
    sub    rax, r10
    sub    r9,  r10
    shr    r10, 3
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE  PASS8ROWS(rax, r9, r10, r11)

    add    rsp, 0x68
    ret

%else

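; DEBLOCK_LUMA %1=cpu suffix, %2=vertical-variant name (v handles 16 pixels per
; call, v8 handles 8 and is called twice by the horizontal wrapper below),
; %3=register width in bytes, used to size and index the stack temporaries.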
%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_%1, 5,5
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride
    %assign pad 2*%3+12-(stack_offset&15)
    SUB     esp, pad

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4m
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova   [esp+%3], m4 ; tc
    pcmpeqb m3, m3
    pcmpgtb m4, m3
    pand    m4, m7
    mova   [esp], m4 ; mask

    mova    m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m4
    pand    m4, [esp+%3] ; tc
    mova    m7, m4
    psubb   m7, m6
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    mova    m5, [esp] ; mask
    pand    m6, m5
    mova    m5, [esp+%3] ; tc
    pand    m5, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    ADD     esp, pad
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_%1, 0,5
    mov    r0, r0m
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
    %assign pad 0x78-(stack_offset&15)
    SUB    esp, pad
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   x264_deblock_%2_luma_%1
%ifidn %2, v8
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   x264_deblock_%2_luma_%1
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0m
    sub    r0, 2
    lea    r1, [r0+r4]

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    ADD    esp, pad
    RET
%endmacro ; DEBLOCK_LUMA

INIT_XMM
DEBLOCK_LUMA sse2, v, 16

%endif ; ARCH



%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_00
    pxor  t2, t0
    pand  t2, mpb_01
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_01
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_00
    pxor  t3, t1
    pand  t3, mpb_01
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    mova  t3, p0
    mova  t2, p0
    pxor  t3, q1
    pavgb t2, q1
    pand  t3, mpb_01
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4

    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    mova  t2, t1
    pavgb t1, p2
    paddb t2, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_00
    pxor  t2, t1
    pand  t2, mpb_01
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro
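
; For reference, the spec's strong (intra, bS=4) filtering of the p side is,
; per pixel (illustrative C):
;   if (mask1p) {          /* |p2-p0| < beta && |p0-q0| < alpha/4+2 */
;       p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;
;       p1' = (p2 + p1 + p0 + q0 + 2) >> 2;
;       p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
;   } else if (mask0)      /* only p0 changes */
;       p0' = (2*p1 + p0 + q1 + 2) >> 2;
; The pavgb/psrlw sequences above compute these averages without leaving 8-bit
; precision, using the mpb_00/mpb_01 corrections to fix up pavgb's rounding.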

%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro
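
; This only renames the aliases (p<->q, mask1p->mask1q); LUMA_INTRA_P012 can
; then be expanded a second time, unchanged, to filter the q side of the edge.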

%macro DEBLOCK_LUMA_INTRA 2
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%ifdef ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
    %define mask1q [rsp-24]
    %define mpb_00 m14
    %define mpb_01 m15
%else
    %define spill(x) [esp+16*x+((stack_offset+4)&15)]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_00 [pb_00 GLOBAL]
    %define mpb_01 [pb_01 GLOBAL]
%endif
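
; On x86-64 p2/q2, the temporaries and the constants are kept in xmm8-xmm15
; (only mask1q is spilled); on x86-32 the same names alias memory operands and
; stack spill slots, so both paths can share the LUMA_INTRA_P012 code above.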

;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_intra_%1, 4,6
%ifndef ARCH_X86_64
    sub     esp, 0x60
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%ifdef ARCH_X86_64
    pxor    mpb_00, mpb_00
    mova    mpb_01, [pb_01 GLOBAL]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_00
    pavgb   t5, mpb_01 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_00 GLOBAL]
    pavgb   m4, [pb_01 GLOBAL] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
%ifndef ARCH_X86_64
    add     esp, 0x60
%endif
    RET

INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_luma_intra_%1
    movsxd r10, r1d
    lea    r11, [r10*3]
    lea    rax, [r0-4]
    lea    r9,  [r0-4+r11]
    sub    rsp, 0x88
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    rax, [rax+r10*8]
    lea    r9,  [r9+r10*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    mov    r1,  0x10
    call   x264_deblock_v_luma_intra_%1

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r9, [rax+r11]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
    shl    r10, 3
    sub    rax, r10
    sub    r9,  r10
    shr    r10, 3
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
    add    rsp, 0x88
    ret
%else
cglobal x264_deblock_h_luma_intra_%1, 2,4
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
%assign pad 0x8c-(stack_offset&15)
    SUB    rsp, pad
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   x264_deblock_%2_luma_intra_%1
%ifidn %2, v8
    add    dword [rsp], 8 ; pix_tmp+8
    call   x264_deblock_%2_luma_intra_%1
%endif
    ADD    esp, 16

    mov    r1,  r1m
    mov    r0,  r0m
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    ADD    rsp, pad
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8
%endif