1;*****************************************************************************
2;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
3;*****************************************************************************
4;* Copyright (C) 2005-2011 x264 project
5;*
6;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
7;*
8;* This file is part of FFmpeg.
9;*
10;* FFmpeg is free software; you can redistribute it and/or
11;* modify it under the terms of the GNU Lesser General Public
12;* License as published by the Free Software Foundation; either
13;* version 2.1 of the License, or (at your option) any later version.
14;*
15;* FFmpeg is distributed in the hope that it will be useful,
16;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18;* Lesser General Public License for more details.
19;*
20;* You should have received a copy of the GNU Lesser General Public
21;* License along with FFmpeg; if not, write to the Free Software
22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23;******************************************************************************
24
25%include "libavutil/x86/x86util.asm"
26
SECTION_RODATA

; word/dword constants shared across libavcodec's x86 code
cextern pw_16
cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1

; file-local constants
pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 ; (x-3) ramp for plane prediction
pw_m3:        times 8 dw -3
pw_pixel_max: times 8 dw ((1 << 10)-1)     ; 1023, the 10-bit clip ceiling
pw_512:       times 8 dw 512               ; mid-grey: 1 << (BIT_DEPTH-1)
pd_17:        times 4 dd 17
pd_16:        times 4 dd 16
41
42SECTION .text
43
; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; 3-tap [1 2 1]/4 lowpass filter; %2 is clobbered.  The 3-operand pavgw
; form relies on x86inc's AVX emulation when assembled for SSE2/SSSE3.
%macro PRED4x4_LOWPASS 4
    paddw       %2, %3          ; %2 = left + right
    psrlw       %2, 1           ; %2 = (left + right) >> 1
    pavgw       %1, %4, %2      ; %1 = (src + %2 + 1) >> 1
%endmacro
51
52;-----------------------------------------------------------------------------
53; void ff_pred4x4_down_right(pixel *src, const pixel *topright, int stride)
54;-----------------------------------------------------------------------------
55%macro PRED4x4_DR 0
56cglobal pred4x4_down_right_10, 3, 3
57    sub       r0, r2
58    lea       r1, [r0+r2*2]
59    movhps    m1, [r1-8]
60    movhps    m2, [r0+r2*1-8]
61    movhps    m4, [r0-8]
62    punpckhwd m2, m4
63    movq      m3, [r0]
64    punpckhdq m1, m2
65    PALIGNR   m3, m1, 10, m1
66    movhps    m4, [r1+r2*1-8]
67    PALIGNR   m0, m3, m4, 14, m4
68    movhps    m4, [r1+r2*2-8]
69    PALIGNR   m2, m0, m4, 14, m4
70    PRED4x4_LOWPASS m0, m2, m3, m0
71    movq      [r1+r2*2], m0
72    psrldq    m0, 2
73    movq      [r1+r2*1], m0
74    psrldq    m0, 2
75    movq      [r0+r2*2], m0
76    psrldq    m0, 2
77    movq      [r0+r2*1], m0
78    RET
79%endmacro
80
81INIT_XMM sse2
82PRED4x4_DR
83INIT_XMM ssse3
84PRED4x4_DR
85%if HAVE_AVX_EXTERNAL
86INIT_XMM avx
87PRED4x4_DR
88%endif
89
90;------------------------------------------------------------------------------
91; void ff_pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
92;------------------------------------------------------------------------------
93%macro PRED4x4_VR 0
94cglobal pred4x4_vertical_right_10, 3, 3, 6
95    sub     r0, r2
96    lea     r1, [r0+r2*2]
97    movq    m5, [r0]            ; ........t3t2t1t0
98    movhps  m1, [r0-8]
99    PALIGNR m0, m5, m1, 14, m1  ; ......t3t2t1t0lt
100    pavgw   m5, m0
101    movhps  m1, [r0+r2*1-8]
102    PALIGNR m0, m1, 14, m1      ; ....t3t2t1t0ltl0
103    movhps  m2, [r0+r2*2-8]
104    PALIGNR m1, m0, m2, 14, m2  ; ..t3t2t1t0ltl0l1
105    movhps  m3, [r1+r2*1-8]
106    PALIGNR m2, m1, m3, 14, m3  ; t3t2t1t0ltl0l1l2
107    PRED4x4_LOWPASS m1, m0, m2, m1
108    pslldq  m0, m1, 12
109    psrldq  m1, 4
110    movq    [r0+r2*1], m5
111    movq    [r0+r2*2], m1
112    PALIGNR m5, m0, 14, m2
113    pslldq  m0, 2
114    movq    [r1+r2*1], m5
115    PALIGNR m1, m0, 14, m0
116    movq    [r1+r2*2], m1
117    RET
118%endmacro
119
120INIT_XMM sse2
121PRED4x4_VR
122INIT_XMM ssse3
123PRED4x4_VR
124%if HAVE_AVX_EXTERNAL
125INIT_XMM avx
126PRED4x4_VR
127%endif
128
129;-------------------------------------------------------------------------------
130; void ff_pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
131;-------------------------------------------------------------------------------
132%macro PRED4x4_HD 0
133cglobal pred4x4_horizontal_down_10, 3, 3
134    sub        r0, r2
135    lea        r1, [r0+r2*2]
136    movq       m0, [r0-8]      ; lt ..
137    movhps     m0, [r0]
138    pslldq     m0, 2           ; t2 t1 t0 lt .. .. .. ..
139    movq       m1, [r1+r2*2-8] ; l3
140    movq       m3, [r1+r2*1-8]
141    punpcklwd  m1, m3          ; l2 l3
142    movq       m2, [r0+r2*2-8] ; l1
143    movq       m3, [r0+r2*1-8]
144    punpcklwd  m2, m3          ; l0 l1
145    punpckhdq  m1, m2          ; l0 l1 l2 l3
146    punpckhqdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
147    psrldq     m0, m1, 4       ; .. .. t2 t1 t0 lt l0 l1
148    psrldq     m3, m1, 2       ; .. t2 t1 t0 lt l0 l1 l2
149    pavgw      m5, m1, m3
150    PRED4x4_LOWPASS m3, m1, m0, m3
151    punpcklwd  m5, m3
152    psrldq     m3, 8
153    PALIGNR    m3, m5, 12, m4
154    movq       [r1+r2*2], m5
155    movhps     [r0+r2*2], m5
156    psrldq     m5, 4
157    movq       [r1+r2*1], m5
158    movq       [r0+r2*1], m3
159    RET
160%endmacro
161
162INIT_XMM sse2
163PRED4x4_HD
164INIT_XMM ssse3
165PRED4x4_HD
166%if HAVE_AVX_EXTERNAL
167INIT_XMM avx
168PRED4x4_HD
169%endif
170
171;-----------------------------------------------------------------------------
172; void ff_pred4x4_dc(pixel *src, const pixel *topright, int stride)
173;-----------------------------------------------------------------------------
174
175INIT_MMX mmxext
176cglobal pred4x4_dc_10, 3, 3
177    sub    r0, r2
178    lea    r1, [r0+r2*2]
179    movq   m2, [r0+r2*1-8]
180    paddw  m2, [r0+r2*2-8]
181    paddw  m2, [r1+r2*1-8]
182    paddw  m2, [r1+r2*2-8]
183    psrlq  m2, 48
184    movq   m0, [r0]
185    HADDW  m0, m1
186    paddw  m0, [pw_4]
187    paddw  m0, m2
188    psrlw  m0, 3
189    SPLATW m0, m0, 0
190    movq   [r0+r2*1], m0
191    movq   [r0+r2*2], m0
192    movq   [r1+r2*1], m0
193    movq   [r1+r2*2], m0
194    RET
195
196;-----------------------------------------------------------------------------
197; void ff_pred4x4_down_left(pixel *src, const pixel *topright, int stride)
198;-----------------------------------------------------------------------------
199%macro PRED4x4_DL 0
200cglobal pred4x4_down_left_10, 3, 3
201    sub        r0, r2
202    movq       m0, [r0]
203    movhps     m0, [r1]
204    psrldq     m2, m0, 2
205    pslldq     m3, m0, 2
206    pshufhw    m2, m2, 10100100b
207    PRED4x4_LOWPASS m0, m3, m2, m0
208    lea        r1, [r0+r2*2]
209    movhps     [r1+r2*2], m0
210    psrldq     m0, 2
211    movq       [r0+r2*1], m0
212    psrldq     m0, 2
213    movq       [r0+r2*2], m0
214    psrldq     m0, 2
215    movq       [r1+r2*1], m0
216    RET
217%endmacro
218
219INIT_XMM sse2
220PRED4x4_DL
221%if HAVE_AVX_EXTERNAL
222INIT_XMM avx
223PRED4x4_DL
224%endif
225
226;-----------------------------------------------------------------------------
227; void ff_pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
228;-----------------------------------------------------------------------------
229%macro PRED4x4_VL 0
230cglobal pred4x4_vertical_left_10, 3, 3
231    sub        r0, r2
232    movu       m1, [r0]
233    movhps     m1, [r1]
234    psrldq     m0, m1, 2
235    psrldq     m2, m1, 4
236    pavgw      m4, m0, m1
237    PRED4x4_LOWPASS m0, m1, m2, m0
238    lea        r1, [r0+r2*2]
239    movq       [r0+r2*1], m4
240    movq       [r0+r2*2], m0
241    psrldq     m4, 2
242    psrldq     m0, 2
243    movq       [r1+r2*1], m4
244    movq       [r1+r2*2], m0
245    RET
246%endmacro
247
248INIT_XMM sse2
249PRED4x4_VL
250%if HAVE_AVX_EXTERNAL
251INIT_XMM avx
252PRED4x4_VL
253%endif
254
255;-----------------------------------------------------------------------------
256; void ff_pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
257;-----------------------------------------------------------------------------
258INIT_MMX mmxext
259cglobal pred4x4_horizontal_up_10, 3, 3
260    sub       r0, r2
261    lea       r1, [r0+r2*2]
262    movq      m0, [r0+r2*1-8]
263    punpckhwd m0, [r0+r2*2-8]
264    movq      m1, [r1+r2*1-8]
265    punpckhwd m1, [r1+r2*2-8]
266    punpckhdq m0, m1
267    pshufw    m1, m1, 0xFF
268    movq      [r1+r2*2], m1
269    movd      [r1+r2*1+4], m1
270    pshufw    m2, m0, 11111001b
271    movq      m1, m2
272    pavgw     m2, m0
273
274    pshufw    m5, m0, 11111110b
275    PRED4x4_LOWPASS m1, m0, m5, m1
276    movq      m6, m2
277    punpcklwd m6, m1
278    movq      [r0+r2*1], m6
279    psrlq     m2, 16
280    psrlq     m1, 16
281    punpcklwd m2, m1
282    movq      [r0+r2*2], m2
283    psrlq     m2, 32
284    movd      [r1+r2*1], m2
285    RET
286
287
288
289;-----------------------------------------------------------------------------
290; void ff_pred8x8_vertical(pixel *src, int stride)
291;-----------------------------------------------------------------------------
292INIT_XMM sse2
293cglobal pred8x8_vertical_10, 2, 2
294    sub  r0, r1
295    mova m0, [r0]
296%rep 3
297    mova [r0+r1*1], m0
298    mova [r0+r1*2], m0
299    lea  r0, [r0+r1*2]
300%endrep
301    mova [r0+r1*1], m0
302    mova [r0+r1*2], m0
303    RET
304
305;-----------------------------------------------------------------------------
306; void ff_pred8x8_horizontal(pixel *src, int stride)
307;-----------------------------------------------------------------------------
308INIT_XMM sse2
309cglobal pred8x8_horizontal_10, 2, 3
310    mov         r2d, 4
311.loop:
312    movq         m0, [r0+r1*0-8]
313    movq         m1, [r0+r1*1-8]
314    pshuflw      m0, m0, 0xff
315    pshuflw      m1, m1, 0xff
316    punpcklqdq   m0, m0
317    punpcklqdq   m1, m1
318    mova  [r0+r1*0], m0
319    mova  [r0+r1*1], m1
320    lea          r0, [r0+r1*2]
321    dec          r2d
322    jg .loop
323    REP_RET
324
325;-----------------------------------------------------------------------------
326; void ff_predict_8x8_dc(pixel *src, int stride)
327;-----------------------------------------------------------------------------
328%macro MOV8 2-3
329; sort of a hack, but it works
330%if mmsize==8
331    movq    [%1+0], %2
332    movq    [%1+8], %3
333%else
334    movdqa    [%1], %2
335%endif
336%endmacro
337
; Writes one DC per 4x4 quadrant, following the H.264 8x8 DC rule:
; top-left uses s0+s2, top-right s1, bottom-left s3, bottom-right s1+s3,
; where s0/s1 are the sums of the left/right top halves and s2/s3 the sums
; of the upper/lower halves of the left column.  %1 = pshufw or pshuflw.
%macro PRED8x8_DC 1
cglobal pred8x8_dc_10, 2, 6
    sub         r0, r1
    pxor        m4, m4          ; zero, for the pavgw rounding trick below
    movq        m0, [r0+0]      ; t0..t3
    movq        m1, [r0+8]      ; t4..t7
%if mmsize==16
    punpcklwd   m0, m1
    movhlps     m1, m0
    paddw       m0, m1
%else
    pshufw      m2, m0, 00001110b
    pshufw      m3, m1, 00001110b
    paddw       m0, m2
    paddw       m1, m3
    punpcklwd   m0, m1
%endif
    %1          m2, m0, 00001110b
    paddw       m0, m2          ; words 0/1 now hold s0, s1

    ; scalar sums of the left column: rows 0-3 -> s2, rows 4-7 -> s3
    lea         r5, [r1*3]
    lea         r4, [r0+r1*4]
    movzx      r2d, word [r0+r1*1-2]
    movzx      r3d, word [r0+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r0+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4-2]
    add        r2d, r3d
    movd        m2, r2d            ; s2

    movzx      r2d, word [r4+r1*1-2]
    movzx      r3d, word [r4+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r1*4-2]
    add        r2d, r3d
    movd        m3, r2d            ; s3

    punpcklwd   m2, m3
    punpckldq   m0, m2            ; s0, s1, s2, s3
    %1          m3, m0, 11110110b ; s2, s1, s3, s3
    %1          m0, m0, 01110100b ; s0, s1, s3, s1
    paddw       m0, m3
    psrlw       m0, 2
    pavgw       m0, m4            ; s0+s2, s1, s3, s1+s3
%if mmsize==16
    ; expand the four DC words into per-half-row broadcast registers
    punpcklwd   m0, m0
    pshufd      m3, m0, 11111010b
    punpckldq   m0, m0
    SWAP         0,1
%else
    pshufw      m1, m0, 0x00      ; top-left DC
    pshufw      m2, m0, 0x55      ; top-right DC
    pshufw      m3, m0, 0xaa      ; bottom-left DC
    pshufw      m4, m0, 0xff      ; bottom-right DC
%endif
    MOV8   r0+r1*1, m1, m2
    MOV8   r0+r1*2, m1, m2
    MOV8   r0+r5*1, m1, m2
    MOV8   r0+r1*4, m1, m2
    MOV8   r4+r1*1, m3, m4
    MOV8   r4+r1*2, m3, m4
    MOV8   r4+r5*1, m3, m4
    MOV8   r4+r1*4, m3, m4
    RET
%endmacro

INIT_MMX mmxext
PRED8x8_DC pshufw
INIT_XMM sse2
PRED8x8_DC pshuflw
411
412;-----------------------------------------------------------------------------
413; void ff_pred8x8_top_dc(pixel *src, int stride)
414;-----------------------------------------------------------------------------
415INIT_XMM sse2
416cglobal pred8x8_top_dc_10, 2, 4
417    sub         r0, r1
418    mova        m0, [r0]
419    pshuflw     m1, m0, 0x4e
420    pshufhw     m1, m1, 0x4e
421    paddw       m0, m1
422    pshuflw     m1, m0, 0xb1
423    pshufhw     m1, m1, 0xb1
424    paddw       m0, m1
425    lea         r2, [r1*3]
426    lea         r3, [r0+r1*4]
427    paddw       m0, [pw_2]
428    psrlw       m0, 2
429    mova [r0+r1*1], m0
430    mova [r0+r1*2], m0
431    mova [r0+r2*1], m0
432    mova [r0+r1*4], m0
433    mova [r3+r1*1], m0
434    mova [r3+r1*2], m0
435    mova [r3+r2*1], m0
436    mova [r3+r1*4], m0
437    RET
438
439;-----------------------------------------------------------------------------
440; void ff_pred8x8_plane(pixel *src, int stride)
441;-----------------------------------------------------------------------------
442INIT_XMM sse2
443cglobal pred8x8_plane_10, 2, 7, 7
444    sub       r0, r1
445    lea       r2, [r1*3]
446    lea       r3, [r0+r1*4]
447    mova      m2, [r0]
448    pmaddwd   m2, [pw_m32101234]
449    HADDD     m2, m1
450    movd      m0, [r0-4]
451    psrld     m0, 14
452    psubw     m2, m0               ; H
453    movd      m0, [r3+r1*4-4]
454    movd      m1, [r0+12]
455    paddw     m0, m1
456    psllw     m0, 4                ; 16*(src[7*stride-1] + src[-stride+7])
457    movzx    r4d, word [r3+r1*1-2] ; src[4*stride-1]
458    movzx    r5d, word [r0+r2*1-2] ; src[2*stride-1]
459    sub      r4d, r5d
460    movzx    r6d, word [r3+r1*2-2] ; src[5*stride-1]
461    movzx    r5d, word [r0+r1*2-2] ; src[1*stride-1]
462    sub      r6d, r5d
463    lea      r4d, [r4+r6*2]
464    movzx    r5d, word [r3+r2*1-2] ; src[6*stride-1]
465    movzx    r6d, word [r0+r1*1-2] ; src[0*stride-1]
466    sub      r5d, r6d
467    lea      r5d, [r5*3]
468    add      r4d, r5d
469    movzx    r6d, word [r3+r1*4-2] ; src[7*stride-1]
470    movzx    r5d, word [r0+r1*0-2] ; src[ -stride-1]
471    sub      r6d, r5d
472    lea      r4d, [r4+r6*4]
473    movd      m3, r4d              ; V
474    punpckldq m2, m3
475    pmaddwd   m2, [pd_17]
476    paddd     m2, [pd_16]
477    psrad     m2, 5                ; b, c
478
479    mova      m3, [pw_pixel_max]
480    pxor      m1, m1
481    SPLATW    m0, m0, 1
482    SPLATW    m4, m2, 2
483    SPLATW    m2, m2, 0
484    pmullw    m2, [pw_m32101234]   ; b
485    pmullw    m5, m4, [pw_m3]      ; c
486    paddw     m5, [pw_16]
487    mov      r2d, 8
488    add       r0, r1
489.loop:
490    paddsw    m6, m2, m5
491    paddsw    m6, m0
492    psraw     m6, 5
493    CLIPW     m6, m1, m3
494    mova    [r0], m6
495    paddw     m5, m4
496    add       r0, r1
497    dec r2d
498    jg .loop
499    REP_RET
500
501
502;-----------------------------------------------------------------------------
503; void ff_pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright,
504;                         int stride)
505;-----------------------------------------------------------------------------
506%macro PRED8x8L_128_DC 0
507cglobal pred8x8l_128_dc_10, 4, 4
508    mova      m0, [pw_512] ; (1<<(BIT_DEPTH-1))
509    lea       r1, [r3*3]
510    lea       r2, [r0+r3*4]
511    MOV8 r0+r3*0, m0, m0
512    MOV8 r0+r3*1, m0, m0
513    MOV8 r0+r3*2, m0, m0
514    MOV8 r0+r1*1, m0, m0
515    MOV8 r2+r3*0, m0, m0
516    MOV8 r2+r3*1, m0, m0
517    MOV8 r2+r3*2, m0, m0
518    MOV8 r2+r1*1, m0, m0
519    RET
520%endmacro
521
522INIT_MMX mmxext
523PRED8x8L_128_DC
524INIT_XMM sse2
525PRED8x8L_128_DC
526
527;-----------------------------------------------------------------------------
528; void ff_pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright,
529;                         int stride)
530;-----------------------------------------------------------------------------
531%macro PRED8x8L_TOP_DC 0
532cglobal pred8x8l_top_dc_10, 4, 4, 6
533    sub         r0, r3
534    mova        m0, [r0]
535    shr        r1d, 14
536    shr        r2d, 13
537    neg         r1
538    pslldq      m1, m0, 2
539    psrldq      m2, m0, 2
540    pinsrw      m1, [r0+r1], 0
541    pinsrw      m2, [r0+r2+14], 7
542    lea         r1, [r3*3]
543    lea         r2, [r0+r3*4]
544    PRED4x4_LOWPASS m0, m2, m1, m0
545    HADDW       m0, m1
546    paddw       m0, [pw_4]
547    psrlw       m0, 3
548    SPLATW      m0, m0, 0
549    mova [r0+r3*1], m0
550    mova [r0+r3*2], m0
551    mova [r0+r1*1], m0
552    mova [r0+r3*4], m0
553    mova [r2+r3*1], m0
554    mova [r2+r3*2], m0
555    mova [r2+r1*1], m0
556    mova [r2+r3*4], m0
557    RET
558%endmacro
559
560INIT_XMM sse2
561PRED8x8L_TOP_DC
562%if HAVE_AVX_EXTERNAL
563INIT_XMM avx
564PRED8x8L_TOP_DC
565%endif
566
567;-------------------------------------------------------------------------------
568; void ff_pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
569;-------------------------------------------------------------------------------
570;TODO: see if scalar is faster
571%macro PRED8x8L_DC 0
572cglobal pred8x8l_dc_10, 4, 6, 6
573    sub         r0, r3
574    lea         r4, [r0+r3*4]
575    lea         r5, [r3*3]
576    mova        m0, [r0+r3*2-16]
577    punpckhwd   m0, [r0+r3*1-16]
578    mova        m1, [r4+r3*0-16]
579    punpckhwd   m1, [r0+r5*1-16]
580    punpckhdq   m1, m0
581    mova        m2, [r4+r3*2-16]
582    punpckhwd   m2, [r4+r3*1-16]
583    mova        m3, [r4+r3*4-16]
584    punpckhwd   m3, [r4+r5*1-16]
585    punpckhdq   m3, m2
586    punpckhqdq  m3, m1
587    mova        m0, [r0]
588    shr        r1d, 14
589    shr        r2d, 13
590    neg         r1
591    pslldq      m1, m0, 2
592    psrldq      m2, m0, 2
593    pinsrw      m1, [r0+r1], 0
594    pinsrw      m2, [r0+r2+14], 7
595    not         r1
596    and         r1, r3
597    pslldq      m4, m3, 2
598    psrldq      m5, m3, 2
599    pshuflw     m4, m4, 11100101b
600    pinsrw      m5, [r0+r1-2], 7
601    PRED4x4_LOWPASS m3, m4, m5, m3
602    PRED4x4_LOWPASS m0, m2, m1, m0
603    paddw       m0, m3
604    HADDW       m0, m1
605    paddw       m0, [pw_8]
606    psrlw       m0, 4
607    SPLATW      m0, m0
608    mova [r0+r3*1], m0
609    mova [r0+r3*2], m0
610    mova [r0+r5*1], m0
611    mova [r0+r3*4], m0
612    mova [r4+r3*1], m0
613    mova [r4+r3*2], m0
614    mova [r4+r5*1], m0
615    mova [r4+r3*4], m0
616    RET
617%endmacro
618
619INIT_XMM sse2
620PRED8x8L_DC
621%if HAVE_AVX_EXTERNAL
622INIT_XMM avx
623PRED8x8L_DC
624%endif
625
626;-----------------------------------------------------------------------------
627; void ff_pred8x8l_vertical(pixel *src, int has_topleft, int has_topright,
628;                           int stride)
629;-----------------------------------------------------------------------------
630%macro PRED8x8L_VERTICAL 0
631cglobal pred8x8l_vertical_10, 4, 4, 6
632    sub         r0, r3
633    mova        m0, [r0]
634    shr        r1d, 14
635    shr        r2d, 13
636    neg         r1
637    pslldq      m1, m0, 2
638    psrldq      m2, m0, 2
639    pinsrw      m1, [r0+r1], 0
640    pinsrw      m2, [r0+r2+14], 7
641    lea         r1, [r3*3]
642    lea         r2, [r0+r3*4]
643    PRED4x4_LOWPASS m0, m2, m1, m0
644    mova [r0+r3*1], m0
645    mova [r0+r3*2], m0
646    mova [r0+r1*1], m0
647    mova [r0+r3*4], m0
648    mova [r2+r3*1], m0
649    mova [r2+r3*2], m0
650    mova [r2+r1*1], m0
651    mova [r2+r3*4], m0
652    RET
653%endmacro
654
655INIT_XMM sse2
656PRED8x8L_VERTICAL
657%if HAVE_AVX_EXTERNAL
658INIT_XMM avx
659PRED8x8L_VERTICAL
660%endif
661
662;-----------------------------------------------------------------------------
663; void ff_pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright,
664;                             int stride)
665;-----------------------------------------------------------------------------
666%macro PRED8x8L_HORIZONTAL 0
667cglobal pred8x8l_horizontal_10, 4, 4, 5
668    mova        m0, [r0-16]
669    shr        r1d, 14
670    dec         r1
671    and         r1, r3
672    sub         r1, r3
673    punpckhwd   m0, [r0+r1-16]
674    mova        m1, [r0+r3*2-16]
675    punpckhwd   m1, [r0+r3*1-16]
676    lea         r2, [r0+r3*4]
677    lea         r1, [r3*3]
678    punpckhdq   m1, m0
679    mova        m2, [r2+r3*0-16]
680    punpckhwd   m2, [r0+r1-16]
681    mova        m3, [r2+r3*2-16]
682    punpckhwd   m3, [r2+r3*1-16]
683    punpckhdq   m3, m2
684    punpckhqdq  m3, m1
685    PALIGNR     m4, m3, [r2+r1-16], 14, m0
686    pslldq      m0, m4, 2
687    pshuflw     m0, m0, 11100101b
688    PRED4x4_LOWPASS m4, m3, m0, m4
689    punpckhwd   m3, m4, m4
690    punpcklwd   m4, m4
691    pshufd      m0, m3, 0xff
692    pshufd      m1, m3, 0xaa
693    pshufd      m2, m3, 0x55
694    pshufd      m3, m3, 0x00
695    mova [r0+r3*0], m0
696    mova [r0+r3*1], m1
697    mova [r0+r3*2], m2
698    mova [r0+r1*1], m3
699    pshufd      m0, m4, 0xff
700    pshufd      m1, m4, 0xaa
701    pshufd      m2, m4, 0x55
702    pshufd      m3, m4, 0x00
703    mova [r2+r3*0], m0
704    mova [r2+r3*1], m1
705    mova [r2+r3*2], m2
706    mova [r2+r1*1], m3
707    RET
708%endmacro
709
710INIT_XMM sse2
711PRED8x8L_HORIZONTAL
712INIT_XMM ssse3
713PRED8x8L_HORIZONTAL
714%if HAVE_AVX_EXTERNAL
715INIT_XMM avx
716PRED8x8L_HORIZONTAL
717%endif
718
719;-----------------------------------------------------------------------------
720; void ff_pred8x8l_down_left(pixel *src, int has_topleft, int has_topright,
721;                            int stride)
722;-----------------------------------------------------------------------------
723%macro PRED8x8L_DOWN_LEFT 0
724cglobal pred8x8l_down_left_10, 4, 4, 7
725    sub         r0, r3
726    mova        m3, [r0]
727    shr        r1d, 14
728    neg         r1
729    shr        r2d, 13
730    pslldq      m1, m3, 2
731    psrldq      m2, m3, 2
732    pinsrw      m1, [r0+r1], 0
733    pinsrw      m2, [r0+r2+14], 7
734    PRED4x4_LOWPASS m6, m2, m1, m3
735    jz .fix_tr ; flags from shr r2d
736    mova        m1, [r0+16]
737    psrldq      m5, m1, 2
738    PALIGNR     m2, m1, m3, 14, m3
739    pshufhw     m5, m5, 10100100b
740    PRED4x4_LOWPASS m1, m2, m5, m1
741.do_topright:
742    lea         r1, [r3*3]
743    psrldq      m5, m1, 14
744    lea         r2, [r0+r3*4]
745    PALIGNR     m2, m1, m6,  2, m0
746    PALIGNR     m3, m1, m6, 14, m0
747    PALIGNR     m5, m1,  2, m0
748    pslldq      m4, m6, 2
749    PRED4x4_LOWPASS m6, m4, m2, m6
750    PRED4x4_LOWPASS m1, m3, m5, m1
751    mova [r2+r3*4], m1
752    PALIGNR     m1, m6, 14, m2
753    pslldq      m6, 2
754    mova [r2+r1*1], m1
755    PALIGNR     m1, m6, 14, m2
756    pslldq      m6, 2
757    mova [r2+r3*2], m1
758    PALIGNR     m1, m6, 14, m2
759    pslldq      m6, 2
760    mova [r2+r3*1], m1
761    PALIGNR     m1, m6, 14, m2
762    pslldq      m6, 2
763    mova [r0+r3*4], m1
764    PALIGNR     m1, m6, 14, m2
765    pslldq      m6, 2
766    mova [r0+r1*1], m1
767    PALIGNR     m1, m6, 14, m2
768    pslldq      m6, 2
769    mova [r0+r3*2], m1
770    PALIGNR     m1, m6, 14, m6
771    mova [r0+r3*1], m1
772    RET
773.fix_tr:
774    punpckhwd   m3, m3
775    pshufd      m1, m3, 0xFF
776    jmp .do_topright
777%endmacro
778
779INIT_XMM sse2
780PRED8x8L_DOWN_LEFT
781INIT_XMM ssse3
782PRED8x8L_DOWN_LEFT
783%if HAVE_AVX_EXTERNAL
784INIT_XMM avx
785PRED8x8L_DOWN_LEFT
786%endif
787
788;-----------------------------------------------------------------------------
789; void ff_pred8x8l_down_right(pixel *src, int has_topleft, int has_topright,
790;                             int stride)
791;-----------------------------------------------------------------------------
792%macro PRED8x8L_DOWN_RIGHT 0
793; standard forbids this when has_topleft is false
794; no need to check
795cglobal pred8x8l_down_right_10, 4, 5, 8
796    sub         r0, r3
797    lea         r4, [r0+r3*4]
798    lea         r1, [r3*3]
799    mova        m0, [r0+r3*1-16]
800    punpckhwd   m0, [r0+r3*0-16]
801    mova        m1, [r0+r1*1-16]
802    punpckhwd   m1, [r0+r3*2-16]
803    punpckhdq   m1, m0
804    mova        m2, [r4+r3*1-16]
805    punpckhwd   m2, [r4+r3*0-16]
806    mova        m3, [r4+r1*1-16]
807    punpckhwd   m3, [r4+r3*2-16]
808    punpckhdq   m3, m2
809    punpckhqdq  m3, m1
810    mova        m0, [r4+r3*4-16]
811    mova        m1, [r0]
812    PALIGNR     m4, m3, m0, 14, m0
813    PALIGNR     m1, m3,  2, m2
814    pslldq      m0, m4, 2
815    pshuflw     m0, m0, 11100101b
816    PRED4x4_LOWPASS m6, m1, m4, m3
817    PRED4x4_LOWPASS m4, m3, m0, m4
818    mova        m3, [r0]
819    shr        r2d, 13
820    pslldq      m1, m3, 2
821    psrldq      m2, m3, 2
822    pinsrw      m1, [r0-2], 0
823    pinsrw      m2, [r0+r2+14], 7
824    PRED4x4_LOWPASS m3, m2, m1, m3
825    PALIGNR     m2, m3, m6,  2, m0
826    PALIGNR     m5, m3, m6, 14, m0
827    psrldq      m7, m3, 2
828    PRED4x4_LOWPASS m6, m4, m2, m6
829    PRED4x4_LOWPASS m3, m5, m7, m3
830    mova [r4+r3*4], m6
831    PALIGNR     m3, m6, 14, m2
832    pslldq      m6, 2
833    mova [r0+r3*1], m3
834    PALIGNR     m3, m6, 14, m2
835    pslldq      m6, 2
836    mova [r0+r3*2], m3
837    PALIGNR     m3, m6, 14, m2
838    pslldq      m6, 2
839    mova [r0+r1*1], m3
840    PALIGNR     m3, m6, 14, m2
841    pslldq      m6, 2
842    mova [r0+r3*4], m3
843    PALIGNR     m3, m6, 14, m2
844    pslldq      m6, 2
845    mova [r4+r3*1], m3
846    PALIGNR     m3, m6, 14, m2
847    pslldq      m6, 2
848    mova [r4+r3*2], m3
849    PALIGNR     m3, m6, 14, m6
850    mova [r4+r1*1], m3
851    RET
852%endmacro
853
854INIT_XMM sse2
855PRED8x8L_DOWN_RIGHT
856INIT_XMM ssse3
857PRED8x8L_DOWN_RIGHT
858%if HAVE_AVX_EXTERNAL
859INIT_XMM avx
860PRED8x8L_DOWN_RIGHT
861%endif
862
863;-----------------------------------------------------------------------------
864; void ff_pred8x8l_vertical_right(pixel *src, int has_topleft,
865;                                 int has_topright, int stride)
866;-----------------------------------------------------------------------------
867%macro PRED8x8L_VERTICAL_RIGHT 0
868; likewise with 8x8l_down_right
869cglobal pred8x8l_vertical_right_10, 4, 5, 7
870    sub         r0, r3
871    lea         r4, [r0+r3*4]
872    lea         r1, [r3*3]
873    mova        m0, [r0+r3*1-16]
874    punpckhwd   m0, [r0+r3*0-16]
875    mova        m1, [r0+r1*1-16]
876    punpckhwd   m1, [r0+r3*2-16]
877    punpckhdq   m1, m0
878    mova        m2, [r4+r3*1-16]
879    punpckhwd   m2, [r4+r3*0-16]
880    mova        m3, [r4+r1*1-16]
881    punpckhwd   m3, [r4+r3*2-16]
882    punpckhdq   m3, m2
883    punpckhqdq  m3, m1
884    mova        m0, [r4+r3*4-16]
885    mova        m1, [r0]
886    PALIGNR     m4, m3, m0, 14, m0
887    PALIGNR     m1, m3,  2, m2
888    PRED4x4_LOWPASS m3, m1, m4, m3
889    mova        m2, [r0]
890    shr        r2d, 13
891    pslldq      m1, m2, 2
892    psrldq      m5, m2, 2
893    pinsrw      m1, [r0-2], 0
894    pinsrw      m5, [r0+r2+14], 7
895    PRED4x4_LOWPASS m2, m5, m1, m2
896    PALIGNR     m6, m2, m3, 12, m1
897    PALIGNR     m5, m2, m3, 14, m0
898    PRED4x4_LOWPASS m0, m6, m2, m5
899    pavgw       m2, m5
900    mova [r0+r3*2], m0
901    mova [r0+r3*1], m2
902    pslldq      m6, m3, 4
903    pslldq      m1, m3, 2
904    PRED4x4_LOWPASS m1, m3, m6, m1
905    PALIGNR     m2, m1, 14, m4
906    mova [r0+r1*1], m2
907    pslldq      m1, 2
908    PALIGNR     m0, m1, 14, m3
909    mova [r0+r3*4], m0
910    pslldq      m1, 2
911    PALIGNR     m2, m1, 14, m4
912    mova [r4+r3*1], m2
913    pslldq      m1, 2
914    PALIGNR     m0, m1, 14, m3
915    mova [r4+r3*2], m0
916    pslldq      m1, 2
917    PALIGNR     m2, m1, 14, m4
918    mova [r4+r1*1], m2
919    pslldq      m1, 2
920    PALIGNR     m0, m1, 14, m1
921    mova [r4+r3*4], m0
922    RET
923%endmacro
924
925INIT_XMM sse2
926PRED8x8L_VERTICAL_RIGHT
927INIT_XMM ssse3
928PRED8x8L_VERTICAL_RIGHT
929%if HAVE_AVX_EXTERNAL
930INIT_XMM avx
931PRED8x8L_VERTICAL_RIGHT
932%endif
933
934;-----------------------------------------------------------------------------
935; void ff_pred8x8l_horizontal_up(pixel *src, int has_topleft,
936;                                int has_topright, int stride)
937;-----------------------------------------------------------------------------
938%macro PRED8x8L_HORIZONTAL_UP 0
939cglobal pred8x8l_horizontal_up_10, 4, 4, 6
940    mova        m0, [r0+r3*0-16]
941    punpckhwd   m0, [r0+r3*1-16]
942    shr        r1d, 14
943    dec         r1
944    and         r1, r3
945    sub         r1, r3
946    mova        m4, [r0+r1*1-16]
947    lea         r1, [r3*3]
948    lea         r2, [r0+r3*4]
949    mova        m1, [r0+r3*2-16]
950    punpckhwd   m1, [r0+r1*1-16]
951    punpckhdq   m0, m1
952    mova        m2, [r2+r3*0-16]
953    punpckhwd   m2, [r2+r3*1-16]
954    mova        m3, [r2+r3*2-16]
955    punpckhwd   m3, [r2+r1*1-16]
956    punpckhdq   m2, m3
957    punpckhqdq  m0, m2
958    PALIGNR     m1, m0, m4, 14, m4
959    psrldq      m2, m0, 2
960    pshufhw     m2, m2, 10100100b
961    PRED4x4_LOWPASS m0, m1, m2, m0
962    psrldq      m1, m0, 2
963    psrldq      m2, m0, 4
964    pshufhw     m1, m1, 10100100b
965    pshufhw     m2, m2, 01010100b
966    pavgw       m4, m0, m1
967    PRED4x4_LOWPASS m1, m2, m0, m1
968    punpckhwd   m5, m4, m1
969    punpcklwd   m4, m1
970    mova [r2+r3*0], m5
971    mova [r0+r3*0], m4
972    pshufd      m0, m5, 11111001b
973    pshufd      m1, m5, 11111110b
974    pshufd      m2, m5, 11111111b
975    mova [r2+r3*1], m0
976    mova [r2+r3*2], m1
977    mova [r2+r1*1], m2
978    PALIGNR     m2, m5, m4, 4, m0
979    PALIGNR     m3, m5, m4, 8, m1
980    PALIGNR     m5, m5, m4, 12, m4
981    mova [r0+r3*1], m2
982    mova [r0+r3*2], m3
983    mova [r0+r1*1], m5
984    RET
985%endmacro
986
987INIT_XMM sse2
988PRED8x8L_HORIZONTAL_UP
989INIT_XMM ssse3
990PRED8x8L_HORIZONTAL_UP
991%if HAVE_AVX_EXTERNAL
992INIT_XMM avx
993PRED8x8L_HORIZONTAL_UP
994%endif
995
996
997;-----------------------------------------------------------------------------
998; void ff_pred16x16_vertical(pixel *src, int stride)
999;-----------------------------------------------------------------------------
1000%macro MOV16 3-5
1001    mova [%1+     0], %2
1002    mova [%1+mmsize], %3
1003%if mmsize==8
1004    mova [%1+    16], %4
1005    mova [%1+    24], %5
1006%endif
1007%endmacro
1008
; Copy the 16-pixel row above the block into all 16 rows.
%macro PRED16x16_VERTICAL 0
cglobal pred16x16_vertical_10, 2, 3
    sub   r0, r1                ; r0 -> top border row
    mov  r2d, 8                 ; two rows per iteration
    mova  m0, [r0+ 0]           ; load the top row (2 or 4 registers)
    mova  m1, [r0+mmsize]
%if mmsize==8
    mova  m2, [r0+16]
    mova  m3, [r0+24]
%endif
.loop:
    MOV16 r0+r1*1, m0, m1, m2, m3
    MOV16 r0+r1*2, m0, m1, m2, m3
    lea   r0, [r0+r1*2]
    dec   r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_VERTICAL
INIT_XMM sse2
PRED16x16_VERTICAL
1032
1033;-----------------------------------------------------------------------------
1034; void ff_pred16x16_horizontal(pixel *src, int stride)
1035;-----------------------------------------------------------------------------
1036%macro PRED16x16_HORIZONTAL 0
1037cglobal pred16x16_horizontal_10, 2, 3
1038    mov   r2d, 8
1039.vloop:
1040    movd   m0, [r0+r1*0-4]
1041    movd   m1, [r0+r1*1-4]
1042    SPLATW m0, m0, 1
1043    SPLATW m1, m1, 1
1044    MOV16  r0+r1*0, m0, m0, m0, m0
1045    MOV16  r0+r1*1, m1, m1, m1, m1
1046    lea    r0, [r0+r1*2]
1047    dec    r2d
1048    jg .vloop
1049    REP_RET
1050%endmacro
1051
1052INIT_MMX mmxext
1053PRED16x16_HORIZONTAL
1054INIT_XMM sse2
1055PRED16x16_HORIZONTAL
1056
1057;-----------------------------------------------------------------------------
1058; void ff_pred16x16_dc(pixel *src, int stride)
1059;-----------------------------------------------------------------------------
1060%macro PRED16x16_DC 0
1061cglobal pred16x16_dc_10, 2, 6
1062    mov        r5, r0
1063    sub        r0, r1
1064    mova       m0, [r0+0]
1065    paddw      m0, [r0+mmsize]
1066%if mmsize==8
1067    paddw      m0, [r0+16]
1068    paddw      m0, [r0+24]
1069%endif
1070    HADDW      m0, m2
1071
1072    lea        r0, [r0+r1-2]
1073    movzx     r3d, word [r0]
1074    movzx     r4d, word [r0+r1]
1075%rep 7
1076    lea        r0, [r0+r1*2]
1077    movzx     r2d, word [r0]
1078    add       r3d, r2d
1079    movzx     r2d, word [r0+r1]
1080    add       r4d, r2d
1081%endrep
1082    lea       r3d, [r3+r4+16]
1083
1084    movd       m1, r3d
1085    paddw      m0, m1
1086    psrlw      m0, 5
1087    SPLATW     m0, m0
1088    mov       r3d, 8
1089.loop:
1090    MOV16 r5+r1*0, m0, m0, m0, m0
1091    MOV16 r5+r1*1, m0, m0, m0, m0
1092    lea        r5, [r5+r1*2]
1093    dec       r3d
1094    jg .loop
1095    REP_RET
1096%endmacro
1097
1098INIT_MMX mmxext
1099PRED16x16_DC
1100INIT_XMM sse2
1101PRED16x16_DC
1102
1103;-----------------------------------------------------------------------------
1104; void ff_pred16x16_top_dc(pixel *src, int stride)
1105;-----------------------------------------------------------------------------
1106%macro PRED16x16_TOP_DC 0
1107cglobal pred16x16_top_dc_10, 2, 3
1108    sub        r0, r1
1109    mova       m0, [r0+0]
1110    paddw      m0, [r0+mmsize]
1111%if mmsize==8
1112    paddw      m0, [r0+16]
1113    paddw      m0, [r0+24]
1114%endif
1115    HADDW      m0, m2
1116
1117    SPLATW     m0, m0
1118    paddw      m0, [pw_8]
1119    psrlw      m0, 4
1120    mov       r2d, 8
1121.loop:
1122    MOV16 r0+r1*1, m0, m0, m0, m0
1123    MOV16 r0+r1*2, m0, m0, m0, m0
1124    lea        r0, [r0+r1*2]
1125    dec       r2d
1126    jg .loop
1127    REP_RET
1128%endmacro
1129
1130INIT_MMX mmxext
1131PRED16x16_TOP_DC
1132INIT_XMM sse2
1133PRED16x16_TOP_DC
1134
1135;-----------------------------------------------------------------------------
1136; void ff_pred16x16_left_dc(pixel *src, int stride)
1137;-----------------------------------------------------------------------------
1138%macro PRED16x16_LEFT_DC 0
1139cglobal pred16x16_left_dc_10, 2, 6
1140    mov        r5, r0
1141
1142    sub        r0, 2
1143    movzx     r3d, word [r0]
1144    movzx     r4d, word [r0+r1]
1145%rep 7
1146    lea        r0, [r0+r1*2]
1147    movzx     r2d, word [r0]
1148    add       r3d, r2d
1149    movzx     r2d, word [r0+r1]
1150    add       r4d, r2d
1151%endrep
1152    lea       r3d, [r3+r4+8]
1153    shr       r3d, 4
1154
1155    movd       m0, r3d
1156    SPLATW     m0, m0
1157    mov       r3d, 8
1158.loop:
1159    MOV16 r5+r1*0, m0, m0, m0, m0
1160    MOV16 r5+r1*1, m0, m0, m0, m0
1161    lea        r5, [r5+r1*2]
1162    dec       r3d
1163    jg .loop
1164    REP_RET
1165%endmacro
1166
1167INIT_MMX mmxext
1168PRED16x16_LEFT_DC
1169INIT_XMM sse2
1170PRED16x16_LEFT_DC
1171
1172;-----------------------------------------------------------------------------
1173; void ff_pred16x16_128_dc(pixel *src, int stride)
1174;-----------------------------------------------------------------------------
1175%macro PRED16x16_128_DC 0
1176cglobal pred16x16_128_dc_10, 2,3
1177    mova       m0, [pw_512]
1178    mov       r2d, 8
1179.loop:
1180    MOV16 r0+r1*0, m0, m0, m0, m0
1181    MOV16 r0+r1*1, m0, m0, m0, m0
1182    lea        r0, [r0+r1*2]
1183    dec       r2d
1184    jg .loop
1185    REP_RET
1186%endmacro
1187
1188INIT_MMX mmxext
1189PRED16x16_128_DC
1190INIT_XMM sse2
1191PRED16x16_128_DC
1192