1;*****************************************************************************
2;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
3;*****************************************************************************
4;* Copyright (C) 2005-2011 x264 project
5;*
6;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
7;*
8;* This file is part of Libav.
9;*
10;* Libav is free software; you can redistribute it and/or
11;* modify it under the terms of the GNU Lesser General Public
12;* License as published by the Free Software Foundation; either
13;* version 2.1 of the License, or (at your option) any later version.
14;*
15;* Libav is distributed in the hope that it will be useful,
16;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18;* Lesser General Public License for more details.
19;*
20;* You should have received a copy of the GNU Lesser General Public
21;* License along with Libav; if not, write to the Free Software
22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23;******************************************************************************
24
25%include "x86inc.asm"
26%include "x86util.asm"
27
28SECTION_RODATA
29
30cextern pw_16
31cextern pw_8
32cextern pw_4
33cextern pw_2
34cextern pw_1
35
; file-local constants (pw_* = packed words, pd_* = packed dwords)
pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 ; column weights for plane prediction
pw_m3:        times 8 dw -3                ; initial row weight for plane prediction
pw_pixel_max: times 8 dw ((1 << 10)-1)     ; 1023, clip ceiling for 10-bit samples
pw_512:       times 8 dw 512               ; 1 << (10-1), mid-grey for pred8x8l_128_dc
pd_17:        times 4 dd 17
pd_16:        times 4 dd 16
42
43SECTION .text
44
; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; Implemented as pavgw(src, (left+right)>>1): pavgw rounds upward
; ((a+b+1)>>1), which exactly restores the +2 rounding term of the
; 3-tap filter for every input combination.
; NOTE: clobbers %2; %1 may alias %4 (3-operand pavgw form comes from
; x86inc's AVX emulation layer on pre-AVX targets).
%macro PRED4x4_LOWPASS 4
    paddw       %2, %3          ; %2 = left + right (per-lane word sums)
    psrlw       %2, 1           ; %2 = (left + right) >> 1
    pavgw       %1, %4, %2      ; %1 = (src + %2 + 1) >> 1
%endmacro
52
53;-----------------------------------------------------------------------------
54; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
55;-----------------------------------------------------------------------------
56%macro PRED4x4_DR 1
57cglobal pred4x4_down_right_10_%1, 3,3
58    sub       r0, r2
59    lea       r1, [r0+r2*2]
60    movhps    m1, [r1-8]
61    movhps    m2, [r0+r2*1-8]
62    movhps    m4, [r0-8]
63    punpckhwd m2, m4
64    movq      m3, [r0]
65    punpckhdq m1, m2
66    PALIGNR   m3, m1, 10, m1
67    movhps    m4, [r1+r2*1-8]
68    PALIGNR   m0, m3, m4, 14, m4
69    movhps    m4, [r1+r2*2-8]
70    PALIGNR   m2, m0, m4, 14, m4
71    PRED4x4_LOWPASS m0, m2, m3, m0
72    movq      [r1+r2*2], m0
73    psrldq    m0, 2
74    movq      [r1+r2*1], m0
75    psrldq    m0, 2
76    movq      [r0+r2*2], m0
77    psrldq    m0, 2
78    movq      [r0+r2*1], m0
79    RET
80%endmacro
81
82INIT_XMM
83%define PALIGNR PALIGNR_MMX
84PRED4x4_DR sse2
85%define PALIGNR PALIGNR_SSSE3
86PRED4x4_DR ssse3
87%ifdef HAVE_AVX
88INIT_AVX
89PRED4x4_DR avx
90%endif
91
92;-----------------------------------------------------------------------------
93; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
94;-----------------------------------------------------------------------------
95%macro PRED4x4_VR 1
96cglobal pred4x4_vertical_right_10_%1, 3,3,6
97    sub     r0, r2
98    lea     r1, [r0+r2*2]
99    movq    m5, [r0]            ; ........t3t2t1t0
100    movhps  m1, [r0-8]
101    PALIGNR m0, m5, m1, 14, m1  ; ......t3t2t1t0lt
102    pavgw   m5, m0
103    movhps  m1, [r0+r2*1-8]
104    PALIGNR m0, m1, 14, m1      ; ....t3t2t1t0ltl0
105    movhps  m2, [r0+r2*2-8]
106    PALIGNR m1, m0, m2, 14, m2  ; ..t3t2t1t0ltl0l1
107    movhps  m3, [r1+r2*1-8]
108    PALIGNR m2, m1, m3, 14, m3  ; t3t2t1t0ltl0l1l2
109    PRED4x4_LOWPASS m1, m0, m2, m1
110    pslldq  m0, m1, 12
111    psrldq  m1, 4
112    movq    [r0+r2*1], m5
113    movq    [r0+r2*2], m1
114    PALIGNR m5, m0, 14, m2
115    pslldq  m0, 2
116    movq    [r1+r2*1], m5
117    PALIGNR m1, m0, 14, m0
118    movq    [r1+r2*2], m1
119    RET
120%endmacro
121
122INIT_XMM
123%define PALIGNR PALIGNR_MMX
124PRED4x4_VR sse2
125%define PALIGNR PALIGNR_SSSE3
126PRED4x4_VR ssse3
127%ifdef HAVE_AVX
128INIT_AVX
129PRED4x4_VR avx
130%endif
131
132;-----------------------------------------------------------------------------
133; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
134;-----------------------------------------------------------------------------
135%macro PRED4x4_HD 1
136cglobal pred4x4_horizontal_down_10_%1, 3,3
137    sub        r0, r2
138    lea        r1, [r0+r2*2]
139    movq       m0, [r0-8]      ; lt ..
140    movhps     m0, [r0]
141    pslldq     m0, 2           ; t2 t1 t0 lt .. .. .. ..
142    movq       m1, [r1+r2*2-8] ; l3
143    movq       m3, [r1+r2*1-8]
144    punpcklwd  m1, m3          ; l2 l3
145    movq       m2, [r0+r2*2-8] ; l1
146    movq       m3, [r0+r2*1-8]
147    punpcklwd  m2, m3          ; l0 l1
148    punpckhdq  m1, m2          ; l0 l1 l2 l3
149    punpckhqdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
150    psrldq     m0, m1, 4       ; .. .. t2 t1 t0 lt l0 l1
151    psrldq     m3, m1, 2       ; .. t2 t1 t0 lt l0 l1 l2
152    pavgw      m5, m1, m3
153    PRED4x4_LOWPASS m3, m1, m0, m3
154    punpcklwd  m5, m3
155    psrldq     m3, 8
156    PALIGNR    m3, m5, 12, m4
157    movq       [r1+r2*2], m5
158    movhps     [r0+r2*2], m5
159    psrldq     m5, 4
160    movq       [r1+r2*1], m5
161    movq       [r0+r2*1], m3
162    RET
163%endmacro
164
165INIT_XMM
166%define PALIGNR PALIGNR_MMX
167PRED4x4_HD sse2
168%define PALIGNR PALIGNR_SSSE3
169PRED4x4_HD ssse3
170%ifdef HAVE_AVX
171INIT_AVX
172PRED4x4_HD avx
173%endif
174
175;-----------------------------------------------------------------------------
176; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
177;-----------------------------------------------------------------------------
178%macro HADDD 2 ; sum junk
179%if mmsize == 16
180    movhlps %2, %1
181    paddd   %1, %2
182    pshuflw %2, %1, 0xE
183    paddd   %1, %2
184%else
185    pshufw  %2, %1, 0xE
186    paddd   %1, %2
187%endif
188%endmacro
189
190%macro HADDW 2
191    pmaddwd %1, [pw_1]
192    HADDD   %1, %2
193%endmacro
194
INIT_MMX
; pred4x4_dc: fill the 4x4 block with (t0+t1+t2+t3 + l0+l1+l2+l3 + 4) >> 3.
; r0 = src, r1 = topright (unused; reused as a row pointer),
; r2 = stride in bytes.
cglobal pred4x4_dc_10_mmxext, 3,3
    sub    r0, r2
    lea    r1, [r0+r2*2]
    movq   m2, [r0+r2*1-8]     ; accumulate the four left-neighbour rows;
    paddw  m2, [r0+r2*2-8]     ; only the top lane (the word adjacent to
    paddw  m2, [r1+r2*1-8]     ; the block) of each load matters
    paddw  m2, [r1+r2*2-8]
    psrlq  m2, 48              ; move the left sum into the low word
    movq   m0, [r0]
    HADDW  m0, m1              ; top sum
    paddw  m0, [pw_4]          ; rounding term
    paddw  m0, m2
    psrlw  m0, 3
    SPLATW m0, m0, 0           ; broadcast the DC to all four lanes
    movq   [r0+r2*1], m0
    movq   [r0+r2*2], m0
    movq   [r1+r2*1], m0
    movq   [r1+r2*2], m0
    RET
215
216;-----------------------------------------------------------------------------
217; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
218;-----------------------------------------------------------------------------
219%macro PRED4x4_DL 1
220cglobal pred4x4_down_left_10_%1, 3,3
221    sub        r0, r2
222    movq       m0, [r0]
223    movhps     m0, [r1]
224    psrldq     m2, m0, 2
225    pslldq     m3, m0, 2
226    pshufhw    m2, m2, 10100100b
227    PRED4x4_LOWPASS m0, m3, m2, m0
228    lea        r1, [r0+r2*2]
229    movhps     [r1+r2*2], m0
230    psrldq     m0, 2
231    movq       [r0+r2*1], m0
232    psrldq     m0, 2
233    movq       [r0+r2*2], m0
234    psrldq     m0, 2
235    movq       [r1+r2*1], m0
236    RET
237%endmacro
238
239INIT_XMM
240PRED4x4_DL sse2
241%ifdef HAVE_AVX
242INIT_AVX
243PRED4x4_DL avx
244%endif
245
246;-----------------------------------------------------------------------------
247; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
248;-----------------------------------------------------------------------------
249%macro PRED4x4_VL 1
250cglobal pred4x4_vertical_left_10_%1, 3,3
251    sub        r0, r2
252    movu       m1, [r0]
253    movhps     m1, [r1]
254    psrldq     m0, m1, 2
255    psrldq     m2, m1, 4
256    pavgw      m4, m0, m1
257    PRED4x4_LOWPASS m0, m1, m2, m0
258    lea        r1, [r0+r2*2]
259    movq       [r0+r2*1], m4
260    movq       [r0+r2*2], m0
261    psrldq     m4, 2
262    psrldq     m0, 2
263    movq       [r1+r2*1], m4
264    movq       [r1+r2*2], m0
265    RET
266%endmacro
267
268INIT_XMM
269PRED4x4_VL sse2
270%ifdef HAVE_AVX
271INIT_AVX
272PRED4x4_VL avx
273%endif
274
275;-----------------------------------------------------------------------------
276; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
277;-----------------------------------------------------------------------------
278INIT_MMX
279cglobal pred4x4_horizontal_up_10_mmxext, 3,3
280    sub       r0, r2
281    lea       r1, [r0+r2*2]
282    movq      m0, [r0+r2*1-8]
283    punpckhwd m0, [r0+r2*2-8]
284    movq      m1, [r1+r2*1-8]
285    punpckhwd m1, [r1+r2*2-8]
286    punpckhdq m0, m1
287    pshufw    m1, m1, 0xFF
288    movq      [r1+r2*2], m1
289    movd      [r1+r2*1+4], m1
290    pshufw    m2, m0, 11111001b
291    movq      m1, m2
292    pavgw     m2, m0
293
294    pshufw    m5, m0, 11111110b
295    PRED4x4_LOWPASS m1, m0, m5, m1
296    movq      m6, m2
297    punpcklwd m6, m1
298    movq      [r0+r2*1], m6
299    psrlq     m2, 16
300    psrlq     m1, 16
301    punpcklwd m2, m1
302    movq      [r0+r2*2], m2
303    psrlq     m2, 32
304    movd      [r1+r2*1], m2
305    RET
306
307
308
309;-----------------------------------------------------------------------------
310; void pred8x8_vertical(pixel *src, int stride)
311;-----------------------------------------------------------------------------
312INIT_XMM
313cglobal pred8x8_vertical_10_sse2, 2,2
314    sub  r0, r1
315    mova m0, [r0]
316%rep 3
317    mova [r0+r1*1], m0
318    mova [r0+r1*2], m0
319    lea  r0, [r0+r1*2]
320%endrep
321    mova [r0+r1*1], m0
322    mova [r0+r1*2], m0
323    RET
324
325;-----------------------------------------------------------------------------
326; void pred8x8_horizontal(pixel *src, int stride)
327;-----------------------------------------------------------------------------
328INIT_XMM
329cglobal pred8x8_horizontal_10_sse2, 2,3
330    mov         r2d, 4
331.loop:
332    movq         m0, [r0+r1*0-8]
333    movq         m1, [r0+r1*1-8]
334    pshuflw      m0, m0, 0xff
335    pshuflw      m1, m1, 0xff
336    punpcklqdq   m0, m0
337    punpcklqdq   m1, m1
338    mova  [r0+r1*0], m0
339    mova  [r0+r1*1], m1
340    lea          r0, [r0+r1*2]
341    dec          r2d
342    jg .loop
343    REP_RET
344
345;-----------------------------------------------------------------------------
346; void predict_8x8_dc(pixel *src, int stride)
347;-----------------------------------------------------------------------------
348%macro MOV8 2-3
349; sort of a hack, but it works
350%if mmsize==8
351    movq    [%1+0], %2
352    movq    [%1+8], %3
353%else
354    movdqa    [%1], %2
355%endif
356%endmacro
357
; pred8x8_dc: chroma-style DC -- the 8x8 block is split into four 4x4
; quadrants, each filled from the DCs of its adjacent 4-pixel top/left
; neighbour groups (s0/s1 from the top, s2/s3 from the left).
; %1 = function suffix, %2 = word-shuffle instruction to use
; (pshufw for mmx, pshuflw for sse2).  r0 = src, r1 = stride in bytes.
%macro PRED8x8_DC 2
cglobal pred8x8_dc_10_%1, 2,6
    sub         r0, r1
    pxor        m4, m4            ; zero, for the rounding pavgw below
    movq        m0, [r0+0]        ; 4 top-left neighbours
    movq        m1, [r0+8]        ; 4 top-right neighbours
%if mmsize==16
    punpcklwd   m0, m1
    movhlps     m1, m0
    paddw       m0, m1
%else
    pshufw      m2, m0, 00001110b
    pshufw      m3, m1, 00001110b
    paddw       m0, m2
    paddw       m1, m3
    punpcklwd   m0, m1
%endif
    %2          m2, m0, 00001110b
    paddw       m0, m2            ; low words now hold s0, s1 (top sums)

    lea         r5, [r1*3]
    lea         r4, [r0+r1*4]     ; r4 -> middle of the block (row 3)
    movzx      r2d, word [r0+r1*1-2] ; sum the upper four left neighbours
    movzx      r3d, word [r0+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r0+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4-2]
    add        r2d, r3d
    movd        m2, r2d            ; s2

    movzx      r2d, word [r4+r1*1-2] ; sum the lower four left neighbours
    movzx      r3d, word [r4+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r1*4-2]
    add        r2d, r3d
    movd        m3, r2d            ; s3

    punpcklwd   m2, m3
    punpckldq   m0, m2            ; s0, s1, s2, s3
    %2          m3, m0, 11110110b ; s2, s1, s3, s3
    %2          m0, m0, 01110100b ; s0, s1, s3, s1
    paddw       m0, m3
    psrlw       m0, 2
    pavgw       m0, m4            ; (sum+4)>>3: s0+s2, s1, s3, s1+s3
%if mmsize==16
    punpcklwd   m0, m0            ; expand the four DCs to 4-pixel runs
    pshufd      m3, m0, 11111010b ; DCs for the lower two quadrants
    punpckldq   m0, m0            ; DCs for the upper two quadrants
    SWAP         0,1
%else
    pshufw      m1, m0, 0x00      ; broadcast one DC per quadrant
    pshufw      m2, m0, 0x55
    pshufw      m3, m0, 0xaa
    pshufw      m4, m0, 0xff
%endif
    MOV8   r0+r1*1, m1, m2        ; upper half
    MOV8   r0+r1*2, m1, m2
    MOV8   r0+r5*1, m1, m2
    MOV8   r0+r1*4, m1, m2
    MOV8   r4+r1*1, m3, m4        ; lower half
    MOV8   r4+r1*2, m3, m4
    MOV8   r4+r5*1, m3, m4
    MOV8   r4+r1*4, m3, m4
    RET
%endmacro

INIT_MMX
PRED8x8_DC mmxext, pshufw
INIT_XMM
PRED8x8_DC sse2  , pshuflw
431
432;-----------------------------------------------------------------------------
433; void pred8x8_top_dc(pixel *src, int stride)
434;-----------------------------------------------------------------------------
435INIT_XMM
436cglobal pred8x8_top_dc_10_sse2, 2,4
437    sub         r0, r1
438    mova        m0, [r0]
439    pshuflw     m1, m0, 0x4e
440    pshufhw     m1, m1, 0x4e
441    paddw       m0, m1
442    pshuflw     m1, m0, 0xb1
443    pshufhw     m1, m1, 0xb1
444    paddw       m0, m1
445    lea         r2, [r1*3]
446    lea         r3, [r0+r1*4]
447    paddw       m0, [pw_2]
448    psrlw       m0, 2
449    mova [r0+r1*1], m0
450    mova [r0+r1*2], m0
451    mova [r0+r2*1], m0
452    mova [r0+r1*4], m0
453    mova [r3+r1*1], m0
454    mova [r3+r1*2], m0
455    mova [r3+r2*1], m0
456    mova [r3+r1*4], m0
457    RET
458
459;-----------------------------------------------------------------------------
460; void pred8x8_plane(pixel *src, int stride)
461;-----------------------------------------------------------------------------
462INIT_XMM
463cglobal pred8x8_plane_10_sse2, 2,7,7
464    sub       r0, r1
465    lea       r2, [r1*3]
466    lea       r3, [r0+r1*4]
467    mova      m2, [r0]
468    pmaddwd   m2, [pw_m32101234]
469    HADDD     m2, m1
470    movd      m0, [r0-4]
471    psrld     m0, 14
472    psubw     m2, m0               ; H
473    movd      m0, [r3+r1*4-4]
474    movd      m1, [r0+12]
475    paddw     m0, m1
476    psllw     m0, 4                ; 16*(src[7*stride-1] + src[-stride+7])
477    movzx    r4d, word [r3+r1*1-2] ; src[4*stride-1]
478    movzx    r5d, word [r0+r2*1-2] ; src[2*stride-1]
479    sub      r4d, r5d
480    movzx    r6d, word [r3+r1*2-2] ; src[5*stride-1]
481    movzx    r5d, word [r0+r1*2-2] ; src[1*stride-1]
482    sub      r6d, r5d
483    lea      r4d, [r4+r6*2]
484    movzx    r5d, word [r3+r2*1-2] ; src[6*stride-1]
485    movzx    r6d, word [r0+r1*1-2] ; src[0*stride-1]
486    sub      r5d, r6d
487    lea      r5d, [r5*3]
488    add      r4d, r5d
489    movzx    r6d, word [r3+r1*4-2] ; src[7*stride-1]
490    movzx    r5d, word [r0+r1*0-2] ; src[ -stride-1]
491    sub      r6d, r5d
492    lea      r4d, [r4+r6*4]
493    movd      m3, r4d              ; V
494    punpckldq m2, m3
495    pmaddwd   m2, [pd_17]
496    paddd     m2, [pd_16]
497    psrad     m2, 5                ; b, c
498
499    mova      m3, [pw_pixel_max]
500    pxor      m1, m1
501    SPLATW    m0, m0, 1
502    SPLATW    m4, m2, 2
503    SPLATW    m2, m2, 0
504    pmullw    m2, [pw_m32101234]   ; b
505    pmullw    m5, m4, [pw_m3]      ; c
506    paddw     m5, [pw_16]
507    mov      r2d, 8
508    add       r0, r1
509.loop:
510    paddsw    m6, m2, m5
511    paddsw    m6, m0
512    psraw     m6, 5
513    CLIPW     m6, m1, m3
514    mova    [r0], m6
515    paddw     m5, m4
516    add       r0, r1
517    dec r2d
518    jg .loop
519    REP_RET
520
521
522;-----------------------------------------------------------------------------
523; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
524;-----------------------------------------------------------------------------
525%macro PRED8x8L_128_DC 1
526cglobal pred8x8l_128_dc_10_%1, 4,4
527    mova      m0, [pw_512] ; (1<<(BIT_DEPTH-1))
528    lea       r1, [r3*3]
529    lea       r2, [r0+r3*4]
530    MOV8 r0+r3*0, m0, m0
531    MOV8 r0+r3*1, m0, m0
532    MOV8 r0+r3*2, m0, m0
533    MOV8 r0+r1*1, m0, m0
534    MOV8 r2+r3*0, m0, m0
535    MOV8 r2+r3*1, m0, m0
536    MOV8 r2+r3*2, m0, m0
537    MOV8 r2+r1*1, m0, m0
538    RET
539%endmacro
540
541INIT_MMX
542PRED8x8L_128_DC mmxext
543INIT_XMM
544PRED8x8L_128_DC sse2
545
546;-----------------------------------------------------------------------------
547; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
548;-----------------------------------------------------------------------------
549%macro PRED8x8L_TOP_DC 1
550cglobal pred8x8l_top_dc_10_%1, 4,4,6
551    sub         r0, r3
552    mova        m0, [r0]
553    shr        r1d, 14
554    shr        r2d, 13
555    neg         r1
556    pslldq      m1, m0, 2
557    psrldq      m2, m0, 2
558    pinsrw      m1, [r0+r1], 0
559    pinsrw      m2, [r0+r2+14], 7
560    lea         r1, [r3*3]
561    lea         r2, [r0+r3*4]
562    PRED4x4_LOWPASS m0, m2, m1, m0
563    HADDW       m0, m1
564    paddw       m0, [pw_4]
565    psrlw       m0, 3
566    SPLATW      m0, m0, 0
567    mova [r0+r3*1], m0
568    mova [r0+r3*2], m0
569    mova [r0+r1*1], m0
570    mova [r0+r3*4], m0
571    mova [r2+r3*1], m0
572    mova [r2+r3*2], m0
573    mova [r2+r1*1], m0
574    mova [r2+r3*4], m0
575    RET
576%endmacro
577
578INIT_XMM
579PRED8x8L_TOP_DC sse2
580%ifdef HAVE_AVX
581INIT_AVX
582PRED8x8L_TOP_DC avx
583%endif
584
585;-----------------------------------------------------------------------------
586;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
587;-----------------------------------------------------------------------------
588;TODO: see if scalar is faster
589%macro PRED8x8L_DC 1
590cglobal pred8x8l_dc_10_%1, 4,6,6
591    sub         r0, r3
592    lea         r4, [r0+r3*4]
593    lea         r5, [r3*3]
594    mova        m0, [r0+r3*2-16]
595    punpckhwd   m0, [r0+r3*1-16]
596    mova        m1, [r4+r3*0-16]
597    punpckhwd   m1, [r0+r5*1-16]
598    punpckhdq   m1, m0
599    mova        m2, [r4+r3*2-16]
600    punpckhwd   m2, [r4+r3*1-16]
601    mova        m3, [r4+r3*4-16]
602    punpckhwd   m3, [r4+r5*1-16]
603    punpckhdq   m3, m2
604    punpckhqdq  m3, m1
605    mova        m0, [r0]
606    shr        r1d, 14
607    shr        r2d, 13
608    neg         r1
609    pslldq      m1, m0, 2
610    psrldq      m2, m0, 2
611    pinsrw      m1, [r0+r1], 0
612    pinsrw      m2, [r0+r2+14], 7
613    not         r1
614    and         r1, r3
615    pslldq      m4, m3, 2
616    psrldq      m5, m3, 2
617    pshuflw     m4, m4, 11100101b
618    pinsrw      m5, [r0+r1-2], 7
619    PRED4x4_LOWPASS m3, m4, m5, m3
620    PRED4x4_LOWPASS m0, m2, m1, m0
621    paddw       m0, m3
622    HADDW       m0, m1
623    paddw       m0, [pw_8]
624    psrlw       m0, 4
625    SPLATW      m0, m0
626    mova [r0+r3*1], m0
627    mova [r0+r3*2], m0
628    mova [r0+r5*1], m0
629    mova [r0+r3*4], m0
630    mova [r4+r3*1], m0
631    mova [r4+r3*2], m0
632    mova [r4+r5*1], m0
633    mova [r4+r3*4], m0
634    RET
635%endmacro
636
637INIT_XMM
638PRED8x8L_DC sse2
639%ifdef HAVE_AVX
640INIT_AVX
641PRED8x8L_DC avx
642%endif
643
644;-----------------------------------------------------------------------------
645; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
646;-----------------------------------------------------------------------------
647%macro PRED8x8L_VERTICAL 1
648cglobal pred8x8l_vertical_10_%1, 4,4,6
649    sub         r0, r3
650    mova        m0, [r0]
651    shr        r1d, 14
652    shr        r2d, 13
653    neg         r1
654    pslldq      m1, m0, 2
655    psrldq      m2, m0, 2
656    pinsrw      m1, [r0+r1], 0
657    pinsrw      m2, [r0+r2+14], 7
658    lea         r1, [r3*3]
659    lea         r2, [r0+r3*4]
660    PRED4x4_LOWPASS m0, m2, m1, m0
661    mova [r0+r3*1], m0
662    mova [r0+r3*2], m0
663    mova [r0+r1*1], m0
664    mova [r0+r3*4], m0
665    mova [r2+r3*1], m0
666    mova [r2+r3*2], m0
667    mova [r2+r1*1], m0
668    mova [r2+r3*4], m0
669    RET
670%endmacro
671
672INIT_XMM
673PRED8x8L_VERTICAL sse2
674%ifdef HAVE_AVX
675INIT_AVX
676PRED8x8L_VERTICAL avx
677%endif
678
679;-----------------------------------------------------------------------------
680; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
681;-----------------------------------------------------------------------------
682%macro PRED8x8L_HORIZONTAL 1
683cglobal pred8x8l_horizontal_10_%1, 4,4,5
684    mova        m0, [r0-16]
685    shr        r1d, 14
686    dec         r1
687    and         r1, r3
688    sub         r1, r3
689    punpckhwd   m0, [r0+r1-16]
690    mova        m1, [r0+r3*2-16]
691    punpckhwd   m1, [r0+r3*1-16]
692    lea         r2, [r0+r3*4]
693    lea         r1, [r3*3]
694    punpckhdq   m1, m0
695    mova        m2, [r2+r3*0-16]
696    punpckhwd   m2, [r0+r1-16]
697    mova        m3, [r2+r3*2-16]
698    punpckhwd   m3, [r2+r3*1-16]
699    punpckhdq   m3, m2
700    punpckhqdq  m3, m1
701    PALIGNR     m4, m3, [r2+r1-16], 14, m0
702    pslldq      m0, m4, 2
703    pshuflw     m0, m0, 11100101b
704    PRED4x4_LOWPASS m4, m3, m0, m4
705    punpckhwd   m3, m4, m4
706    punpcklwd   m4, m4
707    pshufd      m0, m3, 0xff
708    pshufd      m1, m3, 0xaa
709    pshufd      m2, m3, 0x55
710    pshufd      m3, m3, 0x00
711    mova [r0+r3*0], m0
712    mova [r0+r3*1], m1
713    mova [r0+r3*2], m2
714    mova [r0+r1*1], m3
715    pshufd      m0, m4, 0xff
716    pshufd      m1, m4, 0xaa
717    pshufd      m2, m4, 0x55
718    pshufd      m3, m4, 0x00
719    mova [r2+r3*0], m0
720    mova [r2+r3*1], m1
721    mova [r2+r3*2], m2
722    mova [r2+r1*1], m3
723    RET
724%endmacro
725
726INIT_XMM
727%define PALIGNR PALIGNR_MMX
728PRED8x8L_HORIZONTAL sse2
729%define PALIGNR PALIGNR_SSSE3
730PRED8x8L_HORIZONTAL ssse3
731%ifdef HAVE_AVX
732INIT_AVX
733PRED8x8L_HORIZONTAL avx
734%endif
735
736;-----------------------------------------------------------------------------
737;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
738;-----------------------------------------------------------------------------
739%macro PRED8x8L_DOWN_LEFT 1
740cglobal pred8x8l_down_left_10_%1, 4,4,7
741    sub         r0, r3
742    mova        m3, [r0]
743    shr        r1d, 14
744    neg         r1
745    shr        r2d, 13
746    pslldq      m1, m3, 2
747    psrldq      m2, m3, 2
748    pinsrw      m1, [r0+r1], 0
749    pinsrw      m2, [r0+r2+14], 7
750    PRED4x4_LOWPASS m6, m2, m1, m3
751    jz .fix_tr ; flags from shr r2d
752    mova        m1, [r0+16]
753    psrldq      m5, m1, 2
754    PALIGNR     m2, m1, m3, 14, m3
755    pshufhw     m5, m5, 10100100b
756    PRED4x4_LOWPASS m1, m2, m5, m1
757.do_topright:
758    lea         r1, [r3*3]
759    psrldq      m5, m1, 14
760    lea         r2, [r0+r3*4]
761    PALIGNR     m2, m1, m6,  2, m0
762    PALIGNR     m3, m1, m6, 14, m0
763    PALIGNR     m5, m1,  2, m0
764    pslldq      m4, m6, 2
765    PRED4x4_LOWPASS m6, m4, m2, m6
766    PRED4x4_LOWPASS m1, m3, m5, m1
767    mova [r2+r3*4], m1
768    PALIGNR     m1, m6, 14, m2
769    pslldq      m6, 2
770    mova [r2+r1*1], m1
771    PALIGNR     m1, m6, 14, m2
772    pslldq      m6, 2
773    mova [r2+r3*2], m1
774    PALIGNR     m1, m6, 14, m2
775    pslldq      m6, 2
776    mova [r2+r3*1], m1
777    PALIGNR     m1, m6, 14, m2
778    pslldq      m6, 2
779    mova [r0+r3*4], m1
780    PALIGNR     m1, m6, 14, m2
781    pslldq      m6, 2
782    mova [r0+r1*1], m1
783    PALIGNR     m1, m6, 14, m2
784    pslldq      m6, 2
785    mova [r0+r3*2], m1
786    PALIGNR     m1, m6, 14, m6
787    mova [r0+r3*1], m1
788    RET
789.fix_tr:
790    punpckhwd   m3, m3
791    pshufd      m1, m3, 0xFF
792    jmp .do_topright
793%endmacro
794
795INIT_XMM
796%define PALIGNR PALIGNR_MMX
797PRED8x8L_DOWN_LEFT sse2
798%define PALIGNR PALIGNR_SSSE3
799PRED8x8L_DOWN_LEFT ssse3
800%ifdef HAVE_AVX
801INIT_AVX
802PRED8x8L_DOWN_LEFT avx
803%endif
804
805;-----------------------------------------------------------------------------
806;void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
807;-----------------------------------------------------------------------------
808%macro PRED8x8L_DOWN_RIGHT 1
809; standard forbids this when has_topleft is false
810; no need to check
811cglobal pred8x8l_down_right_10_%1, 4,5,8
812    sub         r0, r3
813    lea         r4, [r0+r3*4]
814    lea         r1, [r3*3]
815    mova        m0, [r0+r3*1-16]
816    punpckhwd   m0, [r0+r3*0-16]
817    mova        m1, [r0+r1*1-16]
818    punpckhwd   m1, [r0+r3*2-16]
819    punpckhdq   m1, m0
820    mova        m2, [r4+r3*1-16]
821    punpckhwd   m2, [r4+r3*0-16]
822    mova        m3, [r4+r1*1-16]
823    punpckhwd   m3, [r4+r3*2-16]
824    punpckhdq   m3, m2
825    punpckhqdq  m3, m1
826    mova        m0, [r4+r3*4-16]
827    mova        m1, [r0]
828    PALIGNR     m4, m3, m0, 14, m0
829    PALIGNR     m1, m3,  2, m2
830    pslldq      m0, m4, 2
831    pshuflw     m0, m0, 11100101b
832    PRED4x4_LOWPASS m6, m1, m4, m3
833    PRED4x4_LOWPASS m4, m3, m0, m4
834    mova        m3, [r0]
835    shr        r2d, 13
836    pslldq      m1, m3, 2
837    psrldq      m2, m3, 2
838    pinsrw      m1, [r0-2], 0
839    pinsrw      m2, [r0+r2+14], 7
840    PRED4x4_LOWPASS m3, m2, m1, m3
841    PALIGNR     m2, m3, m6,  2, m0
842    PALIGNR     m5, m3, m6, 14, m0
843    psrldq      m7, m3, 2
844    PRED4x4_LOWPASS m6, m4, m2, m6
845    PRED4x4_LOWPASS m3, m5, m7, m3
846    mova [r4+r3*4], m6
847    PALIGNR     m3, m6, 14, m2
848    pslldq      m6, 2
849    mova [r0+r3*1], m3
850    PALIGNR     m3, m6, 14, m2
851    pslldq      m6, 2
852    mova [r0+r3*2], m3
853    PALIGNR     m3, m6, 14, m2
854    pslldq      m6, 2
855    mova [r0+r1*1], m3
856    PALIGNR     m3, m6, 14, m2
857    pslldq      m6, 2
858    mova [r0+r3*4], m3
859    PALIGNR     m3, m6, 14, m2
860    pslldq      m6, 2
861    mova [r4+r3*1], m3
862    PALIGNR     m3, m6, 14, m2
863    pslldq      m6, 2
864    mova [r4+r3*2], m3
865    PALIGNR     m3, m6, 14, m6
866    mova [r4+r1*1], m3
867    RET
868%endmacro
869
870INIT_XMM
871%define PALIGNR PALIGNR_MMX
872PRED8x8L_DOWN_RIGHT sse2
873%define PALIGNR PALIGNR_SSSE3
874PRED8x8L_DOWN_RIGHT ssse3
875%ifdef HAVE_AVX
876INIT_AVX
877PRED8x8L_DOWN_RIGHT avx
878%endif
879
880;-----------------------------------------------------------------------------
881; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
882;-----------------------------------------------------------------------------
883%macro PRED8x8L_VERTICAL_RIGHT 1
884; likewise with 8x8l_down_right
885cglobal pred8x8l_vertical_right_10_%1, 4,5,7
886    sub         r0, r3
887    lea         r4, [r0+r3*4]
888    lea         r1, [r3*3]
889    mova        m0, [r0+r3*1-16]
890    punpckhwd   m0, [r0+r3*0-16]
891    mova        m1, [r0+r1*1-16]
892    punpckhwd   m1, [r0+r3*2-16]
893    punpckhdq   m1, m0
894    mova        m2, [r4+r3*1-16]
895    punpckhwd   m2, [r4+r3*0-16]
896    mova        m3, [r4+r1*1-16]
897    punpckhwd   m3, [r4+r3*2-16]
898    punpckhdq   m3, m2
899    punpckhqdq  m3, m1
900    mova        m0, [r4+r3*4-16]
901    mova        m1, [r0]
902    PALIGNR     m4, m3, m0, 14, m0
903    PALIGNR     m1, m3,  2, m2
904    PRED4x4_LOWPASS m3, m1, m4, m3
905    mova        m2, [r0]
906    shr        r2d, 13
907    pslldq      m1, m2, 2
908    psrldq      m5, m2, 2
909    pinsrw      m1, [r0-2], 0
910    pinsrw      m5, [r0+r2+14], 7
911    PRED4x4_LOWPASS m2, m5, m1, m2
912    PALIGNR     m6, m2, m3, 12, m1
913    PALIGNR     m5, m2, m3, 14, m0
914    PRED4x4_LOWPASS m0, m6, m2, m5
915    pavgw       m2, m5
916    mova [r0+r3*2], m0
917    mova [r0+r3*1], m2
918    pslldq      m6, m3, 4
919    pslldq      m1, m3, 2
920    PRED4x4_LOWPASS m1, m3, m6, m1
921    PALIGNR     m2, m1, 14, m4
922    mova [r0+r1*1], m2
923    pslldq      m1, 2
924    PALIGNR     m0, m1, 14, m3
925    mova [r0+r3*4], m0
926    pslldq      m1, 2
927    PALIGNR     m2, m1, 14, m4
928    mova [r4+r3*1], m2
929    pslldq      m1, 2
930    PALIGNR     m0, m1, 14, m3
931    mova [r4+r3*2], m0
932    pslldq      m1, 2
933    PALIGNR     m2, m1, 14, m4
934    mova [r4+r1*1], m2
935    pslldq      m1, 2
936    PALIGNR     m0, m1, 14, m1
937    mova [r4+r3*4], m0
938    RET
939%endmacro
940
941INIT_XMM
942%define PALIGNR PALIGNR_MMX
943PRED8x8L_VERTICAL_RIGHT sse2
944%define PALIGNR PALIGNR_SSSE3
945PRED8x8L_VERTICAL_RIGHT ssse3
946%ifdef HAVE_AVX
947INIT_AVX
948PRED8x8L_VERTICAL_RIGHT avx
949%endif
950
951;-----------------------------------------------------------------------------
952; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
953;-----------------------------------------------------------------------------
954%macro PRED8x8L_HORIZONTAL_UP 1
955cglobal pred8x8l_horizontal_up_10_%1, 4,4,6
956    mova        m0, [r0+r3*0-16]
957    punpckhwd   m0, [r0+r3*1-16]
958    shr        r1d, 14
959    dec         r1
960    and         r1, r3
961    sub         r1, r3
962    mova        m4, [r0+r1*1-16]
963    lea         r1, [r3*3]
964    lea         r2, [r0+r3*4]
965    mova        m1, [r0+r3*2-16]
966    punpckhwd   m1, [r0+r1*1-16]
967    punpckhdq   m0, m1
968    mova        m2, [r2+r3*0-16]
969    punpckhwd   m2, [r2+r3*1-16]
970    mova        m3, [r2+r3*2-16]
971    punpckhwd   m3, [r2+r1*1-16]
972    punpckhdq   m2, m3
973    punpckhqdq  m0, m2
974    PALIGNR     m1, m0, m4, 14, m4
975    psrldq      m2, m0, 2
976    pshufhw     m2, m2, 10100100b
977    PRED4x4_LOWPASS m0, m1, m2, m0
978    psrldq      m1, m0, 2
979    psrldq      m2, m0, 4
980    pshufhw     m1, m1, 10100100b
981    pshufhw     m2, m2, 01010100b
982    pavgw       m4, m0, m1
983    PRED4x4_LOWPASS m1, m2, m0, m1
984    punpckhwd   m5, m4, m1
985    punpcklwd   m4, m1
986    mova [r2+r3*0], m5
987    mova [r0+r3*0], m4
988    pshufd      m0, m5, 11111001b
989    pshufd      m1, m5, 11111110b
990    pshufd      m2, m5, 11111111b
991    mova [r2+r3*1], m0
992    mova [r2+r3*2], m1
993    mova [r2+r1*1], m2
994    PALIGNR     m2, m5, m4, 4, m0
995    PALIGNR     m3, m5, m4, 8, m1
996    PALIGNR     m5, m5, m4, 12, m4
997    mova [r0+r3*1], m2
998    mova [r0+r3*2], m3
999    mova [r0+r1*1], m5
1000    RET
1001%endmacro
1002
1003INIT_XMM
1004%define PALIGNR PALIGNR_MMX
1005PRED8x8L_HORIZONTAL_UP sse2
1006%define PALIGNR PALIGNR_SSSE3
1007PRED8x8L_HORIZONTAL_UP ssse3
1008%ifdef HAVE_AVX
1009INIT_AVX
1010PRED8x8L_HORIZONTAL_UP avx
1011%endif
1012
1013
1014;-----------------------------------------------------------------------------
1015; void pred16x16_vertical(pixel *src, int stride)
1016;-----------------------------------------------------------------------------
1017%macro MOV16 3-5
1018    mova [%1+     0], %2
1019    mova [%1+mmsize], %3
1020%if mmsize==8
1021    mova [%1+    16], %4
1022    mova [%1+    24], %5
1023%endif
1024%endmacro
1025
; Replicate the row above the block (src - stride) into all 16 rows.
; r0 = src, r1 = stride in bytes, r2 = row-pair counter.
%macro PRED16x16_VERTICAL 1
cglobal pred16x16_vertical_10_%1, 2,3
    sub   r0, r1
    mov  r2d, 8
    mova  m0, [r0+ 0]          ; source row held in 2 xmm / 4 mmx registers
    mova  m1, [r0+mmsize]
%if mmsize==8
    mova  m2, [r0+16]
    mova  m3, [r0+24]
%endif
.loop:
    MOV16 r0+r1*1, m0, m1, m2, m3
    MOV16 r0+r1*2, m0, m1, m2, m3
    lea   r0, [r0+r1*2]
    dec   r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_VERTICAL mmxext
INIT_XMM
PRED16x16_VERTICAL sse2
1049
1050;-----------------------------------------------------------------------------
1051; void pred16x16_horizontal(pixel *src, int stride)
1052;-----------------------------------------------------------------------------
; Horizontal prediction: fill each row with its left-neighbour pixel.
; r0 = src, r1 = stride in bytes; two rows are produced per iteration.
%macro PRED16x16_HORIZONTAL 1
cglobal pred16x16_horizontal_10_%1, 2,3
    mov   r2d, 8              ; 8 iterations x 2 rows = 16 rows
.vloop:
    movd   m0, [r0+r1*0-4]    ; load 4 bytes; word 1 = pixel at src-2
    movd   m1, [r0+r1*1-4]
    SPLATW m0, m0, 1          ; broadcast word 1 (the left neighbour)
    SPLATW m1, m1, 1
    MOV16  r0+r1*0, m0, m0, m0, m0
    MOV16  r0+r1*1, m1, m1, m1, m1
    lea    r0, [r0+r1*2]
    dec    r2d
    jg .vloop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_HORIZONTAL mmxext
INIT_XMM
PRED16x16_HORIZONTAL sse2
1073
1074;-----------------------------------------------------------------------------
1075; void pred16x16_dc(pixel *src, int stride)
1076;-----------------------------------------------------------------------------
; DC prediction: fill the block with (sum(16 top) + sum(16 left) + 16) >> 5.
; For 10-bit pixels the 32-pixel sum (+16) is at most 32*1023+16 < 65536,
; so 16-bit word arithmetic cannot overflow.
%macro PRED16x16_DC 1
cglobal pred16x16_dc_10_%1, 2,6
    mov        r5, r0          ; keep original src pointer for the store loop
    sub        r0, r1          ; r0 -> row above the block
    mova       m0, [r0+0]      ; sum the 16 top pixels as words
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2          ; horizontal add -> top sum in low word of m0

    lea        r0, [r0+r1-2]   ; r0 -> left neighbour of row 0 (src-2)
    movzx     r3d, word [r0]   ; accumulate the 16 left pixels in r3d/r4d
    movzx     r4d, word [r0+r1]
%rep 7
    lea        r0, [r0+r1*2]   ; advance two rows per unrolled step
    movzx     r2d, word [r0]
    add       r3d, r2d
    movzx     r2d, word [r0+r1]
    add       r4d, r2d
%endrep
    lea       r3d, [r3+r4+16]  ; left sum + rounding bias

    movd       m1, r3d
    paddw      m0, m1          ; total = top sum + left sum + 16
    psrlw      m0, 5           ; dc = total >> 5
    SPLATW     m0, m0          ; broadcast dc to all words
    mov       r3d, 8           ; 8 iterations x 2 rows = 16 rows
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec       r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_DC mmxext
INIT_XMM
PRED16x16_DC sse2
1119
1120;-----------------------------------------------------------------------------
1121; void pred16x16_top_dc(pixel *src, int stride)
1122;-----------------------------------------------------------------------------
; Top-DC prediction (left neighbours unavailable): fill the block with
; (sum(16 top pixels) + 8) >> 4.  Word sums cannot overflow for 10-bit
; input (16*1023+8 < 65536).
%macro PRED16x16_TOP_DC 1
cglobal pred16x16_top_dc_10_%1, 2,3
    sub        r0, r1          ; r0 -> row above the block
    mova       m0, [r0+0]      ; sum the 16 top pixels as words
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2          ; horizontal add -> sum in low word

    SPLATW     m0, m0          ; broadcast before rounding/shift
    paddw      m0, [pw_8]      ; + rounding bias
    psrlw      m0, 4           ; dc = (sum + 8) >> 4
    mov       r2d, 8           ; 8 iterations x 2 rows = 16 rows
.loop:
    MOV16 r0+r1*1, m0, m0, m0, m0
    MOV16 r0+r1*2, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec       r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_TOP_DC mmxext
INIT_XMM
PRED16x16_TOP_DC sse2
1151
1152;-----------------------------------------------------------------------------
1153; void pred16x16_left_dc(pixel *src, int stride)
1154;-----------------------------------------------------------------------------
; Left-DC prediction (top neighbours unavailable): fill the block with
; (sum(16 left pixels) + 8) >> 4, summed with scalar loads.
%macro PRED16x16_LEFT_DC 1
cglobal pred16x16_left_dc_10_%1, 2,6
    mov        r5, r0          ; keep src pointer for the store loop

    sub        r0, 2           ; r0 -> left neighbour of row 0
    movzx     r3d, word [r0]   ; accumulate 16 left pixels in r3d/r4d
    movzx     r4d, word [r0+r1]
%rep 7
    lea        r0, [r0+r1*2]   ; advance two rows per unrolled step
    movzx     r2d, word [r0]
    add       r3d, r2d
    movzx     r2d, word [r0+r1]
    add       r4d, r2d
%endrep
    lea       r3d, [r3+r4+8]   ; + rounding bias
    shr       r3d, 4           ; dc = (sum + 8) >> 4

    movd       m0, r3d
    SPLATW     m0, m0          ; broadcast dc to all words
    mov       r3d, 8           ; 8 iterations x 2 rows = 16 rows
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec       r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_LEFT_DC mmxext
INIT_XMM
PRED16x16_LEFT_DC sse2
1188
1189;-----------------------------------------------------------------------------
1190; void pred16x16_128_dc(pixel *src, int stride)
1191;-----------------------------------------------------------------------------
; 128-DC prediction (no neighbours available): fill the block with the
; mid-grey value, 512 = 1 << (10-1) for 10-bit pixels.
%macro PRED16x16_128_DC 1
cglobal pred16x16_128_dc_10_%1, 2,3
    mova       m0, [pw_512]    ; broadcast constant 512
    mov       r2d, 8           ; 8 iterations x 2 rows = 16 rows
.loop:
    MOV16 r0+r1*0, m0, m0, m0, m0
    MOV16 r0+r1*1, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec       r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_128_DC mmxext
INIT_XMM
PRED16x16_128_DC sse2
1209