1;******************************************************************************
2;* H.264 intra prediction asm optimizations
3;* Copyright (c) 2010 Fiona Glaser
4;* Copyright (c) 2010 Holger Lubitz
5;* Copyright (c) 2010 Loren Merritt
6;* Copyright (c) 2010 Ronald S. Bultje
7;*
8;* This file is part of FFmpeg.
9;*
10;* FFmpeg is free software; you can redistribute it and/or
11;* modify it under the terms of the GNU Lesser General Public
12;* License as published by the Free Software Foundation; either
13;* version 2.1 of the License, or (at your option) any later version.
14;*
15;* FFmpeg is distributed in the hope that it will be useful,
16;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18;* Lesser General Public License for more details.
19;*
20;* You should have received a copy of the GNU Lesser General Public
21;* License along with FFmpeg; if not, write to the Free Software
22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23;******************************************************************************
24
25%include "libavutil/x86/x86util.asm"
26
SECTION_RODATA

; shuffle mask: replicate byte 3 into the low byte of each word, zero the
; high byte (0x80 = pshufb "write zero") -> unpacks the left-column byte
; straight to 8 words for the ssse3 tm_vp8 predictor
tm_shuf: times 8 db 0x03, 0x80
pw_ff00: times 8 dw 0xff00
; pmaddubsw weights for the 16x16 plane H coefficient: -8..-1 then 1..8
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
; pmaddubsw weights for the 8x8 plane H coefficient: -4..-1 / 1..4, padded
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw  0,  1,  2,  3,  4,  5,  6,  7
pw_1to8:     dw  1,  2,  3,  4,  5,  6,  7,  8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4
39
40SECTION .text
41
42cextern pb_1
43cextern pb_3
44cextern pw_4
45cextern pw_5
46cextern pw_8
47cextern pw_16
48cextern pw_17
49cextern pw_32
50
51;-----------------------------------------------------------------------------
52; void ff_pred16x16_vertical_8(uint8_t *src, int stride)
53;-----------------------------------------------------------------------------
54
INIT_MMX mmx
; Replicate the row above the block into all 16 rows.
; r0 = src, r1 = stride.
cglobal pred16x16_vertical_8, 2,3
    sub   r0, r1                ; r0 -> top-neighbor row
    mov   r2, 8                 ; 8 iterations, 2 rows each = 16 rows
    movq mm0, [r0+0]            ; left half of the top row
    movq mm1, [r0+8]            ; right half of the top row
.loop:
    movq [r0+r1*1+0], mm0
    movq [r0+r1*1+8], mm1
    movq [r0+r1*2+0], mm0
    movq [r0+r1*2+8], mm1
    lea   r0, [r0+r1*2]         ; advance two rows
    dec   r2
    jg .loop
    REP_RET
70
INIT_XMM sse
; Same as the MMX version, but one 16-byte store per row.
; r0 = src, r1 = stride.
cglobal pred16x16_vertical_8, 2,3
    sub   r0, r1                ; r0 -> top-neighbor row
    mov   r2, 4                 ; 4 iterations, 4 rows each = 16 rows
    movaps xmm0, [r0]           ; whole 16-byte top row
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
    REP_RET
86
87;-----------------------------------------------------------------------------
88; void ff_pred16x16_horizontal_8(uint8_t *src, int stride)
89;-----------------------------------------------------------------------------
90
; Fill each row of the 16x16 block with its left-neighbor byte (src[-1]).
; Two rows are produced per loop iteration. The MMX paths splat via
; punpcklbw+SPLATW and need a second 8-byte store per row; the ssse3 path
; splats byte 3 of the 4-byte load with pshufb and stores 16 bytes at once.
%macro PRED16x16_H 0
cglobal pred16x16_horizontal_8, 2,3
    mov       r2, 8             ; 8 iterations x 2 rows
%if cpuflag(ssse3)
    mova      m2, [pb_3]        ; pshufb mask: broadcast byte 3
%endif
.loop:
    movd      m0, [r0+r1*0-4]   ; left pixel is the top byte of this dword
    movd      m1, [r0+r1*1-4]

%if cpuflag(ssse3)
    pshufb    m0, m2            ; broadcast src[-1] to all 16 bytes
    pshufb    m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
    SPLATW    m0, m0, 3         ; broadcast the duplicated left byte
    SPLATW    m1, m1, 3
    mova [r0+r1*0+8], m0        ; mmsize==8: also cover the right half
    mova [r0+r1*1+8], m1
%endif

    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED16x16_H
INIT_MMX mmxext
PRED16x16_H
INIT_XMM ssse3
PRED16x16_H
127
128;-----------------------------------------------------------------------------
129; void ff_pred16x16_dc_8(uint8_t *src, int stride)
130;-----------------------------------------------------------------------------
131
; 16x16 DC prediction: dc = (sum of 16 top + 16 left neighbors + 16) >> 5,
; broadcast to the whole block.
; Register roles: r4 = saved src pointer, r5d/r6d = two running partial sums
; of left-column bytes (interleaved to break the dependency chain),
; r2d/r3d = scratch byte loads. Top-row sum comes from two psadbw's.
%macro PRED16x16_DC 0
cglobal pred16x16_dc_8, 2,7
    mov       r4, r0            ; keep src for the store loop
    sub       r0, r1            ; r0 -> top-neighbor row
    pxor      mm0, mm0
    pxor      mm1, mm1
    psadbw    mm0, [r0+0]       ; sum of top bytes 0..7
    psadbw    mm1, [r0+8]       ; sum of top bytes 8..15
    dec        r0               ; r0 -> one byte left of the top row
    movzx     r5d, byte [r0+r1*1]
    paddw     mm0, mm1
    movd      r6d, mm0          ; r6d = top-row sum
    lea        r0, [r0+r1*2]
%rep 7
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0] ; 16th (last) left pixel
    add       r5d, r6d
    lea       r2d, [r2+r5+16]   ; total + rounding bias
    shr       r2d, 5            ; /32
%if cpuflag(ssse3)
    pxor       m1, m1           ; zero reg needed by SPLATB_REG's pshufb
%endif
    SPLATB_REG m0, r2, m1       ; broadcast dc byte across m0

%if mmsize==8
    mov       r3d, 8
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov       r3d, 4
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea   r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC
INIT_XMM ssse3
PRED16x16_DC
189
190;-----------------------------------------------------------------------------
191; void ff_pred16x16_tm_vp8_8(uint8_t *src, int stride)
192;-----------------------------------------------------------------------------
193
; VP8 TrueMotion 16x16: pred[y][x] = clip(top[x] + left[y] - topleft).
; The top row is unpacked once to words in mm0..mm3; each iteration computes
; (left[y] - topleft), broadcasts it, adds it to the top row and packs with
; unsigned saturation (the clip). Note mm7 starts as the zero register for
; unpacking and is then reused as a plain accumulator inside the loop.
%macro PRED16x16_TM 0
cglobal pred16x16_tm_vp8_8, 2,5
    sub        r0, r1           ; r0 -> top-neighbor row
    pxor      mm7, mm7
    movq      mm0, [r0+0]
    movq      mm2, [r0+8]
    movq      mm1, mm0
    movq      mm3, mm2
    punpcklbw mm0, mm7          ; top[0..3] as words
    punpckhbw mm1, mm7          ; top[4..7]
    punpcklbw mm2, mm7          ; top[8..11]
    punpckhbw mm3, mm7          ; top[12..15]
    movzx     r3d, byte [r0-1]  ; r3d = topleft
    mov       r4d, 16           ; one row per iteration
.loop:
    movzx     r2d, byte [r0+r1-1] ; left pixel of the row being written
    sub       r2d, r3d          ; left - topleft
    movd      mm4, r2d
    SPLATW    mm4, mm4, 0       ; broadcast the delta to 4 words
    movq      mm5, mm4
    movq      mm6, mm4
    movq      mm7, mm4
    paddw     mm4, mm0
    paddw     mm5, mm1
    paddw     mm6, mm2
    paddw     mm7, mm3
    packuswb  mm4, mm5          ; saturating pack = clip to [0,255]
    packuswb  mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add        r0, r1
    dec       r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED16x16_TM
INIT_MMX mmxext
PRED16x16_TM
234
INIT_XMM sse2
; SSE2 TrueMotion 16x16: same math as the MMX version but the whole top row
; lives in xmm0/xmm1 (low/high 8 pixels as words) and two rows are emitted
; per iteration.
cglobal pred16x16_tm_vp8_8, 2,6,6
    sub          r0, r1         ; r0 -> top-neighbor row
    pxor       xmm2, xmm2
    movdqa     xmm0, [r0]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2       ; top[0..7] as words
    punpckhbw  xmm1, xmm2       ; top[8..15] as words
    movzx       r4d, byte [r0-1] ; topleft
    mov         r5d, 8          ; 8 iterations x 2 rows
.loop:
    movzx       r2d, byte [r0+r1*1-1] ; left pixel, first row
    movzx       r3d, byte [r0+r1*2-1] ; left pixel, second row
    sub         r2d, r4d        ; left - topleft
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm4, r3d
    pshuflw    xmm2, xmm2, 0    ; broadcast delta to low 4 words...
    pshuflw    xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2       ; ...then to all 8 words
    punpcklqdq xmm4, xmm4
    movdqa     xmm3, xmm2
    movdqa     xmm5, xmm4
    paddw      xmm2, xmm0
    paddw      xmm3, xmm1
    paddw      xmm4, xmm0
    paddw      xmm5, xmm1
    packuswb   xmm2, xmm3       ; saturating pack = clip to [0,255]
    packuswb   xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET
270
271;-----------------------------------------------------------------------------
272; void ff_pred16x16_plane_*_8(uint8_t *src, int stride)
273;-----------------------------------------------------------------------------
274
; 16x16 plane prediction, parameterized on the rounding variant:
; %1 = h264 ((5*S+32)>>6), rv40 ((5*S)>>6) or svq3 (two-step (5*(S/4))/16).
; Phase 1 computes the weighted sum of top-row differences (H coefficient)
; in the SIMD unit; phase 2 computes the weighted left-column sum (V
; coefficient) in scalar registers; phase 3 broadcasts a/H/V and walks the
; block adding H per column and V per row, with >>5 and saturating pack.
; Scalar roles in phase 2: r5 accumulates the V sum, r3/r4 walk the left
; column from both ends, e_reg (r8 on x86-64, reusing r0 on x86-32) holds
; the "far" byte of each weighted pair; r1 = -stride, r2 = +stride.
%macro H264_PRED16x16_PLANE 1
cglobal pred16x16_plane_%1_8, 2,9,7
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movh         m0, [r0+r1  -1]  ; top row, bytes -1..6
%if mmsize == 8
    pxor         m4, m4
    movh         m1, [r0+r1  +3 ]
    movh         m2, [r0+r1  +8 ]
    movh         m3, [r0+r1  +12]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    pmullw       m0, [pw_m8tom1  ] ; weight left half by -8..-1
    pmullw       m1, [pw_m8tom1+8]
    pmullw       m2, [pw_1to8    ] ; weight right half by 1..8
    pmullw       m3, [pw_1to8  +8]
    paddw        m0, m2
    paddw        m1, m3
%else ; mmsize == 16
%if cpuflag(ssse3)
    movhps       m0, [r0+r1  +8]
    pmaddubsw    m0, [plane_shuf] ; H coefficients
%else ; sse2
    pxor         m2, m2
    movh         m1, [r0+r1  +8]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m8tom1]
    pmullw       m1, [pw_1to8]
    paddw        m0, m1
%endif
    movhlps      m1, m0
%endif
    ; horizontal reduction of the weighted words down to one word in m0
    paddw        m0, m1
%if cpuflag(mmxext)
    PSHUFLW      m1, m0, 0xE
%elif cpuflag(mmx)
    mova         m1, m0
    psrlq        m1, 32
%endif
    paddw        m0, m1
%if cpuflag(mmxext)
    PSHUFLW      m1, m0, 0x1
%elif cpuflag(mmx)
    mova         m1, m0
    psrlq        m1, 16
%endif
    paddw        m0, m1           ; sum of H coefficients

    lea          r4, [r0+r2*8-1]  ; r4 -> left column, lower half
    lea          r3, [r0+r2*4-1]  ; r3 -> left column, upper half
    add          r4, r2

%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0                  ; x86-32: sacrifice r0, reload from stack later
%endif

    ; V = sum of i*(left[7+i] - left[7-i]) built up with lea-based multiplies
    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg        ; weight 1

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r5, [r5+r6*2]    ; weight 2

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]    ; weight 4

    movzx     e_reg, byte [r3        ]
%if ARCH_X86_64
    movzx        r7, byte [r4+r2     ]
    sub          r7, e_reg        ; weight-3 term kept aside in r7
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6           ; weight 3 = *4 - *1
%endif

    lea       e_reg, [r3+r1*4]    ; move both walkers 4 rows outward
    lea          r3, [r4+r2*4]

    movzx        r4, byte [e_reg+r2  ]
    movzx        r6, byte [r3        ]
    sub          r6, r4
%if ARCH_X86_64
    lea          r6, [r7+r6*2]
    lea          r5, [r5+r6*2]
    add          r5, r6           ; folds weight-3 and weight-6 terms
%else
    lea          r5, [r5+r6*4]
    lea          r5, [r5+r6*2]    ; weight 6 = *4 + *2
%endif

    movzx        r4, byte [e_reg     ]
%if ARCH_X86_64
    movzx        r7, byte [r3   +r2  ]
    sub          r7, r4           ; weight-7 term, combined below
    sub          r5, r7
%else
    movzx        r6, byte [r3   +r2  ]
    sub          r6, r4
    lea          r5, [r5+r6*8]
    sub          r5, r6           ; weight 7 = *8 - *1
%endif

    movzx        r4, byte [e_reg+r1  ]
    movzx        r6, byte [r3   +r2*2]
    sub          r6, r4
%if ARCH_X86_64
    add          r6, r7
%endif
    lea          r5, [r5+r6*8]    ; weight 8

    movzx        r4, byte [e_reg+r2*2]
    movzx        r6, byte [r3   +r1  ]
    sub          r6, r4
    lea          r5, [r5+r6*4]
    add          r5, r6           ; sum of V coefficients

%if ARCH_X86_64 == 0
    mov          r0, r0m          ; restore src clobbered by e_reg
%endif

    ; scale V per codec variant
%ifidn %1, h264
    lea          r5, [r5*5+32]
    sar          r5, 6
%elifidn %1, rv40
    lea          r5, [r5*5]
    sar          r5, 6
%elifidn %1, svq3
    test         r5, r5
    lea          r6, [r5+3]
    cmovs        r5, r6           ; round toward zero for negatives
    sar          r5, 2            ; V/4
    lea          r5, [r5*5]       ; 5*(V/4)
    test         r5, r5
    lea          r6, [r5+15]
    cmovs        r5, r6
    sar          r5, 4            ; (5*(V/4))/16
%endif

    ; a = 16*(left[15] + top[15] + 1) - 7*(V+H), built incrementally below
    movzx        r4, byte [r0+r1  +15]
    movzx        r3, byte [r3+r2*2   ]
    lea          r3, [r3+r4+1]
    shl          r3, 4

    movd        r1d, m0
    movsx       r1d, r1w          ; H sum as signed 16-bit
%ifnidn %1, svq3
%ifidn %1, h264
    lea         r1d, [r1d*5+32]
%else ; rv40
    lea         r1d, [r1d*5]
%endif
    sar         r1d, 6
%else ; svq3
    test        r1d, r1d
    lea         r4d, [r1d+3]
    cmovs       r1d, r4d
    sar         r1d, 2           ; H/4
    lea         r1d, [r1d*5]     ; 5*(H/4)
    test        r1d, r1d
    lea         r4d, [r1d+15]
    cmovs       r1d, r4d
    sar         r1d, 4           ; (5*(H/4))/16
%endif
    movd         m0, r1d

    add         r1d, r5d
    add         r3d, r1d
    shl         r1d, 3
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
    SPLATW       m0, m0, 0        ; H
    SPLATW       m1, m1, 0        ; V
    SPLATW       m3, m3, 0        ; a
%ifidn %1, svq3
    SWAP          0, 1            ; svq3 transposes the plane: swap H and V
%endif
    mova         m2, m0
%if mmsize == 8
    mova         m5, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
%if mmsize == 16
    psllw        m2, 3            ; 8*H, offset for columns 8..15
%else
    psllw        m5, 3
    psllw        m2, 2
    mova         m6, m5
    paddw        m6, m2           ; 12*H offset
%endif
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
    paddw        m2, m0           ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw        m5, m0           ; a + {8,9,10,11}*H
    paddw        m6, m0           ; a + {12,13,14,15}*H
%endif

    ; emit 2 rows per iteration: b>>5, pack with saturation, add V per row
    mov          r4, 8
.loop:
    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova       [r0], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova     [r0+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova    [r0+r2], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova  [r0+r2+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_MMX mmxext
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM sse2
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM ssse3
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
551
552;-----------------------------------------------------------------------------
553; void ff_pred8x8_plane_8(uint8_t *src, int stride)
554;-----------------------------------------------------------------------------
555
; 8x8 plane prediction (H.264 chroma): same three-phase structure as the
; 16x16 version but with 4-tap weights (1..4), scale (x*17+16)>>5, and only
; the H.264 rounding variant. r1 = -stride, r2 = +stride; r5 accumulates V;
; e_reg is r8 on x86-64 and reuses r0 (reloaded from the stack) on x86-32.
%macro H264_PRED8x8_PLANE 0
cglobal pred8x8_plane_8, 2,9,7
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movd         m0, [r0+r1  -1]  ; top row, bytes -1..2
%if mmsize == 8
    pxor         m2, m2
    movh         m1, [r0+r1  +4 ]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m4to4]   ; weights -4..-1
    pmullw       m1, [pw_m4to4+8] ; weights 1..4
%else ; mmsize == 16
%if cpuflag(ssse3)
    movhps       m0, [r0+r1  +4]   ; this reads 4 bytes more than necessary
    pmaddubsw    m0, [plane8_shuf] ; H coefficients
%else ; sse2
    pxor         m2, m2
    movd         m1, [r0+r1  +4]
    punpckldq    m0, m1
    punpcklbw    m0, m2
    pmullw       m0, [pw_m4to4]
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1

    ; horizontal reduction to a single word; ssse3's pmaddubsw already
    ; pre-summed pairs, so it skips one reduction step
%if notcpuflag(ssse3)
%if cpuflag(mmxext)
    PSHUFLW      m1, m0, 0xE
%elif cpuflag(mmx)
    mova         m1, m0
    psrlq        m1, 32
%endif
    paddw        m0, m1
%endif ; !ssse3

%if cpuflag(mmxext)
    PSHUFLW      m1, m0, 0x1
%elif cpuflag(mmx)
    mova         m1, m0
    psrlq        m1, 16
%endif
    paddw        m0, m1           ; sum of H coefficients

    lea          r4, [r0+r2*4-1]  ; r4 -> lower half of the left column
    lea          r3, [r0     -1]  ; r3 -> upper half
    add          r4, r2

%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0
%endif

    ; V = sum of i*(left[3+i] - left[3-i]), i = 1..4, via lea multiplies
    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg        ; weight 1

    movzx     e_reg, byte [r3        ]
%if ARCH_X86_64
    movzx        r7, byte [r4+r2     ]
    sub          r7, e_reg        ; weight-3 term kept in r7
    sub          r5, r7
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6           ; weight 3 = *4 - *1
%endif

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
%if ARCH_X86_64
    add          r6, r7
%endif
    lea          r5, [r5+r6*4]    ; weight 4

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r6, [r5+r6*2]    ; weight 2; total now in r6

    lea          r5, [r6*9+16]    ; V*17 + 16 ...
    lea          r5, [r5+r6*8]
    sar          r5, 5            ; ... >> 5

%if ARCH_X86_64 == 0
    mov          r0, r0m          ; restore src clobbered by e_reg
%endif

    ; a = 16*(left[7] + top[7] + 1) - 3*(V+H)
    movzx        r3, byte [r4+r2*2  ]
    movzx        r4, byte [r0+r1  +7]
    lea          r3, [r3+r4+1]
    shl          r3, 4
    movd        r1d, m0
    movsx       r1d, r1w          ; H sum as signed 16-bit
    imul        r1d, 17
    add         r1d, 16
    sar         r1d, 5
    movd         m0, r1d
    add         r1d, r5d
    sub         r3d, r1d
    add         r1d, r1d
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
    SPLATW       m0, m0, 0        ; H
    SPLATW       m1, m1, 0        ; V
    SPLATW       m3, m3, 0        ; a
%if mmsize == 8
    mova         m2, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw        m2, 2
    paddw        m2, m0           ; a + {4,5,6,7}*H
%endif

    ; emit 2 rows per iteration: b>>5, saturating pack, add V per row
    mov          r4, 4
ALIGN 16
.loop:
%if mmsize == 16
    mova         m3, m0           ; b[0..7]
    paddw        m0, m1
    psraw        m3, 5
    mova         m4, m0           ; V+b[0..7]
    paddw        m0, m1
    psraw        m4, 5
    packuswb     m3, m4
    movh       [r0], m3
    movhps  [r0+r2], m3
%else ; mmsize == 8
    mova         m3, m0           ; b[0..3]
    mova         m4, m2           ; b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m3, 5
    psraw        m4, 5
    mova         m5, m0           ; V+b[0..3]
    mova         m6, m2           ; V+b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m5, 5
    psraw        m6, 5
    packuswb     m3, m4
    packuswb     m5, m6
    mova       [r0], m3
    mova    [r0+r2], m5
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
H264_PRED8x8_PLANE
INIT_MMX mmxext
H264_PRED8x8_PLANE
INIT_XMM sse2
H264_PRED8x8_PLANE
INIT_XMM ssse3
H264_PRED8x8_PLANE
725
726;-----------------------------------------------------------------------------
727; void ff_pred8x8_vertical_8(uint8_t *src, int stride)
728;-----------------------------------------------------------------------------
729
INIT_MMX mmx
; Replicate the row above the block into all 8 rows. r0 = src, r1 = stride.
cglobal pred8x8_vertical_8, 2,2
    sub    r0, r1               ; r0 -> top-neighbor row
    movq  mm0, [r0]             ; the 8-byte top row
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea    r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0         ; rows 7 and 8 (no pointer bump needed)
    movq [r0+r1*2], mm0
    RET
742
743;-----------------------------------------------------------------------------
744; void ff_pred8x8_horizontal_8(uint8_t *src, int stride)
745;-----------------------------------------------------------------------------
746
; Fill each row of the 8x8 block with its left-neighbor byte, two rows per
; iteration. SPLATB_LOAD broadcasts byte 3 of the dword at [addr] (the
; pixel at src-1), using m2 as the pshufb mask on ssse3.
%macro PRED8x8_H 0
cglobal pred8x8_horizontal_8, 2,3
    mov       r2, 4             ; 4 iterations x 2 rows = 8 rows
%if cpuflag(ssse3)
    mova      m2, [pb_3]        ; pshufb mask: broadcast byte 3
%endif
.loop:
    SPLATB_LOAD m0, r0+r1*0-1, m2
    SPLATB_LOAD m1, r0+r1*1-1, m2
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED8x8_H
INIT_MMX mmxext
PRED8x8_H
INIT_MMX ssse3
PRED8x8_H
770
771;-----------------------------------------------------------------------------
772; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, int stride)
773;-----------------------------------------------------------------------------
INIT_MMX mmxext
; Top-only DC for 8x8 chroma: dc0 = avg of top[0..3], dc1 = avg of
; top[4..7]; left 4 columns get dc0, right 4 get dc1, all 8 rows.
; The psrlw-1 / pavgw-0 pair implements (sum+2)>>2 with rounding.
; Address computation (r2/r3/r4 = rows 2/4/6 base) is interleaved with the
; arithmetic to hide latency.
cglobal pred8x8_top_dc_8, 2,5
    sub         r0, r1          ; r0 -> top-neighbor row
    movq       mm0, [r0]
    pxor       mm1, mm1
    pxor       mm2, mm2
    lea         r2, [r0+r1*2]
    punpckhbw  mm1, mm0         ; top[4..7] in high bytes
    punpcklbw  mm0, mm2         ; top[0..3] as words
    psadbw     mm1, mm2        ; s1
    lea         r3, [r2+r1*2]
    psadbw     mm0, mm2        ; s0
    psrlw      mm1, 1
    psrlw      mm0, 1
    pavgw      mm1, mm2         ; (s1+2)>>2 via (s1>>1 + 1)>>1
    lea         r4, [r3+r1*2]
    pavgw      mm0, mm2
    pshufw     mm1, mm1, 0
    pshufw     mm0, mm0, 0     ; dc0 (w)
    packuswb   mm0, mm1        ; dc0,dc1 (b)
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea         r0, [r3+r1*2]   ; rows 6/7 base
    movq [r2+r1*1], mm0
    movq [r2+r1*2], mm0
    movq [r3+r1*1], mm0
    movq [r3+r1*2], mm0
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET
804
805;-----------------------------------------------------------------------------
806; void ff_pred8x8_dc_8_mmxext(uint8_t *src, int stride)
807;-----------------------------------------------------------------------------
808
INIT_MMX mmxext
; Full H.264 chroma 8x8 DC: the block is split into four 4x4 quadrants.
; s0/s1 = sums of the left/right top halves (psadbw), s2/s3 = sums of the
; upper/lower left halves (scalar adds). Per the spec, the top-left and
; bottom-right quadrants use (top+left) sums, the other two use only their
; directly adjacent edge — hence the pshufw lane juggling below.
cglobal pred8x8_dc_8, 2,5
    sub       r0, r1            ; r0 -> top-neighbor row
    pxor      m7, m7
    movd      m0, [r0+0]
    movd      m1, [r0+4]
    psadbw    m0, m7            ; s0
    mov       r4, r0            ; keep block origin (minus stride)
    psadbw    m1, m7            ; s1

    ; s2 = sum of left[0..3]
    movzx    r2d, byte [r0+r1*1-1]
    movzx    r3d, byte [r0+r1*2-1]
    lea       r0, [r0+r1*2]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*1-1]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*2-1]
    add      r2d, r3d
    lea       r0, [r0+r1*2]
    movd      m2, r2d            ; s2
    ; s3 = sum of left[4..7]
    movzx    r2d, byte [r0+r1*1-1]
    movzx    r3d, byte [r0+r1*2-1]
    lea       r0, [r0+r1*2]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*1-1]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*2-1]
    add      r2d, r3d
    movd      m3, r2d            ; s3

    punpcklwd m0, m1
    mov       r0, r4            ; back to the block origin
    punpcklwd m2, m3
    punpckldq m0, m2            ; s0, s1, s2, s3
    pshufw    m3, m0, 11110110b ; s2, s1, s3, s3
    lea       r2, [r0+r1*2]
    pshufw    m0, m0, 01110100b ; s0, s1, s3, s1
    paddw     m0, m3
    lea       r3, [r2+r1*2]
    psrlw     m0, 2
    pavgw     m0, m7            ; s0+s2, s1, s3, s1+s3  (rounded >>2 / >>3)
    lea       r4, [r3+r1*2]
    packuswb  m0, m0
    punpcklbw m0, m0            ; duplicate each dc to 2 bytes...
    movq      m1, m0
    punpcklbw m0, m0            ; ...then to 4: m0 = top-row pattern
    punpckhbw m1, m1            ; m1 = bottom-row pattern
    movq [r0+r1*1], m0
    movq [r0+r1*2], m0
    movq [r2+r1*1], m0
    movq [r2+r1*2], m0
    movq [r3+r1*1], m1
    movq [r3+r1*2], m1
    movq [r4+r1*1], m1
    movq [r4+r1*2], m1
    RET
865
866;-----------------------------------------------------------------------------
867; void ff_pred8x8_dc_rv40_8(uint8_t *src, int stride)
868;-----------------------------------------------------------------------------
869
INIT_MMX mmxext
; RV40-style 8x8 DC: a single dc over all 8 top + 8 left neighbors,
; dc = (sum + 8) >> 4, broadcast to the whole block.
; r5d/r6d are two interleaved partial sums of the left column (r6d is
; seeded with the psadbw top-row sum).
cglobal pred8x8_dc_rv40_8, 2,7
    mov       r4, r0            ; keep src for the store loop
    sub       r0, r1            ; r0 -> top-neighbor row
    pxor      mm0, mm0
    psadbw    mm0, [r0]         ; sum of the 8 top bytes
    dec        r0               ; r0 -> one byte left of the top row
    movzx     r5d, byte [r0+r1*1]
    movd      r6d, mm0
    lea        r0, [r0+r1*2]
%rep 3
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0] ; 8th left pixel
    add       r5d, r6d
    lea       r2d, [r2+r5+8]    ; total + rounding bias
    shr       r2d, 4            ; /16
    movd      mm0, r2d
    punpcklbw mm0, mm0          ; broadcast dc byte...
    pshufw    mm0, mm0, 0       ; ...across the whole register
    mov       r3d, 4            ; 4 iterations x 2 rows
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET
902
903;-----------------------------------------------------------------------------
904; void ff_pred8x8_tm_vp8_8(uint8_t *src, int stride)
905;-----------------------------------------------------------------------------
906
; VP8 TrueMotion 8x8: pred[y][x] = clip(top[x] + left[y] - topleft).
; Top row unpacked once to words in mm0/mm1; two rows produced per
; iteration; packuswb performs the clip.
%macro PRED8x8_TM 0
cglobal pred8x8_tm_vp8_8, 2,6
    sub        r0, r1           ; r0 -> top-neighbor row
    pxor      mm7, mm7
    movq      mm0, [r0]
    movq      mm1, mm0
    punpcklbw mm0, mm7          ; top[0..3] as words
    punpckhbw mm1, mm7          ; top[4..7] as words
    movzx     r4d, byte [r0-1]  ; topleft
    mov       r5d, 4            ; 4 iterations x 2 rows
.loop:
    movzx     r2d, byte [r0+r1*1-1] ; left pixel, first row
    movzx     r3d, byte [r0+r1*2-1] ; left pixel, second row
    sub       r2d, r4d          ; left - topleft
    sub       r3d, r4d
    movd      mm2, r2d
    movd      mm4, r3d
    SPLATW    mm2, mm2, 0       ; broadcast deltas
    SPLATW    mm4, mm4, 0
    movq      mm3, mm2
    movq      mm5, mm4
    paddw     mm2, mm0
    paddw     mm3, mm1
    paddw     mm4, mm0
    paddw     mm5, mm1
    packuswb  mm2, mm3          ; saturating pack = clip to [0,255]
    packuswb  mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea        r0, [r0+r1*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
PRED8x8_TM
INIT_MMX mmxext
PRED8x8_TM
946
INIT_XMM sse2
; SSE2 TrueMotion 8x8: the full 8-pixel top row fits in one xmm as words,
; so both rows of an iteration are packed into a single register and
; stored via movq/movhps.
cglobal pred8x8_tm_vp8_8, 2,6,4
    sub          r0, r1         ; r0 -> top-neighbor row
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1       ; top[0..7] as words
    movzx       r4d, byte [r0-1] ; topleft
    mov         r5d, 4          ; 4 iterations x 2 rows
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d        ; left - topleft
    sub         r3d, r4d
    movd       xmm2, r2d
    movd       xmm3, r3d
    pshuflw    xmm2, xmm2, 0    ; broadcast to low 4 words...
    pshuflw    xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2       ; ...then to all 8
    punpcklqdq xmm3, xmm3
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3       ; row0 in low half, row1 in high half
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET
975
INIT_XMM ssse3
; SSSE3 TrueMotion 8x8: tm_shuf broadcasts byte 3 of a 4-byte load (the
; neighbor at src-1) directly into 8 zero-extended words via pshufb, so
; the per-row left pixel and the topleft (xmm5) stay in the SIMD unit.
cglobal pred8x8_tm_vp8_8, 2,3,6
    sub          r0, r1         ; r0 -> top-neighbor row
    movdqa     xmm4, [tm_shuf]
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1       ; top[0..7] as words
    movd       xmm5, [r0-4]
    pshufb     xmm5, xmm4       ; topleft broadcast to 8 words
    mov         r2d, 4          ; 4 iterations x 2 rows
.loop:
    movd       xmm2, [r0+r1*1-4]
    movd       xmm3, [r0+r1*2-4]
    pshufb     xmm2, xmm4       ; broadcast left pixel of each row
    pshufb     xmm3, xmm4
    psubw      xmm2, xmm5       ; left - topleft
    psubw      xmm3, xmm5
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3       ; row0 low half, row1 high half; clips
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r2d
    jg .loop
    REP_RET
1002
1003; dest, left, right, src, tmp
1004; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
1005%macro PRED4x4_LOWPASS 5
1006    mova    %5, %2
1007    pavgb   %2, %3
1008    pxor    %3, %5
1009    mova    %1, %4
1010    pand    %3, [pb_1]
1011    psubusb %2, %3
1012    pavgb   %1, %2
1013%endmacro
1014
1015;-----------------------------------------------------------------------------
1016; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright,
1017;                           int stride)
1018;-----------------------------------------------------------------------------
; 8x8 luma top-DC: lowpass-filter the 8 top neighbors (with edge fixes
; when topleft/topright are unavailable), then dc = (sum + 4) >> 3,
; broadcast to all 8 rows.
; mm2/mm1 = top row shifted right/left by one byte (the t[n-1]/t[n+1]
; taps); the .fix_* paths replicate the edge byte into the missing tap.
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_8, 4,4
    sub          r0, r3         ; r0 -> top-neighbor row
    pxor        mm7, mm7
    movq        mm0, [r0-8]
    movq        mm3, [r0]       ; t[0..7]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0 ; t[-1..6]
    PALIGNR     mm1, mm4, 1, mm4 ; t[1..8]
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    ; no topleft: replace t[-1] with t[0]
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    ; no topright: replace t[8] with t[7]
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    psadbw   mm7, mm0           ; sum of the 8 filtered bytes
    paddw    mm7, [pw_4]        ; rounding bias
    psrlw    mm7, 3
    pshufw   mm7, mm7, 0
    packuswb mm7, mm7           ; dc byte replicated 8x
%rep 3
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_TOP_DC
INIT_MMX ssse3
PRED8x8L_TOP_DC
1070
1071;-----------------------------------------------------------------------------
1072; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright,
1073;                       int stride)
1074;-----------------------------------------------------------------------------
1075
1076%macro PRED8x8L_DC 0
1077cglobal pred8x8l_dc_8, 4,5
1078    sub          r0, r3
1079    lea          r4, [r0+r3*2]
1080    movq        mm0, [r0+r3*1-8]
1081    punpckhbw   mm0, [r0+r3*0-8]
1082    movq        mm1, [r4+r3*1-8]
1083    punpckhbw   mm1, [r0+r3*2-8]
1084    mov          r4, r0
1085    punpckhwd   mm1, mm0
1086    lea          r0, [r0+r3*4]
1087    movq        mm2, [r0+r3*1-8]
1088    punpckhbw   mm2, [r0+r3*0-8]
1089    lea          r0, [r0+r3*2]
1090    movq        mm3, [r0+r3*1-8]
1091    punpckhbw   mm3, [r0+r3*0-8]
1092    punpckhwd   mm3, mm2
1093    punpckhdq   mm3, mm1
1094    lea          r0, [r0+r3*2]
1095    movq        mm0, [r0+r3*0-8]
1096    movq        mm1, [r4]
1097    mov          r0, r4
1098    movq        mm4, mm3
1099    movq        mm2, mm3
1100    PALIGNR     mm4, mm0, 7, mm0
1101    PALIGNR     mm1, mm2, 1, mm2
1102    test        r1, r1
1103    jnz .do_left
1104.fix_lt_1:
1105    movq        mm5, mm3
1106    pxor        mm5, mm4
1107    psrlq       mm5, 56
1108    psllq       mm5, 48
1109    pxor        mm1, mm5
1110    jmp .do_left
1111.fix_lt_2:
1112    movq        mm5, mm3
1113    pxor        mm5, mm2
1114    psllq       mm5, 56
1115    psrlq       mm5, 56
1116    pxor        mm2, mm5
1117    test         r2, r2
1118    jnz .body
1119.fix_tr_1:
1120    movq        mm5, mm3
1121    pxor        mm5, mm1
1122    psrlq       mm5, 56
1123    psllq       mm5, 56
1124    pxor        mm1, mm5
1125    jmp .body
1126.do_left:
1127    movq        mm0, mm4
1128    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1129    movq        mm4, mm0
1130    movq        mm7, mm2
1131    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1132    psllq       mm1, 56
1133    PALIGNR     mm7, mm1, 7, mm3
1134    movq        mm0, [r0-8]
1135    movq        mm3, [r0]
1136    movq        mm1, [r0+8]
1137    movq        mm2, mm3
1138    movq        mm4, mm3
1139    PALIGNR     mm2, mm0, 7, mm0
1140    PALIGNR     mm1, mm4, 1, mm4
1141    test         r1, r1
1142    jz .fix_lt_2
1143    test         r2, r2
1144    jz .fix_tr_1
1145.body:
1146    lea          r1, [r0+r3*2]
1147    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1148    pxor        mm0, mm0
1149    pxor        mm1, mm1
1150    lea          r2, [r1+r3*2]
1151    psadbw      mm0, mm7
1152    psadbw      mm1, mm6
1153    paddw       mm0, [pw_8]
1154    paddw       mm0, mm1
1155    lea          r4, [r2+r3*2]
1156    psrlw       mm0, 4
1157    pshufw      mm0, mm0, 0
1158    packuswb    mm0, mm0
1159    movq [r0+r3*1], mm0
1160    movq [r0+r3*2], mm0
1161    movq [r1+r3*1], mm0
1162    movq [r1+r3*2], mm0
1163    movq [r2+r3*1], mm0
1164    movq [r2+r3*2], mm0
1165    movq [r4+r3*1], mm0
1166    movq [r4+r3*2], mm0
1167    RET
1168%endmacro
1169
; Instantiate the PRED8x8L_DC macro (defined above) once per ISA level;
; INIT_MMX selects the cpu-name suffix used by cglobal (see x86inc).
INIT_MMX mmxext
PRED8x8L_DC
INIT_MMX ssse3
PRED8x8L_DC
1174
1175;-----------------------------------------------------------------------------
1176; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
1177;                               int has_topright, int stride)
1178;-----------------------------------------------------------------------------
1179
%macro PRED8x8L_HORIZONTAL 0
; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
;                               int has_topright, int stride)
; r0 = src, r1 = has_topleft, r2 = has_topright (unused here), r3 = stride.
; Horizontal prediction: lowpass-filter the left-neighbour column, then
; fill each output row with its filtered left pixel.
cglobal pred8x8l_horizontal_8, 4,4
    sub          r0, r3              ; r0 -> row -1 (line above the block)
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    test         r1, r1
    lea          r1, [r0+r3]
    cmovnz       r1, r0              ; r1 -> row -1 if has_topleft, else row 0
    ; Gather the byte at x = -1 of eight consecutive rows into mm3: each
    ; movq at offset -8 ends on the left-neighbour byte, and the
    ; punpckhbw/punpckhwd/punpckhdq ladder packs those high bytes together.
    punpckhbw   mm0, [r1+r3*0-8]
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1             ; mm3 = left-edge column, one byte per row
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r1+r3*0-8]
    mov          r0, r2              ; restore r0 -> row -1
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0     ; column shifted by one (brings in byte of mm0)
    PALIGNR     mm1, mm2, 1, mm2     ; column shifted the other way
    ; 3-tap lowpass of the left column against its two shifted copies;
    ; PRED4x4_LOWPASS is a helper macro defined earlier in this file.
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3     ; mm7 = filtered left column
    movq        mm3, mm7
    lea         r1, [r0+r3*2]
    movq       mm7, mm3
    ; Broadcast each filtered left pixel across one 8-byte row: duplicate
    ; bytes into words, then pshufw replicates one word four times.
    punpckhbw  mm3, mm3
    punpcklbw  mm7, mm7
    pshufw     mm0, mm3, 0xff
    pshufw     mm1, mm3, 0xaa
    lea         r2, [r1+r3*2]
    pshufw     mm2, mm3, 0x55
    pshufw     mm3, mm3, 0x00
    pshufw     mm4, mm7, 0xff
    pshufw     mm5, mm7, 0xaa
    pshufw     mm6, mm7, 0x55
    pshufw     mm7, mm7, 0x00
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm1
    movq [r1+r3*1], mm2
    movq [r1+r3*2], mm3
    movq [r2+r3*1], mm4
    movq [r2+r3*2], mm5
    lea         r0, [r2+r3*2]
    movq [r0+r3*1], mm6
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_HORIZONTAL
INIT_MMX ssse3
PRED8x8L_HORIZONTAL
1246
1247;-----------------------------------------------------------------------------
1248; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright,
1249;                             int stride)
1250;-----------------------------------------------------------------------------
1251
%macro PRED8x8L_VERTICAL 0
; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright,
;                             int stride)
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
; Vertical prediction: lowpass-filter the top-neighbour row and copy the
; result into all 8 output rows.
cglobal pred8x8l_vertical_8, 4,4
    sub          r0, r3              ; r0 -> row -1 (top neighbours)
    movq        mm0, [r0-8]          ; 8 bytes left of the top row (top-left in byte 7)
    movq        mm3, [r0]            ; top row t0..t7
    movq        mm1, [r0+8]          ; 8 bytes right of the top row (top-right in byte 0)
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0     ; mm2 = tl t0..t6 (top row shifted, top-left in byte 0)
    PALIGNR     mm1, mm4, 1, mm4     ; mm1 = t1..t7 tr (top-right in byte 7)
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    ; has_topleft == 0: splice byte 0 of mm3 into mm2, replacing the
    ; unavailable top-left sample (xor isolates the single differing byte).
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56              ; keep only byte 0 of the xor difference
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    ; has_topright == 0: splice byte 7 of mm3 into mm1, repeating the last
    ; available top pixel in place of the missing top-right sample.
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56              ; keep only byte 7 of the xor difference
    pxor        mm1, mm5
.body:
    ; 3-tap lowpass across the top row, then store it to all 8 rows.
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_VERTICAL
INIT_MMX ssse3
PRED8x8L_VERTICAL
1297
1298;-----------------------------------------------------------------------------
1299; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
1300;                              int has_topright, int stride)
1301;-----------------------------------------------------------------------------
1302
INIT_MMX mmxext
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride (r4 = scratch).
; Diagonal down-left prediction built from the filtered top row and the
; filtered top-right row (MMX version: the 16-byte diagonal is kept in
; the mm0/mm1 register pair).
cglobal pred8x8l_down_left_8, 4,5
    sub          r0, r3              ; r0 -> row -1 (top neighbours)
    movq        mm0, [r0-8]          ; bytes left of the top row (top-left in byte 7)
    movq        mm3, [r0]            ; top row t0..t7
    movq        mm1, [r0+8]          ; bytes right of the top row (top-right in byte 0)
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0     ; tl t0..t6
    PALIGNR     mm1, mm4, 1, mm4     ; t1..t7 tr
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    ; has_topleft == 0: splice byte 0 of mm3 into mm2 (fake top-left).
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    ; has_topright == 0: splice byte 7 of mm3 into mm1 (repeat last top pixel).
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    ; No top-right block at all: replicate the last top pixel into all
    ; 8 bytes of mm1 instead of filtering real top-right samples.
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq        mm7, mm4             ; mm7 = filtered top row
    test         r2, r2
    jz .fix_tr_2
    ; Lowpass-filter the 8 real top-right pixels into mm1.
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    ; Filter across the mm7|mm1 (top|top-right) boundary to get the
    ; 16-byte diagonal in mm0 (low half) / mm1 (high half), then emit the
    ; 8 rows bottom-up, advancing the window one byte per row via
    ; psllq/psrlq/por across the register pair.
    lea          r1, [r0+r3*2]
    movq        mm6, mm1
    psrlq       mm1, 56
    movq        mm4, mm1
    lea          r2, [r1+r3*2]
    movq        mm2, mm6
    PALIGNR     mm2, mm7, 1, mm0
    movq        mm3, mm6
    PALIGNR     mm3, mm7, 7, mm0
    PALIGNR     mm4, mm6, 1, mm0
    movq        mm5, mm7
    movq        mm1, mm7
    movq        mm7, mm6
    lea          r4, [r2+r3*2]
    psllq       mm1, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq  [r4+r3*2], mm1             ; bottom row first
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2             ; shift top byte of mm0 into mm1
    movq  [r4+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r2+r3*2], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r2+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r1+r3*2], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r1+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r0+r3*2], mm1
    psllq       mm1, 8
    psrlq       mm0, 56
    por         mm1, mm0             ; last shift consumes the final byte of mm0
    movq  [r0+r3*1], mm1
    RET
1410
%macro PRED8x8L_DOWN_LEFT 0
; SSE2/SSSE3 down-left prediction: the top and top-right rows are filtered
; in MMX registers, then combined into one 16-byte XMM source so a single
; PRED4x4_LOWPASS produces the whole diagonal.
cglobal pred8x8l_down_left_8, 4,4
    sub          r0, r3              ; r0 -> row -1 (top neighbours)
    movq        mm0, [r0-8]          ; bytes left of the top row (top-left in byte 7)
    movq        mm3, [r0]            ; top row t0..t7
    movq        mm1, [r0+8]          ; bytes right of the top row (top-right in byte 0)
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0     ; tl t0..t6
    PALIGNR     mm1, mm4, 1, mm4     ; t1..t7 tr
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    ; has_topleft == 0: splice byte 0 of mm3 into mm2 (fake top-left).
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    ; has_topright == 0: splice byte 7 of mm3 into mm1 (repeat last top pixel).
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    ; No top-right block: replicate the last top pixel into all 8 bytes.
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm3, mm4             ; filtered top row -> low half of xmm3
    test         r2, r2 ; top_right
    jz .fix_tr_2
    ; Lowpass-filter the 8 real top-right pixels into mm1.
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    ; Assemble the 16-byte source: xmm3 = top || top-right, xmm2/xmm1 are
    ; the one-byte-shifted neighbours (xmm2 padded with the last pixel).
    movq2dq    xmm4, mm1
    psrlq       mm1, 56
    movq2dq    xmm5, mm1             ; last top-right pixel, used as padding
    lea         r1, [r0+r3*2]
    pslldq    xmm4, 8
    por       xmm3, xmm4
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1
    pslldq    xmm5, 15
    por       xmm2, xmm5
    lea         r2, [r1+r3*2]
    movdqa    xmm1, xmm3
    pslldq    xmm1, 1
INIT_XMM cpuname                     ; switch x86inc to XMM mode so the macro below expands with SSE regs
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    ; Each output row is the 16-byte diagonal shifted one more byte.
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    psrldq    xmm0, 1
    lea         r0, [r2+r3*2]
    movq [r1+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r1+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_DOWN_LEFT
INIT_MMX ssse3
PRED8x8L_DOWN_LEFT
1498
1499;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right_8(uint8_t *src, int has_topleft,
;                               int has_topright, int stride)
1502;-----------------------------------------------------------------------------
1503
INIT_MMX mmxext
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride (r4 = scratch).
; Diagonal down-right prediction from the filtered left column, top-left
; sample and top row (MMX version).
cglobal pred8x8l_down_right_8, 4,5
    sub          r0, r3              ; r0 -> row -1 (top neighbours)
    lea          r4, [r0+r3*2]
    ; Gather the byte at x = -1 of the 8 left-neighbour rows into mm3 via
    ; the punpckhbw/punpckhwd/punpckhdq ladder (loads at offset -8 end on
    ; the left-neighbour byte).
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1             ; mm3 = left-edge column, one byte per row
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]            ; top row (r4 still points at row -1)
    mov          r0, r4              ; restore r0 -> row -1
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1 ; top_left
    jz .fix_lt_1
.do_left:
    ; Lowpass-filter the left column; mm6/mm7 keep the filtered results
    ; for the .body stage below.
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq        mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    ; Now load and (below) filter the top row.
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq        mm5, mm4             ; mm5 = filtered top row
    jmp .body
.fix_lt_1:
    ; has_topleft == 0: patch the top-left-dependent byte of mm1
    ; (isolate byte 7 of mm3^mm4, reposition it with psrlq/psllq).
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    ; has_topleft == 0: splice byte 0 of mm3 into mm2 (fake top-left).
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    ; has_topright == 0: splice byte 7 of mm3 into mm1 (repeat last top pixel).
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.body:
    ; Build the 16-byte diagonal source from filtered left (mm6), the
    ; top-left corner and filtered top (mm5->mm7), lowpass it into the
    ; mm0/mm1 pair, then emit the 8 rows bottom-up, advancing one byte
    ; per row with psrlq/psllq/por across the pair.
    lea         r1, [r0+r3*2]
    movq       mm1, mm7
    movq       mm7, mm5
    movq       mm5, mm6
    movq       mm2, mm7
    lea         r2, [r1+r3*2]
    PALIGNR    mm2, mm6, 1, mm0
    movq       mm3, mm7
    PALIGNR    mm3, mm6, 7, mm0
    movq       mm4, mm7
    lea         r4, [r2+r3*2]
    psrlq      mm4, 8
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    movq [r4+r3*2], mm0              ; bottom row first
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2              ; shift low byte of mm1 into mm0
    movq [r4+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r2+r3*2], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r2+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r1+r3*2], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r1+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r0+r3*2], mm0
    psrlq      mm0, 8
    psllq      mm1, 56
    por        mm0, mm1              ; last shift consumes the final byte of mm1
    movq [r0+r3*1], mm0
    RET
1635
%macro PRED8x8L_DOWN_RIGHT 0
; SSE2/SSSE3 down-right prediction: left column and top row are filtered in
; MMX registers, then combined into 16-byte XMM sources so one
; PRED4x4_LOWPASS yields the whole diagonal.
cglobal pred8x8l_down_right_8, 4,5
    sub          r0, r3              ; r0 -> row -1 (top neighbours)
    lea          r4, [r0+r3*2]
    ; Gather the byte at x = -1 of the 8 left-neighbour rows into mm3.
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1             ; mm3 = left-edge column, one byte per row
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4              ; restore r0 -> row -1
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    ; has_topleft == 0: patch the top-left-dependent byte of mm1.
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    ; has_topleft == 0: splice byte 0 of mm3 into mm2 (fake top-left).
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    ; has_topright == 0: splice byte 7 of mm3 into mm1 (repeat last top pixel).
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    ; Lowpass-filter the left column; stash results in xmm3/xmm1 for the
    ; XMM assembly stage after .do_top.
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq2dq    xmm3, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq2dq    xmm1, mm7
    ; Load the top row and (below) filter it.
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq   xmm4, mm4              ; filtered top row
    lea         r1, [r0+r3*2]
    ; Build xmm3 = left || top and xmm1/xmm2 = its one-byte-shifted
    ; neighbours (xmm1 patched with the corner byte from xmm0).
    movdqa    xmm0, xmm3
    pslldq    xmm4, 8
    por       xmm3, xmm4
    lea         r2, [r1+r3*2]
    pslldq    xmm4, 1
    por       xmm1, xmm4
    psrldq    xmm0, 7
    pslldq    xmm0, 15
    psrldq    xmm0, 7
    por       xmm1, xmm0
    lea         r0, [r2+r3*2]
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1
INIT_XMM cpuname                     ; switch x86inc to XMM mode so the macro below expands with SSE regs
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    ; Two staggered copies (xmm0/xmm1) let each store pair share a shift.
    movdqa    xmm1, xmm0
    psrldq    xmm1, 1
    movq [r0+r3*2], xmm0
    movq [r0+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r2+r3*2], xmm0
    movq [r2+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r1+r3*2], xmm0
    movq [r1+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r4+r3*2], xmm0
    movq [r4+r3*1], xmm1
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_DOWN_RIGHT
INIT_MMX ssse3
PRED8x8L_DOWN_RIGHT
1750
1751;-----------------------------------------------------------------------------
1752; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft,
1753;                                   int has_topright, int stride)
1754;-----------------------------------------------------------------------------
1755
INIT_MMX mmxext
; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride (r4 = scratch).
; Vertical-right prediction: even rows come from pavgb of the shifted top
; edge, odd rows from a lowpass filter; lower rows are realigned against
; the filtered left column (MMX version).
cglobal pred8x8l_vertical_right_8, 4,5
    sub          r0, r3              ; r0 -> row -1 (top neighbours)
    lea          r4, [r0+r3*2]
    ; Gather the byte at x = -1 of the 8 left-neighbour rows into mm3.
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1             ; mm3 = left-edge column, one byte per row
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4              ; restore r0 -> row -1
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    ; has_topleft == 0: patch the top-left-dependent byte of mm1.
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    ; has_topleft == 0: splice byte 0 of mm3 into mm2 (fake top-left).
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    ; has_topright == 0: splice byte 7 of mm3 into mm1 (repeat last top pixel).
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    ; Lowpass-filter the left column; mm7 keeps it for the .do_top stage.
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm7, mm2
    ; Load the top row and (below) filter it.
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5   ; mm6 = filtered top row
    lea         r1, [r0+r3*2]
    movq       mm2, mm6
    movq       mm3, mm6
    PALIGNR    mm3, mm7, 7, mm0      ; top edge shifted in by 1 (corner included)
    PALIGNR    mm6, mm7, 6, mm1      ; top edge shifted in by 2
    movq       mm4, mm3
    pavgb      mm3, mm2              ; mm3 = row 0 (average of adjacent pixels)
    lea         r2, [r1+r3*2]
    PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5   ; mm0 = row 1 (3-tap filter)
    movq [r0+r3*1], mm3
    movq [r0+r3*2], mm0
    ; Rows 2..7: filter the left column (mm7) and shift one of its bytes
    ; into rows 0/1 alternately via PALIGNR.
    movq       mm5, mm0
    movq       mm6, mm3
    movq       mm1, mm7
    movq       mm2, mm1
    psllq      mm2, 8
    movq       mm3, mm1
    psllq      mm3, 16
    lea         r4, [r2+r3*2]
    PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
    PALIGNR    mm6, mm0, 7, mm2
    movq [r1+r3*1], mm6
    psllq      mm0, 8
    PALIGNR    mm5, mm0, 7, mm1
    movq [r1+r3*2], mm5
    psllq      mm0, 8
    PALIGNR    mm6, mm0, 7, mm2
    movq [r2+r3*1], mm6
    psllq      mm0, 8
    PALIGNR    mm5, mm0, 7, mm1
    movq [r2+r3*2], mm5
    psllq      mm0, 8
    PALIGNR    mm6, mm0, 7, mm2
    movq [r4+r3*1], mm6
    psllq      mm0, 8
    PALIGNR    mm5, mm0, 7, mm1
    movq [r4+r3*2], mm5
    RET
1862
%macro PRED8x8L_VERTICAL_RIGHT 0
; SSE2/SSSE3 vertical-right prediction: the filtered left column and top
; row are merged into one 16-byte XMM source; even rows come from pavgb,
; odd rows from PRED4x4_LOWPASS, with pw_ff00 masking used to interleave
; the odd-row bytes.
cglobal pred8x8l_vertical_right_8, 4,5,7
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM 7
    sub          r0, r3              ; r0 -> row -1 (top neighbours)
    lea          r4, [r0+r3*2]
    ; Gather the byte at x = -1 of the 8 left-neighbour rows into mm3.
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1             ; mm3 = left-edge column, one byte per row
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4              ; restore r0 -> row -1
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    ; has_topleft == 0: patch the top-left-dependent byte of mm1.
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    ; has_topleft == 0: splice byte 0 of mm3 into mm2 (fake top-left).
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    ; has_topright == 0: splice byte 7 of mm3 into mm1 (repeat last top pixel).
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    ; Lowpass-filter the left column into the low half of xmm0.
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq    xmm0, mm2
    ; Load the top row and (below) filter it.
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    lea           r1, [r0+r3*2]
    ; xmm0 = left || top; xmm2 = pavgb rows, xmm4 = lowpass rows.
    movq2dq     xmm4, mm6
    pslldq      xmm4, 8
    por         xmm0, xmm4
    movdqa      xmm6, [pw_ff00]
    movdqa      xmm1, xmm0
    lea           r2, [r1+r3*2]
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    pslldq      xmm0, 1
    pslldq      xmm1, 2
    pavgb       xmm2, xmm0
INIT_XMM cpuname                     ; switch x86inc to XMM mode so the macro below expands with SSE regs
    PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
    ; Mask/pack the lowpass result with pw_ff00 so its bytes can be
    ; shifted into the pavgb rows for the lower half of the block.
    pandn       xmm6, xmm4
    movdqa      xmm5, xmm4
    psrlw       xmm4, 8
    packuswb    xmm6, xmm4
    movhlps     xmm4, xmm6
    movhps [r0+r3*2], xmm5
    movhps [r0+r3*1], xmm2
    psrldq      xmm5, 4
    movss       xmm5, xmm6
    psrldq      xmm2, 4
    movss       xmm2, xmm4
    lea           r0, [r2+r3*2]
    ; Remaining rows: shift one more byte out per row pair.
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r0+r3*2], xmm5
    movq        [r0+r3*1], xmm2
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r2+r3*2], xmm5
    movq        [r2+r3*1], xmm2
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r1+r3*2], xmm5
    movq        [r1+r3*1], xmm2
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_VERTICAL_RIGHT
INIT_MMX ssse3
PRED8x8L_VERTICAL_RIGHT
1978
1979;-----------------------------------------------------------------------------
1980; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft,
1981;                                  int has_topright, int stride)
1982;-----------------------------------------------------------------------------
1983
%macro PRED8x8L_VERTICAL_LEFT 0
; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft,
;                                  int has_topright, int stride)
; Vertical-left prediction (SSE2/SSSE3): filtered top || top-right form a
; 16-byte XMM source; even rows use pavgb, odd rows use the lowpass
; filter, and each row pair shifts the source one more byte.
cglobal pred8x8l_vertical_left_8, 4,4
    sub          r0, r3              ; r0 -> row -1 (top neighbours)
    movq        mm0, [r0-8]          ; bytes left of the top row (top-left in byte 7)
    movq        mm3, [r0]            ; top row t0..t7
    movq        mm1, [r0+8]          ; bytes right of the top row (top-right in byte 0)
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0     ; tl t0..t6
    PALIGNR     mm1, mm4, 1, mm4     ; t1..t7 tr
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    ; has_topleft == 0: splice byte 0 of mm3 into mm2 (fake top-left).
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    ; has_topright == 0: splice byte 7 of mm3 into mm1 (repeat last top pixel).
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    ; No top-right block: replicate the last top pixel into all 8 bytes.
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm4, mm4             ; filtered top row -> low half of xmm4
    test         r2, r2
    jz .fix_tr_2
    ; Lowpass-filter the 8 real top-right pixels into mm1.
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    ; xmm4 = top || top-right; xmm3 = pavgb rows, xmm0 = lowpass rows.
    movq2dq   xmm3, mm1
    lea         r1, [r0+r3*2]
    pslldq    xmm3, 8
    por       xmm4, xmm3
    movdqa    xmm2, xmm4
    movdqa    xmm1, xmm4
    movdqa    xmm3, xmm4
    psrldq    xmm2, 1
    pslldq    xmm1, 1
    pavgb     xmm3, xmm2
    lea         r2, [r1+r3*2]
INIT_XMM cpuname                     ; switch x86inc to XMM mode so the macro below expands with SSE regs
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
    psrldq    xmm0, 1
    ; Alternate pavgb (xmm3) and lowpass (xmm0) rows, shifting both one
    ; byte per row pair.
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    lea         r0, [r2+r3*2]
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r1+r3*1], xmm3
    movq [r1+r3*2], xmm0
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r2+r3*1], xmm3
    movq [r2+r3*2], xmm0
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_VERTICAL_LEFT
INIT_MMX ssse3
PRED8x8L_VERTICAL_LEFT
2068
2069;-----------------------------------------------------------------------------
2070; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
2071;                                  int has_topright, int stride)
2072;-----------------------------------------------------------------------------
2073
%macro PRED8x8L_HORIZONTAL_UP 0
; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
;                                  int has_topright, int stride)
; Horizontal-up prediction: built purely from the filtered left column;
; each output pixel pair is an average / lowpass of successive left
; pixels, with the last pixel replicated at the bottom.
cglobal pred8x8l_horizontal_up_8, 4,4
    sub          r0, r3              ; r0 -> row -1 (line above the block)
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    test         r1, r1
    lea          r1, [r0+r3]
    cmovnz       r1, r0              ; r1 -> row -1 if has_topleft, else row 0
    ; Gather the byte at x = -1 of eight consecutive rows into mm3 via the
    ; punpckhbw/punpckhwd/punpckhdq ladder.
    punpckhbw   mm0, [r1+r3*0-8]
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1             ; mm3 = left-edge column, one byte per row
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r1+r3*0-8]
    mov          r0, r2              ; restore r0 -> row -1
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    ; Lowpass-filter the left column into mm7.
    movq       mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq       mm4, mm0
    movq       mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq      mm1, 56
    PALIGNR    mm7, mm1, 7, mm3
    lea         r1, [r0+r3*2]
    ; Reverse the byte order of mm7 (word shuffle + byte swap within words)
    ; and build the shifted copies needed by the averaging filters.
    pshufw     mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq      mm7, 56             ; l7 .. .. .. .. .. .. ..
    movq       mm2, mm0
    psllw      mm0, 8
    psrlw      mm2, 8
    por        mm2, mm0            ; l7 l6 l5 l4 l3 l2 l1 l0
    movq       mm3, mm2
    movq       mm4, mm2
    movq       mm5, mm2
    psrlq      mm2, 8
    psrlq      mm3, 16
    lea         r2, [r1+r3*2]
    por        mm2, mm7            ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw  mm7, mm7
    por        mm3, mm7            ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb      mm4, mm2            ; 2-tap averages
    PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6   ; 3-tap filtered values
    ; Interleave average/filter results into the output pixel stream,
    ; then derive each row by realigning (lower rows repeat l7).
    movq       mm5, mm4
    punpcklbw  mm4, mm1            ; p4 p3 p2 p1
    punpckhbw  mm5, mm1            ; p8 p7 p6 p5
    movq       mm6, mm5
    movq       mm7, mm5
    movq       mm0, mm5
    PALIGNR    mm5, mm4, 2, mm1
    pshufw     mm1, mm6, 11111001b
    PALIGNR    mm6, mm4, 4, mm2
    pshufw     mm2, mm7, 11111110b
    PALIGNR    mm7, mm4, 6, mm3
    pshufw     mm3, mm0, 11111111b
    movq [r0+r3*1], mm4
    movq [r0+r3*2], mm5
    lea         r0, [r2+r3*2]
    movq [r1+r3*1], mm6
    movq [r1+r3*2], mm7
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm1
    movq [r0+r3*1], mm2
    movq [r0+r3*2], mm3
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_HORIZONTAL_UP
INIT_MMX ssse3
PRED8x8L_HORIZONTAL_UP
2156
2157;-----------------------------------------------------------------------------
2158; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft,
2159;                                    int has_topright, int stride)
2160;-----------------------------------------------------------------------------
2161
; 8x8 luma "horizontal down" intra prediction, MMX+MMXEXT version.
; In:  r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
; The left-edge, top-left and top neighbour pixels are gathered into MMX
; registers, patched when a neighbour block is unavailable, lowpass
; filtered, and the resulting diagonals are interleaved into 8 output rows.
INIT_MMX mmxext
cglobal pred8x8l_horizontal_down_8, 4,5
    sub          r0, r3              ; r0 -> line above the block (top edge)
    lea          r4, [r0+r3*2]
    ; Pack the 8 pixels of the left edge (column src[-1]) into mm3,
    ; two rows at a time via punpckhbw/punpckhwd/punpckhdq.
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0              ; r4 = saved top-edge line pointer
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1             ; mm3 = the 8 left-edge pixels, packed
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]            ; top edge
    mov          r0, r4              ; restore r0 to the top-edge line
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0     ; left edge shifted by one byte
    PALIGNR     mm1, mm2, 1, mm2     ; left edge merged with a top byte
    test        r1, r1
    jnz .do_left                     ; top-left available -> no fixup
.fix_lt_1:
    ; No top-left pixel: patch the corner byte of mm1 that was derived
    ; from it (isolate one byte with the shift pair, then xor it in).
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    ; Top present but no top-left: patch the low byte of the packed
    ; top pixels (mm2) so the filter does not read the missing corner.
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    ; No top-right block: patch the high byte of the shifted top pixels.
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    ; Lowpass-filter the left edge and the corner.
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq        mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    ; Load the top edge together with its left/right byte neighbours.
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2                     ; missing top-left -> patch, fall to .do_top
    test         r2, r2
    jz .fix_tr_1                     ; missing top-right -> patch, fall to .do_top
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5  ; filtered top edge
    movq       mm5, mm4
    lea         r1, [r0+r3*2]        ; r1 no longer needed as a flag; row base
    psllq      mm7, 56
    movq       mm2, mm5
    movq       mm3, mm6
    movq       mm4, mm2
    PALIGNR    mm2, mm6, 7, mm5
    PALIGNR    mm6, mm7, 7, mm0
    lea         r2, [r1+r3*2]
    PALIGNR    mm4, mm3, 1, mm7
    movq       mm5, mm3
    pavgb      mm3, mm6              ; pairwise averages -> half the outputs
    PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7  ; filtered -> the other half
    movq       mm4, mm2
    movq       mm1, mm2
    lea         r4, [r2+r3*2]        ; r0/r1/r2/r4 = the four row-pair bases
    psrlq      mm4, 16
    psrlq      mm1, 8
    PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5  ; filtered top diagonal
    ; Interleave averages with filtered pixels; each successive output row
    ; is the previous one shifted by two bytes (PALIGNR by 2/4/6).
    movq       mm7, mm3
    punpcklbw  mm3, mm0
    punpckhbw  mm7, mm0
    movq       mm1, mm7
    movq       mm0, mm7
    movq       mm4, mm7
    movq [r4+r3*2], mm3
    PALIGNR    mm7, mm3, 2, mm5
    movq [r4+r3*1], mm7
    PALIGNR    mm1, mm3, 4, mm5
    movq [r2+r3*2], mm1
    PALIGNR    mm0, mm3, 6, mm3
    movq [r2+r3*1], mm0
    movq       mm2, mm6
    movq       mm3, mm6
    movq [r1+r3*2], mm4
    PALIGNR    mm6, mm4, 2, mm5
    movq [r1+r3*1], mm6
    PALIGNR    mm2, mm4, 4, mm5
    movq [r0+r3*2], mm2
    PALIGNR    mm3, mm4, 6, mm4
    movq [r0+r3*1], mm3
    RET
2276
; 8x8 luma "horizontal down" prediction, SSE2/SSSE3 version.  Neighbour
; pixels are gathered and lowpass-filtered in MMX registers, then combined
; into XMM registers for the 16-byte-wide diagonal computation and stores.
; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
%macro PRED8x8L_HORIZONTAL_DOWN 0
cglobal pred8x8l_horizontal_down_8, 4,5
    sub          r0, r3              ; r0 -> line above the block (top edge)
    lea          r4, [r0+r3*2]
    ; Pack the 8 pixels of the left edge (column src[-1]) into mm3.
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0              ; r4 = saved top-edge line pointer
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1             ; mm3 = the 8 left-edge pixels, packed
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]            ; top edge
    mov          r0, r4              ; restore r0 to the top-edge line
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0     ; left edge shifted by one byte
    PALIGNR     mm1, mm2, 1, mm2     ; left edge merged with a top byte
    test        r1, r1
    jnz .do_left                     ; top-left available -> no fixup
.fix_lt_1:
    ; No top-left pixel: patch the corner byte derived from it.
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    ; Top present but no top-left: patch the low byte of the packed top.
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    ; No top-right block: patch the high byte of the shifted top pixels.
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    ; No top-right: replicate the last left-edge byte across mm1 instead
    ; of filtering real top-right data.
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_left:
    ; Filter the left edge; accumulate the result into the high half of xmm0.
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq    xmm0, mm2
    pslldq     xmm0, 8
    movq        mm4, mm0
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    movq2dq    xmm2, mm1
    pslldq     xmm2, 15              ; keep only the lowest byte ...
    psrldq     xmm2, 8               ; ... repositioned at byte 7 (corner)
    por        xmm0, xmm2            ; xmm0 = filtered left edge + corner
    ; Load the top edge together with its left/right byte neighbours.
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2                     ; missing top-left -> patch, fall through
    test         r2, r2
    jz .fix_tr_1                     ; missing top-right -> patch, fall through
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5  ; filtered top edge
    movq2dq    xmm1, mm4
    test         r2, r2
    jz .fix_tr_2                     ; no real top-right -> replicate instead
    ; Filter the 8 top-right pixels.
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq    xmm5, mm1
    pslldq     xmm5, 8
    por        xmm1, xmm5            ; xmm1 = filtered top + top-right
INIT_XMM cpuname                     ; switch so PALIGNR expands for XMM regs
    lea         r2, [r4+r3*2]        ; r0/r1/r2/r4 = the four row-pair bases
    movdqa    xmm2, xmm1
    movdqa    xmm3, xmm1
    PALIGNR   xmm1, xmm0, 7, xmm4
    PALIGNR   xmm2, xmm0, 9, xmm5
    lea         r1, [r2+r3*2]
    PALIGNR   xmm3, xmm0, 8, xmm0
    movdqa    xmm4, xmm1
    pavgb     xmm4, xmm3             ; pairwise averages -> half the outputs
    lea         r0, [r1+r3*2]
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5  ; filtered -> other half
    punpcklbw xmm4, xmm0             ; interleave averaged/filtered pixels
    movhlps   xmm0, xmm4
    ; Each successive output row is the previous one shifted by two bytes.
    movq   [r0+r3*2], xmm4
    movq   [r2+r3*2], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r0+r3*1], xmm4
    movq   [r2+r3*1], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r1+r3*2], xmm4
    movq   [r4+r3*2], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r1+r3*1], xmm4
    movq   [r4+r3*1], xmm0
    RET
%endmacro
2400
; SSE2 and SSSE3 instances.  Note they start as INIT_MMX: the edge setup
; runs in MMX registers; the INIT_XMM inside the macro switches over for
; the wide part of the computation.
INIT_MMX sse2
PRED8x8L_HORIZONTAL_DOWN
INIT_MMX ssse3
PRED8x8L_HORIZONTAL_DOWN
2405
2406;-------------------------------------------------------------------------------
2407; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2408;-------------------------------------------------------------------------------
2409
; 4x4 DC intra prediction: every output pixel is the rounded mean of the
; 4 top and 4 left neighbour pixels.
; In: r0 = src, r1 = topright (clobbered as scratch), r2 = stride
INIT_MMX mmxext
cglobal pred4x4_dc_8, 3,5
    pxor   mm7, mm7
    mov     r4, r0               ; keep src for the row-0 store
    sub     r0, r2               ; r0 -> line above the block
    movd   mm0, [r0]             ; the 4 top pixels
    psadbw mm0, mm7              ; horizontal byte sum of the top pixels
    movzx  r1d, byte [r0+r2*1-1] ; left pixel, row 0
    movd   r3d, mm0
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1] ; left pixel, row 1
    lea     r0, [r0+r2*2]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*1-1] ; left pixel, row 2
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1] ; left pixel, row 3
    add    r3d, r1d
    add    r3d, 4                ; rounding term
    shr    r3d, 3                ; /8 -> DC value
    imul   r3d, 0x01010101       ; broadcast the byte to all 4 dword lanes
    mov   [r4+r2*0], r3d         ; row 0
    mov   [r0+r2*0], r3d         ; row 1 (r0 = src + stride here)
    mov   [r0+r2*1], r3d         ; row 2
    mov   [r0+r2*2], r3d         ; row 3
    RET
2435
2436;-----------------------------------------------------------------------------
2437; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
2438;                                 int stride)
2439;-----------------------------------------------------------------------------
2440
; 4x4 VP8 "TrueMotion" prediction: dst[x,y] = clip(top[x] + left[y] - topleft),
; with the clip provided by packuswb saturation.
; In: r0 = src, r1 = topright (clobbered as scratch), r2 = stride
; Instantiated for plain MMX and for MMXEXT (which has pshufw broadcast).
%macro PRED4x4_TM 0
cglobal pred4x4_tm_vp8_8, 3,6
    sub        r0, r2            ; r0 -> line above the block
    pxor      mm7, mm7
    movd      mm0, [r0]
    punpcklbw mm0, mm7           ; mm0 = 4 top pixels widened to words
    movzx     r4d, byte [r0-1]   ; r4d = top-left pixel
    mov       r5d, 2             ; 2 loop iterations x 2 rows each
.loop:
    movzx     r1d, byte [r0+r2*1-1]  ; left pixel of the next row
    movzx     r3d, byte [r0+r2*2-1]  ; left pixel of the row after it
    sub       r1d, r4d           ; left - topleft (may go negative)
    sub       r3d, r4d
    movd      mm2, r1d
    movd      mm4, r3d
%if cpuflag(mmxext)
    pshufw    mm2, mm2, 0        ; broadcast word 0 to all 4 word lanes
    pshufw    mm4, mm4, 0
%else
    punpcklwd mm2, mm2           ; plain-MMX broadcast fallback
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%endif
    paddw     mm2, mm0           ; top + (left - topleft), per pixel
    paddw     mm4, mm0
    packuswb  mm2, mm2           ; saturate each word to [0,255]
    packuswb  mm4, mm4
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm4
    lea        r0, [r0+r2*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro
2476
; Instantiate the TrueMotion predictor for MMX and MMXEXT.
INIT_MMX mmx
PRED4x4_TM
INIT_MMX mmxext
PRED4x4_TM
2481
; SSSE3 TrueMotion: pshufb with tm_shuf (bytes 0x03,0x80 repeated) broadcasts
; byte 3 of a loaded dword into the low byte of every word lane (0x80 -> 0),
; i.e. it widens-and-broadcasts the edge pixel in one instruction.
; Registers are written as mmN explicitly, so the block runs on MMX regs;
; INIT_XMM here affects the emitted name suffix, not the register width.
; In: r0 = src, r1 = topright (reused as row pointer), r2 = stride
INIT_XMM ssse3
cglobal pred4x4_tm_vp8_8, 3,3
    sub         r0, r2           ; r0 -> line above the block
    movq       mm6, [tm_shuf]
    pxor       mm1, mm1
    movd       mm0, [r0]
    punpcklbw  mm0, mm1          ; mm0 = 4 top pixels widened to words
    movd       mm7, [r0-4]       ; dword whose byte 3 is the top-left pixel
    pshufb     mm7, mm6          ; mm7 = topleft broadcast as words
    lea         r1, [r0+r2*2]
    movd       mm2, [r0+r2*1-4]  ; byte 3 = left pixel of row 0
    movd       mm3, [r0+r2*2-4]  ; row 1
    movd       mm4, [r1+r2*1-4]  ; row 2
    movd       mm5, [r1+r2*2-4]  ; row 3
    pshufb     mm2, mm6          ; broadcast each left pixel as words
    pshufb     mm3, mm6
    pshufb     mm4, mm6
    pshufb     mm5, mm6
    psubw      mm0, mm7          ; top - topleft
    paddw      mm2, mm0          ; + left, per row
    paddw      mm3, mm0
    paddw      mm4, mm0
    paddw      mm5, mm0
    packuswb   mm2, mm2          ; saturate to [0,255]
    packuswb   mm3, mm3
    packuswb   mm4, mm4
    packuswb   mm5, mm5
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm3
    movd [r1+r2*1], mm4
    movd [r1+r2*2], mm5
    RET
2514
2515;-----------------------------------------------------------------------------
2516; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
2517;                                       int stride)
2518;-----------------------------------------------------------------------------
2519
; 4x4 VP8 vertical prediction: the 3-tap-filtered top row is copied into
; all four output rows.
; In: r0 = src, r1 = topright (reused as row pointer), r2 = stride
INIT_MMX mmxext
cglobal pred4x4_vertical_vp8_8, 3,3
    sub       r0, r2   ; r0 -> line above the block
    movd      m1, [r0-1] ;lt t0 t1 t2
    movd      m0, [r0]
    mova      m2, m0   ;t0 t1 t2 t3
    punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
    lea       r1, [r0+r2*2]
    psrlq     m0, 8    ;t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4 ; 3-tap smoothing of the top edge
    movd [r0+r2*1], m3
    movd [r0+r2*2], m3
    movd [r1+r2*1], m3
    movd [r1+r2*2], m3
    RET
2535
2536;-----------------------------------------------------------------------------
2537; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright,
2538;                                    int stride)
2539;-----------------------------------------------------------------------------
; 4x4 "down left" diagonal prediction from the top and top-right pixels.
; In: r0 = src, r1 = topright, r2 = stride
INIT_MMX mmxext
cglobal pred4x4_down_left_8, 3,3
    sub       r0, r2             ; r0 -> line above the block
    movq      m1, [r0]           ; t0..t3 (upper bytes don't matter yet)
    punpckldq m1, [r1]           ; t0..t7
    movq      m2, m1
    movq      m3, m1
    psllq     m1, 8              ; t(i-1): neighbours shifted by one byte
    ; xor/shift/xor trick: m2 becomes t(i+1) with the last pixel replicated
    ; into the top byte (no data past t7 to shift in).
    pxor      m2, m1
    psrlq     m2, 8
    pxor      m2, m3
    PRED4x4_LOWPASS m0, m1, m2, m3, m4 ; filtered diagonal pixels
    lea       r1, [r0+r2*2]
    psrlq     m0, 8              ; each row starts one pixel further along
    movd      [r0+r2*1], m0
    psrlq     m0, 8
    movd      [r0+r2*2], m0
    psrlq     m0, 8
    movd      [r1+r2*1], m0
    psrlq     m0, 8
    movd      [r1+r2*2], m0
    RET
2562
2563;------------------------------------------------------------------------------
2564; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright,
2565;                                        int stride)
2566;------------------------------------------------------------------------------
2567
; 4x4 "vertical left" prediction from the top and top-right pixels: even
; rows use 2-tap averages, odd rows use the 3-tap lowpass filter.
; In: r0 = src, r1 = topright (reused as row pointer), r2 = stride
INIT_MMX mmxext
cglobal pred4x4_vertical_left_8, 3,3
    sub       r0, r2             ; r0 -> line above the block
    movq      m1, [r0]           ; t0..t3
    punpckldq m1, [r1]           ; t0..t7
    movq      m3, m1
    movq      m2, m1
    psrlq     m3, 8              ; t(i+1)
    psrlq     m2, 16             ; t(i+2)
    movq      m4, m3
    pavgb     m4, m1             ; avg(t(i), t(i+1)) -> rows 0/2
    PRED4x4_LOWPASS m0, m1, m2, m3, m5 ; 3-tap filtered -> rows 1/3
    lea       r1, [r0+r2*2]
    movh      [r0+r2*1], m4      ; row 0
    movh      [r0+r2*2], m0      ; row 1
    psrlq     m4, 8              ; rows 2/3 start one pixel further along
    psrlq     m0, 8
    movh      [r1+r2*1], m4      ; row 2
    movh      [r1+r2*2], m0      ; row 3
    RET
2588
2589;------------------------------------------------------------------------------
2590; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright,
2591;                                        int stride)
2592;------------------------------------------------------------------------------
2593
; 4x4 "horizontal up" prediction, built only from the 4 left-edge pixels.
; In: r0 = src, r1 = topright (reused as row pointer), r2 = stride
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_8, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    ; Gather the 4 left-edge pixels (each loaded as the top byte of a
    ; dword at src[-4]) into consecutive bytes of m0.
    movd      m0, [r0+r2*1-4]
    punpcklbw m0, [r0+r2*2-4]
    movd      m1, [r1+r2*1-4]
    punpcklbw m1, [r1+r2*2-4]
    punpckhwd m0, m1
    movq      m1, m0
    punpckhbw m1, m1
    pshufw    m1, m1, 0xFF       ; m1 = last left pixel replicated in all bytes
    punpckhdq m0, m1             ; l0 l1 l2 l3 followed by replicated l3
    movq      m2, m0
    movq      m3, m0
    movq      m7, m0
    psrlq     m2, 16             ; l(i+2)
    psrlq     m3, 8              ; l(i+1)
    pavgb     m7, m3             ; 2-tap averages
    PRED4x4_LOWPASS m4, m0, m2, m3, m5 ; 3-tap filtered values
    punpcklbw m7, m4             ; interleave average/filtered output pixels
    movd    [r0+r2*1], m7        ; row 0
    psrlq    m7, 16              ; each row starts one pixel pair further
    movd    [r0+r2*2], m7        ; row 1
    psrlq    m7, 16
    movd    [r1+r2*1], m7        ; row 2
    movd    [r1+r2*2], m1        ; row 3: last left pixel replicated
    RET
2622
2623;------------------------------------------------------------------------------
2624; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src,
2625;                                          const uint8_t *topright, int stride)
2626;------------------------------------------------------------------------------
2627
; 4x4 "horizontal down" prediction from the left, top-left and top pixels.
; In: r0 = src, r1 = topright (reused as row pointer), r2 = stride
INIT_MMX mmxext
cglobal pred4x4_horizontal_down_8, 3,3
    sub       r0, r2          ; r0 -> line above the block
    lea       r1, [r0+r2*2]
    movh      m0, [r0-4]      ; lt ..
    punpckldq m0, [r0]        ; t3 t2 t1 t0 lt .. .. ..
    psllq     m0, 8           ; t2 t1 t0 lt .. .. .. ..
    movd      m1, [r1+r2*2-4] ; l3
    punpcklbw m1, [r1+r2*1-4] ; l2 l3
    movd      m2, [r0+r2*2-4] ; l1
    punpcklbw m2, [r0+r2*1-4] ; l0 l1
    punpckhwd m1, m2          ; l0 l1 l2 l3
    punpckhdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
    movq      m0, m1
    movq      m2, m1
    movq      m5, m1
    psrlq     m0, 16          ; .. .. t2 t1 t0 lt l0 l1
    psrlq     m2, 8           ; .. t2 t1 t0 lt l0 l1 l2
    pavgb     m5, m2          ; 2-tap averages -> even-position pixels
    PRED4x4_LOWPASS m3, m1, m0, m2, m4 ; 3-tap filtered -> odd-position pixels
    punpcklbw m5, m3          ; interleave average/filtered pixels
    psrlq     m3, 32
    PALIGNR   m3, m5, 6, m4   ; assemble the top output row
    movh      [r1+r2*2], m5   ; bottom row first ...
    psrlq     m5, 16          ; each row up is shifted by one pixel pair
    movh      [r1+r2*1], m5
    psrlq     m5, 16
    movh      [r0+r2*2], m5
    movh      [r0+r2*1], m3   ; ... top row last
    RET
2658
2659;-----------------------------------------------------------------------------
2660; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src,
2661;                                         const uint8_t *topright, int stride)
2662;-----------------------------------------------------------------------------
2663
; 4x4 "vertical right" prediction from the left, top-left and top pixels:
; rows 0/2 come from 2-tap averages, rows 1/3 from the 3-tap filter, with
; filtered left pixels shifted in for the lower rows.
; In: r0 = src, r1 = topright (reused as row pointer), r2 = stride
INIT_MMX mmxext
cglobal pred4x4_vertical_right_8, 3,3
    sub     r0, r2                      ; r0 -> line above the block
    lea     r1, [r0+r2*2]
    movh    m0, [r0]                    ; ........t3t2t1t0
    movq    m5, m0
    PALIGNR m0, [r0-8], 7, m1           ; ......t3t2t1t0lt
    pavgb   m5, m0                      ; avg of top and shifted top -> row 0
    PALIGNR m0, [r0+r2*1-8], 7, m1      ; ....t3t2t1t0ltl0
    movq    m1, m0
    PALIGNR m0, [r0+r2*2-8], 7, m2      ; ..t3t2t1t0ltl0l1
    movq    m2, m0
    PALIGNR m0, [r1+r2*1-8], 7, m3      ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4  ; 3-tap filtered -> row 1
    movq    m1, m3
    psrlq   m3, 16
    psllq   m1, 48                      ; keep the filtered left pixels
    movh    [r0+r2*1], m5               ; row 0
    movh    [r0+r2*2], m3               ; row 1
    PALIGNR m5, m1, 7, m2               ; shift a filtered left pixel in
    psllq   m1, 8
    movh    [r1+r2*1], m5               ; row 2
    PALIGNR m3, m1, 7, m1
    movh    [r1+r2*2], m3               ; row 3
    RET
2689
2690;-----------------------------------------------------------------------------
2691; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright,
2692;                                     int stride)
2693;-----------------------------------------------------------------------------
2694
; 4x4 "down right" diagonal prediction from the left, top-left and top
; pixels, all gathered into a single register and 3-tap filtered.
; In: r0 = src, r1 = topright (reused as row pointer), r2 = stride
INIT_MMX mmxext
cglobal pred4x4_down_right_8, 3,3
    sub       r0, r2                    ; r0 -> line above the block
    lea       r1, [r0+r2*2]
    ; Gather left pixels, the top-left corner and the top row into m3.
    movq      m1, [r1-8]
    movq      m2, [r0+r2*1-8]
    punpckhbw m2, [r0-8]
    movh      m3, [r0]                  ; t0..t3
    punpckhwd m1, m2
    PALIGNR   m3, m1, 5, m1
    movq      m1, m3
    PALIGNR   m3, [r1+r2*1-8], 7, m4    ; shift in the next left pixel
    movq      m2, m3
    PALIGNR   m3, [r1+r2*2-8], 7, m4    ; shift in the last left pixel
    PRED4x4_LOWPASS m0, m3, m1, m2, m4  ; filtered diagonal pixels
    movh      [r1+r2*2], m0             ; bottom row first ...
    psrlq     m0, 8                     ; each row up shifts by one pixel
    movh      [r1+r2*1], m0
    psrlq     m0, 8
    movh      [r0+r2*2], m0
    psrlq     m0, 8
    movh      [r0+r2*1], m0             ; ... top row last
    RET
2718