1;******************************************************************************
2;* H.264 intra prediction asm optimizations
3;* Copyright (c) 2010 Jason Garrett-Glaser
4;* Copyright (c) 2010 Holger Lubitz
5;* Copyright (c) 2010 Loren Merritt
6;* Copyright (c) 2010 Ronald S. Bultje
7;*
8;* This file is part of Libav.
9;*
10;* Libav is free software; you can redistribute it and/or
11;* modify it under the terms of the GNU Lesser General Public
12;* License as published by the Free Software Foundation; either
13;* version 2.1 of the License, or (at your option) any later version.
14;*
15;* Libav is distributed in the hope that it will be useful,
16;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18;* Lesser General Public License for more details.
19;*
20;* You should have received a copy of the GNU Lesser General Public
21;* License along with Libav; if not, write to the Free Software
22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23;******************************************************************************
24
25%include "x86inc.asm"
26%include "x86util.asm"
27
SECTION_RODATA

; pshufb control: replicate byte 3 of the source (the pixel loaded into the
; top byte of a movd from [ptr-4]) into every even output byte and zero the
; odd bytes (0x80 => lane forced to 0), i.e. expand one byte straight to
; 8 words; used by pred8x8_tm_vp8_ssse3
tm_shuf: times 8 db 0x03, 0x80
; NOTE(review): pw_ff00 is not referenced in this chunk -- presumably used
; elsewhere in the file/project; confirm before removing
pw_ff00: times 8 dw 0xff00
; pmaddubsw coefficients for the 16x16 plane H term: low half multiplies
; top[-8..-1] by -8..-1, high half multiplies top[1..8] by 1..8
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
; same idea for the 8x8 plane H term (4 taps per side, upper halves unused)
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
; word ramps used by the plane predictors
pw_0to7:     dw  0,  1,  2,  3,  4,  5,  6,  7
pw_1to8:     dw  1,  2,  3,  4,  5,  6,  7,  8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4
40
41SECTION .text
42
43cextern pb_1
44cextern pb_3
45cextern pw_4
46cextern pw_5
47cextern pw_8
48cextern pw_16
49cextern pw_17
50cextern pw_32
51
52;-----------------------------------------------------------------------------
53; void pred16x16_vertical(uint8_t *src, int stride)
54;-----------------------------------------------------------------------------
55
; Vertical 16x16 prediction: copy the 16 pixels above the block into all
; 16 rows.  In: r0 = src, r1 = stride.
cglobal pred16x16_vertical_mmx, 2,3
    sub   r0, r1              ; r0 -> row above the block
    mov   r2, 8               ; 8 iterations x 2 rows = 16 rows
    movq mm0, [r0+0]          ; top row, left 8 pixels
    movq mm1, [r0+8]          ; top row, right 8 pixels
.loop:
    movq [r0+r1*1+0], mm0
    movq [r0+r1*1+8], mm1
    movq [r0+r1*2+0], mm0
    movq [r0+r1*2+8], mm1
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
    REP_RET
70
; Vertical 16x16 prediction, SSE: one aligned 16-byte store per row.
cglobal pred16x16_vertical_sse, 2,3
    sub   r0, r1              ; r0 -> row above the block
    mov   r2, 4               ; 4 iterations x 4 rows = 16 rows
    movaps xmm0, [r0]         ; the 16 top pixels (src assumed 16-aligned)
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea   r0, [r0+r1*2]
    dec   r2
    jg .loop
    REP_RET
85
86;-----------------------------------------------------------------------------
87; void pred16x16_horizontal(uint8_t *src, int stride)
88;-----------------------------------------------------------------------------
89
; Horizontal 16x16 prediction: replicate each row's left neighbour byte
; across the whole row.  %1 selects the byte-splat method.
%macro PRED16x16_H 1
cglobal pred16x16_horizontal_%1, 2,3
    mov       r2, 8                  ; 8 iterations x 2 rows
%ifidn %1, ssse3
    mova      m2, [pb_3]             ; pshufb mask: broadcast byte 3
%endif
.loop:
    movd      m0, [r0+r1*0-4]        ; left neighbour lands in byte 3
    movd      m1, [r0+r1*1-4]

%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0                 ; duplicate byte 3 into word 3
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff           ; broadcast word 3
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0                 ; plain-mmx broadcast of the top word
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
    mova [r0+r1*0+8], m0             ; mmx regs are 8 bytes: store the right
    mova [r0+r1*1+8], m1             ; half of each 16-byte row separately
%endif

    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_H mmx
PRED16x16_H mmxext
INIT_XMM
PRED16x16_H ssse3
132
133;-----------------------------------------------------------------------------
134; void pred16x16_dc(uint8_t *src, int stride)
135;-----------------------------------------------------------------------------
136
; 16x16 DC prediction: dc = (sum of 16 top + 16 left pixels + 16) >> 5,
; splatted to all 256 pixels.  %1 selects the byte-splat method.
%macro PRED16x16_DC 1
cglobal pred16x16_dc_%1, 2,7
    mov       r4, r0              ; keep src pointer for the store loop
    sub       r0, r1              ; r0 -> row above the block
    pxor      mm0, mm0
    pxor      mm1, mm1
    psadbw    mm0, [r0+0]         ; sum of top[0..7]
    psadbw    mm1, [r0+8]         ; sum of top[8..15]
    dec        r0                 ; r0 -> one byte left of the top row
    movzx     r5d, byte [r0+r1*1] ; first left pixel
    paddw     mm0, mm1
    movd      r6d, mm0            ; r6d = sum of the 16 top pixels
    lea        r0, [r0+r1*2]
%rep 7
    movzx     r2d, byte [r0+r1*0] ; accumulate the remaining left pixels,
    movzx     r3d, byte [r0+r1*1] ; two per unrolled step
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0] ; 16th left pixel
    add       r5d, r6d
    lea       r2d, [r2+r5+16]     ; total + rounding bias
    shr       r2d, 5              ; r2d = dc value
%ifidn %1, mmxext
    movd       m0, r2d
    punpcklbw  m0, m0             ; duplicate dc byte into a word...
    pshufw     m0, m0, 0          ; ...then broadcast the word
%elifidn %1, sse2
    movd       m0, r2d
    punpcklbw  m0, m0
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0
%elifidn %1, ssse3
    pxor       m1, m1
    movd       m0, r2d
    pshufb     m0, m1             ; all-zero mask broadcasts the low byte
%endif

%if mmsize==8
    mov       r3d, 8              ; 8 iterations x 2 rows, 2 stores per row
.loop:
    mova [r4+r1*0+0], m0
    mova [r4+r1*0+8], m0
    mova [r4+r1*1+0], m0
    mova [r4+r1*1+8], m0
%else
    mov       r3d, 4              ; 4 iterations x 4 rows
.loop:
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
    lea   r4, [r4+r1*2]
    mova [r4+r1*0], m0
    mova [r4+r1*1], m0
%endif
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_DC mmxext
INIT_XMM
PRED16x16_DC   sse2
PRED16x16_DC  ssse3
203
204;-----------------------------------------------------------------------------
205; void pred16x16_tm_vp8(uint8_t *src, int stride)
206;-----------------------------------------------------------------------------
207
; VP8 "TrueMotion" 16x16: pred[x,y] = clip(top[x] + left[y] - topleft).
; The top row is pre-widened to words; per row the (left - topleft) delta
; is broadcast and added, then saturated back to bytes.
%macro PRED16x16_TM_MMX 1
cglobal pred16x16_tm_vp8_%1, 2,5
    sub        r0, r1             ; r0 -> row above the block
    pxor      mm7, mm7
    movq      mm0, [r0+0]         ; top[0..7]
    movq      mm2, [r0+8]         ; top[8..15]
    movq      mm1, mm0
    movq      mm3, mm2
    punpcklbw mm0, mm7            ; widen the top pixels to 4x4 words
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx     r3d, byte [r0-1]    ; topleft
    mov       r4d, 16             ; 16 rows, one per iteration
.loop:
    movzx     r2d, byte [r0+r1-1] ; this row's left pixel
    sub       r2d, r3d            ; delta = left - topleft (signed)
    movd      mm4, r2d
%ifidn %1, mmx
    punpcklwd mm4, mm4            ; plain-mmx word broadcast
    punpckldq mm4, mm4
%else
    pshufw    mm4, mm4, 0
%endif
    movq      mm5, mm4
    movq      mm6, mm4
    movq      mm7, mm4
    paddw     mm4, mm0            ; top + delta, 4 words at a time
    paddw     mm5, mm1
    paddw     mm6, mm2
    paddw     mm7, mm3
    packuswb  mm4, mm5            ; saturate back to bytes (the clip)
    packuswb  mm6, mm7
    movq [r0+r1+0], mm4
    movq [r0+r1+8], mm6
    add        r0, r1
    dec       r4d
    jg .loop
    REP_RET
%endmacro

PRED16x16_TM_MMX mmx
PRED16x16_TM_MMX mmxext
251
; VP8 TrueMotion 16x16, sse2: pred[x,y] = clip(top[x] + left[y] - topleft);
; processes two rows per iteration.
cglobal pred16x16_tm_vp8_sse2, 2,6,6
    sub          r0, r1           ; r0 -> row above the block
    pxor       xmm2, xmm2
    movdqa     xmm0, [r0]         ; top row
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2         ; top[0..7] as words
    punpckhbw  xmm1, xmm2         ; top[8..15] as words
    movzx       r4d, byte [r0-1]  ; topleft
    mov         r5d, 8            ; 8 iterations x 2 rows
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d          ; left - topleft for row 0
    sub         r3d, r4d          ; left - topleft for row 1
    movd       xmm2, r2d
    movd       xmm4, r3d
    pshuflw    xmm2, xmm2, 0      ; broadcast the deltas to 8 words
    pshuflw    xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa     xmm3, xmm2
    movdqa     xmm5, xmm4
    paddw      xmm2, xmm0
    paddw      xmm3, xmm1
    paddw      xmm4, xmm0
    paddw      xmm5, xmm1
    packuswb   xmm2, xmm3         ; saturate back to 16 bytes per row
    packuswb   xmm4, xmm5
    movdqa [r0+r1*1], xmm2
    movdqa [r0+r1*2], xmm4
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET
286
287;-----------------------------------------------------------------------------
288; void pred16x16_plane(uint8_t *src, int stride)
289;-----------------------------------------------------------------------------
290
; 16x16 plane prediction.
; %1 = isa (mmx/mmx2/sse2/ssse3), %2 = xmm reg count for cglobal,
; %3 = codec variant (h264/rv40/svq3) -- selects rounding/scaling of the
; H and V gradient terms.
; Fix vs. original: the ".loop" label carried no colon, which NASM flags as
; an orphan label; every other loop label in this file uses "label:".
%macro H264_PRED16x16_PLANE 3
cglobal pred16x16_plane_%3_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    ; ---- H gradient: sum of i*(top[7+i] - top[7-i]), i=1..8 ----
    movh         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m4, m4
    movh         m1, [r0+r1  +3 ]
    movh         m2, [r0+r1  +8 ]
    movh         m3, [r0+r1  +12]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    pmullw       m0, [pw_m8tom1  ]
    pmullw       m1, [pw_m8tom1+8]
    pmullw       m2, [pw_1to8    ]
    pmullw       m3, [pw_1to8  +8]
    paddw        m0, m2
    paddw        m1, m3
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movh         m1, [r0+r1  +8]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m8tom1]
    pmullw       m1, [pw_1to8]
    paddw        m0, m1
%else ; ssse3
    movhps       m0, [r0+r1  +8]
    pmaddubsw    m0, [plane_shuf] ; H coefficients
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1
    ; horizontal add of the partial H sums
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients

    ; ---- V gradient: scalar sum of i*(left[7+i] - left[7-i]) ----
    lea          r4, [r0+r2*8-1]
    lea          r3, [r0+r2*4-1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r5, [r5+r6*2]

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    lea       e_reg, [r3+r1*4]
    lea          r3, [r4+r2*4]

    movzx        r4, byte [e_reg+r2  ]
    movzx        r6, byte [r3        ]
    sub          r6, r4
%ifdef ARCH_X86_64
    lea          r6, [r10+r6*2]
    lea          r5, [r5+r6*2]
    add          r5, r6
%else
    lea          r5, [r5+r6*4]
    lea          r5, [r5+r6*2]
%endif

    movzx        r4, byte [e_reg     ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r3   +r2  ]
    sub         r10, r4
    sub          r5, r10
%else
    movzx        r6, byte [r3   +r2  ]
    sub          r6, r4
    lea          r5, [r5+r6*8]
    sub          r5, r6
%endif

    movzx        r4, byte [e_reg+r1  ]
    movzx        r6, byte [r3   +r2*2]
    sub          r6, r4
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*8]

    movzx        r4, byte [e_reg+r2*2]
    movzx        r6, byte [r3   +r1  ]
    sub          r6, r4
    lea          r5, [r5+r6*4]
    add          r5, r6           ; sum of V coefficients

%ifndef ARCH_X86_64
    mov          r0, r0m          ; e_reg aliased r0 on x86_32: reload src
%endif

    ; per-codec scaling of V (b/c terms of the plane equation)
%ifidn %3, h264
    lea          r5, [r5*5+32]
    sar          r5, 6
%elifidn %3, rv40
    lea          r5, [r5*5]
    sar          r5, 6
%elifidn %3, svq3
    test         r5, r5
    lea          r6, [r5+3]
    cmovs        r5, r6
    sar          r5, 2            ; V/4
    lea          r5, [r5*5]       ; 5*(V/4)
    test         r5, r5
    lea          r6, [r5+15]
    cmovs        r5, r6
    sar          r5, 4            ; (5*(V/4))/16
%endif

    movzx        r4, byte [r0+r1  +15]
    movzx        r3, byte [r3+r2*2   ]
    lea          r3, [r3+r4+1]
    shl          r3, 4            ; 16*(src[15,-1] + src[-1,15] + 1)

    movd        r1d, m0
    movsx       r1d, r1w          ; scalar H sum
%ifnidn %3, svq3
%ifidn %3, h264
    lea         r1d, [r1d*5+32]
%else ; rv40
    lea         r1d, [r1d*5]
%endif
    sar         r1d, 6
%else ; svq3
    test        r1d, r1d
    lea         r4d, [r1d+3]
    cmovs       r1d, r4d
    sar         r1d, 2           ; H/4
    lea         r1d, [r1d*5]     ; 5*(H/4)
    test        r1d, r1d
    lea         r4d, [r1d+15]
    cmovs       r1d, r4d
    sar         r1d, 4           ; (5*(H/4))/16
%endif
    movd         m0, r1d

    add         r1d, r5d
    add         r3d, r1d
    shl         r1d, 3
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%ifidn %3, svq3
    SWAP          0, 1            ; svq3 swaps the roles of H and V
%endif
    mova         m2, m0
%if mmsize == 8
    mova         m5, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
%if mmsize == 16
    psllw        m2, 3
%else
    psllw        m5, 3
    psllw        m2, 2
    mova         m6, m5
    paddw        m6, m2
%endif
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
    paddw        m2, m0           ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw        m5, m0           ; a + {8,9,10,11}*H
    paddw        m6, m0           ; a + {12,13,14,15}*H
%endif

    mov          r4, 8
.loop:
    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova       [r0], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova     [r0+8], m3
%endif
    paddw        m0, m1           ; advance one row: accumulators += V
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova    [r0+r2], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova  [r0+r2+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED16x16_PLANE mmx,   0, h264
H264_PRED16x16_PLANE mmx,   0, rv40
H264_PRED16x16_PLANE mmx,   0, svq3
H264_PRED16x16_PLANE mmx2,  0, h264
H264_PRED16x16_PLANE mmx2,  0, rv40
H264_PRED16x16_PLANE mmx2,  0, svq3
INIT_XMM
H264_PRED16x16_PLANE sse2,  8, h264
H264_PRED16x16_PLANE sse2,  8, rv40
H264_PRED16x16_PLANE sse2,  8, svq3
H264_PRED16x16_PLANE ssse3, 8, h264
H264_PRED16x16_PLANE ssse3, 8, rv40
H264_PRED16x16_PLANE ssse3, 8, svq3
585
586;-----------------------------------------------------------------------------
587; void pred8x8_plane(uint8_t *src, int stride)
588;-----------------------------------------------------------------------------
589
; 8x8 plane prediction (H.264 chroma).
; %1 = isa (mmx/mmx2/sse2/ssse3), %2 = xmm reg count for cglobal.
; Fix vs. original: the ".loop" label carried no colon, which NASM flags as
; an orphan label; every other loop label in this file uses "label:".
%macro H264_PRED8x8_PLANE 2
cglobal pred8x8_plane_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    ; ---- H gradient: sum of i*(top[3+i] - top[3-i]), i=1..4 ----
    movd         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m2, m2
    movh         m1, [r0+r1  +4 ]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m4to4]
    pmullw       m1, [pw_m4to4+8]
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movd         m1, [r0+r1  +4]
    punpckldq    m0, m1
    punpcklbw    m0, m2
    pmullw       m0, [pw_m4to4]
%else ; ssse3
    movhps       m0, [r0+r1  +4]   ; this reads 4 bytes more than necessary
    pmaddubsw    m0, [plane8_shuf] ; H coefficients
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1

    ; horizontal add of the partial H sums
%ifnidn %1, ssse3
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%endif ; !ssse3

%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients

    ; ---- V gradient: scalar sum of i*(left[3+i] - left[3-i]) ----
    lea          r4, [r0+r2*4-1]
    lea          r3, [r0     -1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
    sub          r5, r10
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r6, [r5+r6*2]

    lea          r5, [r6*9+16]    ; V = (17*sum + 16) >> 5
    lea          r5, [r5+r6*8]
    sar          r5, 5

%ifndef ARCH_X86_64
    mov          r0, r0m          ; e_reg aliased r0 on x86_32: reload src
%endif

    movzx        r3, byte [r4+r2*2  ]
    movzx        r4, byte [r0+r1  +7]
    lea          r3, [r3+r4+1]
    shl          r3, 4            ; 16*(src[7,-1] + src[-1,7] + 1)
    movd        r1d, m0
    movsx       r1d, r1w
    imul        r1d, 17           ; H = (17*sum + 16) >> 5
    add         r1d, 16
    sar         r1d, 5
    movd         m0, r1d
    add         r1d, r5d
    sub         r3d, r1d
    add         r1d, r1d
    sub         r3d, r1d          ; a

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%if mmsize == 8
    mova         m2, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw        m2, 2
    paddw        m2, m0           ; a + {4,5,6,7}*H
%endif

    mov          r4, 4
ALIGN 16
.loop:
%if mmsize == 16
    mova         m3, m0           ; b[0..7]
    paddw        m0, m1
    psraw        m3, 5
    mova         m4, m0           ; V+b[0..7]
    paddw        m0, m1
    psraw        m4, 5
    packuswb     m3, m4
    movh       [r0], m3
    movhps  [r0+r2], m3
%else ; mmsize == 8
    mova         m3, m0           ; b[0..3]
    mova         m4, m2           ; b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m3, 5
    psraw        m4, 5
    mova         m5, m0           ; V+b[0..3]
    mova         m6, m2           ; V+b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m5, 5
    psraw        m6, 5
    packuswb     m3, m4
    packuswb     m5, m6
    mova       [r0], m3
    mova    [r0+r2], m5
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED8x8_PLANE mmx,   0
H264_PRED8x8_PLANE mmx2,  0
INIT_XMM
H264_PRED8x8_PLANE sse2,  8
H264_PRED8x8_PLANE ssse3, 8
777
778;-----------------------------------------------------------------------------
779; void pred8x8_vertical(uint8_t *src, int stride)
780;-----------------------------------------------------------------------------
781
; Vertical 8x8 prediction: copy the 8 pixels above the block into all 8 rows.
cglobal pred8x8_vertical_mmx, 2,2
    sub    r0, r1                 ; r0 -> row above the block
    movq  mm0, [r0]               ; the 8 top pixels
%rep 3
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea    r0, [r0+r1*2]
%endrep
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET
793
794;-----------------------------------------------------------------------------
795; void pred8x8_horizontal(uint8_t *src, int stride)
796;-----------------------------------------------------------------------------
797
; Horizontal 8x8 prediction: replicate each row's left neighbour byte across
; the row.  %1 selects the byte-splat method; all three variants are
; instantiated under INIT_MMX below, so even the ssse3 one uses mmx regs.
%macro PRED8x8_H 1
cglobal pred8x8_horizontal_%1, 2,3
    mov       r2, 4               ; 4 iterations x 2 rows
%ifidn %1, ssse3
    mova      m2, [pb_3]          ; pshufb mask: broadcast byte 3
%endif
.loop:
    movd      m0, [r0+r1*0-4]     ; left neighbour lands in byte 3
    movd      m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0              ; duplicate byte 3 into word 3
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff        ; broadcast word 3
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0              ; plain-mmx broadcast of the top word
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
%endif
    mova [r0+r1*0], m0
    mova [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg .loop
    REP_RET
%endmacro

INIT_MMX
PRED8x8_H mmx
PRED8x8_H mmxext
PRED8x8_H ssse3
835
836;-----------------------------------------------------------------------------
837; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
838;-----------------------------------------------------------------------------
; dc0 = rounded average of top[0..3], dc1 = of top[4..7]; the left 4 columns
; of all 8 rows get dc0, the right 4 get dc1.  The lea's computing the row
; pointers are interleaved with the SIMD ops for scheduling.
cglobal pred8x8_top_dc_mmxext, 2,5
    sub         r0, r1            ; r0 -> row above the block
    movq       mm0, [r0]
    pxor       mm1, mm1
    pxor       mm2, mm2
    lea         r2, [r0+r1*2]
    punpckhbw  mm1, mm0           ; top[4..7] spread into the odd byte lanes
    punpcklbw  mm0, mm2           ; top[0..3] spread into the even byte lanes
    psadbw     mm1, mm2        ; s1
    lea         r3, [r2+r1*2]
    psadbw     mm0, mm2        ; s0
    psrlw      mm1, 1             ; dc = (s+2)>>2 via s>>1 then rounding
    psrlw      mm0, 1             ; average with zero
    pavgw      mm1, mm2
    lea         r4, [r3+r1*2]
    pavgw      mm0, mm2
    pshufw     mm1, mm1, 0
    pshufw     mm0, mm0, 0     ; dc0 (w)
    packuswb   mm0, mm1        ; dc0,dc1 (b)
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    lea         r0, [r3+r1*2]     ; r0 -> rows 7/8 base
    movq [r2+r1*1], mm0
    movq [r2+r1*2], mm0
    movq [r3+r1*1], mm0
    movq [r3+r1*2], mm0
    movq [r0+r1*1], mm0
    movq [r0+r1*2], mm0
    RET
868
869;-----------------------------------------------------------------------------
870; void pred8x8_dc_mmxext(uint8_t *src, int stride)
871;-----------------------------------------------------------------------------
872
INIT_MMX
; H.264 chroma-style 8x8 DC: the block is split into four 4x4 quadrants.
; s0/s1 = sums of the left/right halves of the top row, s2/s3 = sums of the
; upper/lower halves of the left column.  The quadrant DCs are the rounded
; averages {(s0+s2)/8, s1/4, s3/4, (s1+s3)/8}.
cglobal pred8x8_dc_mmxext, 2,5
    sub       r0, r1              ; r0 -> row above the block
    pxor      m7, m7
    movd      m0, [r0+0]          ; top[0..3]
    movd      m1, [r0+4]          ; top[4..7]
    psadbw    m0, m7            ; s0
    mov       r4, r0              ; keep pointer to the row above
    psadbw    m1, m7            ; s1

    movzx    r2d, byte [r0+r1*1-1]  ; sum the upper 4 left pixels -> s2
    movzx    r3d, byte [r0+r1*2-1]
    lea       r0, [r0+r1*2]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*1-1]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*2-1]
    add      r2d, r3d
    lea       r0, [r0+r1*2]
    movd      m2, r2d            ; s2
    movzx    r2d, byte [r0+r1*1-1]  ; sum the lower 4 left pixels -> s3
    movzx    r3d, byte [r0+r1*2-1]
    lea       r0, [r0+r1*2]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*1-1]
    add      r2d, r3d
    movzx    r3d, byte [r0+r1*2-1]
    add      r2d, r3d
    movd      m3, r2d            ; s3

    punpcklwd m0, m1
    mov       r0, r4
    punpcklwd m2, m3
    punpckldq m0, m2            ; s0, s1, s2, s3
    pshufw    m3, m0, 11110110b ; s2, s1, s3, s3
    lea       r2, [r0+r1*2]
    pshufw    m0, m0, 01110100b ; s0, s1, s3, s1
    paddw     m0, m3
    lea       r3, [r2+r1*2]
    psrlw     m0, 2
    pavgw     m0, m7            ; s0+s2, s1, s3, s1+s3
    lea       r4, [r3+r1*2]
    packuswb  m0, m0              ; expand the 4 dc bytes into the two row
    punpcklbw m0, m0              ; patterns: m0 = dc00 x4 | dc01 x4,
    movq      m1, m0              ; m1 = dc10 x4 | dc11 x4
    punpcklbw m0, m0
    punpckhbw m1, m1
    movq [r0+r1*1], m0            ; top 4 rows
    movq [r0+r1*2], m0
    movq [r2+r1*1], m0
    movq [r2+r1*2], m0
    movq [r3+r1*1], m1            ; bottom 4 rows
    movq [r3+r1*2], m1
    movq [r4+r1*1], m1
    movq [r4+r1*2], m1
    RET
929
930;-----------------------------------------------------------------------------
931; void pred8x8_dc_rv40(uint8_t *src, int stride)
932;-----------------------------------------------------------------------------
933
; RV40 8x8 DC: a single dc over the 8 top + 8 left neighbours,
; dc = (sum + 8) >> 4, splatted to the whole block.
cglobal pred8x8_dc_rv40_mmxext, 2,7
    mov       r4, r0              ; keep src pointer for the store loop
    sub       r0, r1              ; r0 -> row above the block
    pxor      mm0, mm0
    psadbw    mm0, [r0]           ; sum of the 8 top pixels
    dec        r0                 ; r0 -> one byte left of the top row
    movzx     r5d, byte [r0+r1*1] ; first left pixel
    movd      r6d, mm0
    lea        r0, [r0+r1*2]
%rep 3
    movzx     r2d, byte [r0+r1*0] ; accumulate left pixels, two per step
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0] ; 8th left pixel
    add       r5d, r6d
    lea       r2d, [r2+r5+8]      ; total + rounding bias
    shr       r2d, 4              ; r2d = dc value
    movd      mm0, r2d
    punpcklbw mm0, mm0            ; splat the dc byte to all 8 lanes
    pshufw    mm0, mm0, 0
    mov       r3d, 4              ; 4 iterations x 2 rows
.loop:
    movq [r4+r1*0], mm0
    movq [r4+r1*1], mm0
    lea   r4, [r4+r1*2]
    dec   r3d
    jg .loop
    REP_RET
965
966;-----------------------------------------------------------------------------
967; void pred8x8_tm_vp8(uint8_t *src, int stride)
968;-----------------------------------------------------------------------------
969
; VP8 "TrueMotion" 8x8: pred[x,y] = clip(top[x] + left[y] - topleft).
; Two rows per iteration; %1 selects the word-broadcast method.
%macro PRED8x8_TM_MMX 1
cglobal pred8x8_tm_vp8_%1, 2,6
    sub        r0, r1             ; r0 -> row above the block
    pxor      mm7, mm7
    movq      mm0, [r0]           ; top row
    movq      mm1, mm0
    punpcklbw mm0, mm7            ; top[0..3] as words
    punpckhbw mm1, mm7            ; top[4..7] as words
    movzx     r4d, byte [r0-1]    ; topleft
    mov       r5d, 4
.loop:
    movzx     r2d, byte [r0+r1*1-1]
    movzx     r3d, byte [r0+r1*2-1]
    sub       r2d, r4d            ; left - topleft for each of the two rows
    sub       r3d, r4d
    movd      mm2, r2d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2            ; plain-mmx word broadcast
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    movq      mm3, mm2
    movq      mm5, mm4
    paddw     mm2, mm0
    paddw     mm3, mm1
    paddw     mm4, mm0
    paddw     mm5, mm1
    packuswb  mm2, mm3            ; saturate back to bytes (the clip)
    packuswb  mm4, mm5
    movq [r0+r1*1], mm2
    movq [r0+r1*2], mm4
    lea        r0, [r0+r1*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro

PRED8x8_TM_MMX mmx
PRED8x8_TM_MMX mmxext
1014
; VP8 TrueMotion 8x8, sse2: two rows per iteration, each row's prediction
; held in one half of an xmm register after the final pack.
cglobal pred8x8_tm_vp8_sse2, 2,6,4
    sub          r0, r1           ; r0 -> row above the block
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1         ; top row as 8 words
    movzx       r4d, byte [r0-1]  ; topleft
    mov         r5d, 4
.loop:
    movzx       r2d, byte [r0+r1*1-1]
    movzx       r3d, byte [r0+r1*2-1]
    sub         r2d, r4d          ; left - topleft for row 0
    sub         r3d, r4d          ; left - topleft for row 1
    movd       xmm2, r2d
    movd       xmm3, r3d
    pshuflw    xmm2, xmm2, 0      ; broadcast the deltas to 8 words
    pshuflw    xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3         ; row0 in the low 8 bytes, row1 in the high
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r5d
    jg .loop
    REP_RET
1042
; VP8 TrueMotion 8x8, ssse3: tm_shuf pshufb-expands the left-column byte
; (byte 3 of a movd from [ptr-4]) directly to 8 words; the topleft pixel is
; pre-expanded once into xmm5 and subtracted per row.
cglobal pred8x8_tm_vp8_ssse3, 2,3,6
    sub          r0, r1           ; r0 -> row above the block
    movdqa     xmm4, [tm_shuf]
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1         ; top row as 8 words
    movd       xmm5, [r0-4]       ; byte 3 = topleft
    pshufb     xmm5, xmm4         ; topleft broadcast to 8 words
    mov         r2d, 4
.loop:
    movd       xmm2, [r0+r1*1-4]  ; byte 3 = row 0's left pixel
    movd       xmm3, [r0+r1*2-4]  ; byte 3 = row 1's left pixel
    pshufb     xmm2, xmm4
    pshufb     xmm3, xmm4
    psubw      xmm2, xmm5         ; left - topleft
    psubw      xmm3, xmm5
    paddw      xmm2, xmm0         ; + top
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3         ; saturate; row0 low half, row1 high half
    movq   [r0+r1*1], xmm2
    movhps [r0+r1*2], xmm2
    lea          r0, [r0+r1*2]
    dec         r2d
    jg .loop
    REP_RET
1068
; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; Branch-free trick: pavgb(left,right) rounds up, losing the exact floor'd
; average by the carry bit (left^right)&1; subtracting that bit yields
; (left+right)>>1, and the final pavgb with src restores correct rounding.
%macro PRED4x4_LOWPASS 5
    mova    %5, %2                ; save left
    pavgb   %2, %3                ; (left+right+1)>>1
    pxor    %3, %5                ; left^right: bit 0 = the rounding carry
    mova    %1, %4
    pand    %3, [pb_1]
    psubusb %2, %3                ; (left+right)>>1
    pavgb   %1, %2                ; (src + (left+right)>>1 + 1) >> 1
%endmacro
1080
1081;-----------------------------------------------------------------------------
1082; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1083;-----------------------------------------------------------------------------
; 8x8 "top DC" prediction for the filtered-edge (8x8l) mode: lowpass-filter
; the top edge (patching the missing topleft/topright samples according to
; the availability flags in r1/r2), average the 8 filtered pixels with
; rounding, and splat the result to all 8 rows.
; Fix vs. original: the ".body" label carried no colon, which NASM flags as
; an orphan label; the file's other labels use "label:".
%macro PRED8x8L_TOP_DC 1
cglobal pred8x8l_top_dc_%1, 4,4
    sub          r0, r3           ; r0 -> row above the block
    pxor        mm7, mm7
    movq        mm0, [r0-8]       ; bytes left of the top row (for topleft)
    movq        mm3, [r0]         ; top[0..7]
    movq        mm1, [r0+8]       ; bytes right of the top row (topright)
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0  ; mm2 = {topleft, top[0..6]}
    PALIGNR     mm1, mm4, 1, mm4  ; mm1 = {top[1..7], topright}
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    movq        mm5, mm3          ; topleft unavailable: overwrite byte 0
    pxor        mm5, mm2          ; of mm2 with top[0] via masked xor
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    movq        mm5, mm3          ; topright unavailable: overwrite byte 7
    pxor        mm5, mm1          ; of mm1 with top[7] via masked xor
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
    psadbw   mm7, mm0             ; sum of the 8 filtered top pixels
    paddw    mm7, [pw_4]
    psrlw    mm7, 3               ; dc = (sum + 4) >> 3
    pshufw   mm7, mm7, 0
    packuswb mm7, mm7             ; splat the dc byte to 8 lanes
%rep 3
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm7
    movq [r0+r3*2], mm7
    RET
%endmacro

INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_TOP_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_TOP_DC ssse3
1136
1137;-----------------------------------------------------------------------------
1138;void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
1139;-----------------------------------------------------------------------------
1140
; Fill the 8x8 block with the DC value computed from both the lowpass-filtered
; left column (gathered via punpckhbw/wd/dq transposition) and the filtered
; top edge.
; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
%macro PRED8x8L_DC 1
cglobal pred8x8l_dc_%1, 4,5
    sub          r0, r3
    ; gather the 8 left-neighbor bytes (column at src-1) into mm3
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1             ; mm3 = left column l7..l0
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]     ; row below the block (for the shifted copy)
    movq        mm1, [r4]            ; top edge
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0     ; left column shifted right
    PALIGNR     mm1, mm2, 1, mm2     ; left column shifted left (brings in tl)
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    ; no top-left while filtering the left column: patch byte 6 of mm1
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    ; no top-left while filtering the top edge: patch low byte of mm2 with t0
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .body
.fix_tr_1:
    ; no top-right: patch high byte of mm1 with t7
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .body
.do_left:
    ; lowpass-filter the left column; result kept in mm7
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    ; load the top edge and its two shifted copies
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.body:
    lea          r1, [r0+r3*2]
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 ; mm6 = filtered top edge
    pxor        mm0, mm0
    pxor        mm1, mm1
    lea          r2, [r1+r3*2]
    psadbw      mm0, mm7             ; sum of filtered left column
    psadbw      mm1, mm6             ; sum of filtered top edge
    paddw       mm0, [pw_8]
    paddw       mm0, mm1
    lea          r4, [r2+r3*2]
    psrlw       mm0, 4               ; dc = (sum_left + sum_top + 8) >> 4
    pshufw      mm0, mm0, 0
    packuswb    mm0, mm0             ; broadcast dc to all 8 bytes
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    movq [r1+r3*1], mm0
    movq [r1+r3*2], mm0
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm0
    movq [r4+r3*1], mm0
    movq [r4+r3*2], mm0
    RET
%endmacro
; Instantiate pred8x8l_dc for MMXEXT and SSSE3.
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DC mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DC ssse3
1239
1240;-----------------------------------------------------------------------------
1241; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
1242;-----------------------------------------------------------------------------
1243
; Horizontal prediction: each of the 8 rows is filled with a single value
; derived from the lowpass-filtered left-neighbor column.
; In: r0 = src, r1 = has_topleft, r2 = has_topright (unused here), r3 = stride
%macro PRED8x8L_HORIZONTAL 1
cglobal pred8x8l_horizontal_%1, 4,4
    sub          r0, r3
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    test         r1, r1
    ; if has_topleft, start gathering one row higher (r1 = r0);
    ; note lea preserves flags, so the cmovnz still sees the test result
    lea          r1, [r0+r3]
    cmovnz       r1, r0
    ; transpose the 8 left-neighbor bytes into mm3 via punpck chains
    punpckhbw   mm0, [r1+r3*0-8]
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1            ; mm3 = left column
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r1+r3*0-8]
    mov          r0, r2
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0    ; column shifted right
    PALIGNR     mm1, mm2, 1, mm2    ; column shifted left
    ; lowpass-filter the column; filtered result ends up in mm7
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq        mm3, mm7
    lea         r1, [r0+r3*2]
    movq       mm7, mm3
    ; broadcast each filtered byte across its own row via pshufw
    punpckhbw  mm3, mm3
    punpcklbw  mm7, mm7
    pshufw     mm0, mm3, 0xff
    pshufw     mm1, mm3, 0xaa
    lea         r2, [r1+r3*2]
    pshufw     mm2, mm3, 0x55
    pshufw     mm3, mm3, 0x00
    pshufw     mm4, mm7, 0xff
    pshufw     mm5, mm7, 0xaa
    pshufw     mm6, mm7, 0x55
    pshufw     mm7, mm7, 0x00
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm1
    movq [r1+r3*1], mm2
    movq [r1+r3*2], mm3
    movq [r2+r3*1], mm4
    movq [r2+r3*2], mm5
    lea         r0, [r2+r3*2]
    movq [r0+r3*1], mm6
    movq [r0+r3*2], mm7
    RET
%endmacro
1305
; Instantiate pred8x8l_horizontal for MMXEXT and SSSE3.
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL ssse3
1311
1312;-----------------------------------------------------------------------------
1313; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
1314;-----------------------------------------------------------------------------
1315
; Vertical prediction: every row of the 8x8 block is a copy of the
; lowpass-filtered top edge.
; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
%macro PRED8x8L_VERTICAL 1
cglobal pred8x8l_vertical_%1, 4,4
    sub          r0, r3              ; point r0 at the row above the block
    movq        mm0, [r0-8]          ; bytes left of the top edge
    movq        mm3, [r0]            ; top edge t0..t7
    movq        mm1, [r0+8]          ; bytes right of the top edge
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0     ; edge shifted right (tl t0..t6)
    PALIGNR     mm1, mm4, 1, mm4     ; edge shifted left (t1..t7 tr)
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .body
.fix_lt_2:
    ; no top-left: replace the low byte of mm2 with t0
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .body
.fix_tr_1:
    ; no top-right: replace the high byte of mm1 with t7
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
.body:
    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 ; mm0 = filtered top edge
%rep 3
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    lea    r0, [r0+r3*2]
%endrep
    movq [r0+r3*1], mm0
    movq [r0+r3*2], mm0
    RET
%endmacro
1356
; Instantiate pred8x8l_vertical for MMXEXT and SSSE3.
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL ssse3
1362
1363;-----------------------------------------------------------------------------
; void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
1365;-----------------------------------------------------------------------------
1366
INIT_MMX
%define PALIGNR PALIGNR_MMX
; Down-left diagonal prediction, MMX-only variant: the 16-pixel diagonal
; source (filtered top edge + filtered top-right) is kept in two mmx regs
; and each output row is produced by shifting/or-ing them together.
; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
cglobal pred8x8l_down_left_mmxext, 4,5
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]            ; top edge t0..t7
    movq        mm1, [r0+8]          ; top-right
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0     ; edge shifted right
    PALIGNR     mm1, mm4, 1, mm4     ; edge shifted left
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    ; no top-left: replace low byte of mm2 with t0
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    ; no top-right: replace high byte of mm1 with t7
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    ; no top-right at all: extend with t7 replicated
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq        mm7, mm4             ; mm7 = filtered top edge
    test         r2, r2
    jz .fix_tr_2
    ; filter the real top-right 8 pixels into mm1
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    lea          r1, [r0+r3*2]
    movq        mm6, mm1
    psrlq       mm1, 56
    movq        mm4, mm1
    lea          r2, [r1+r3*2]
    movq        mm2, mm6
    PALIGNR     mm2, mm7, 1, mm0
    movq        mm3, mm6
    PALIGNR     mm3, mm7, 7, mm0
    PALIGNR     mm4, mm6, 1, mm0
    movq        mm5, mm7
    movq        mm1, mm7
    movq        mm7, mm6
    lea          r4, [r2+r3*2]
    psllq       mm1, 8
    ; second lowpass pass along the diagonal; mm0/mm1 hold the two output halves
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    ; emit rows bottom-up; each step shifts one byte of mm0 into mm1
    movq  [r4+r3*2], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r4+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r2+r3*2], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r2+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r1+r3*2], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r1+r3*1], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
    movq  [r0+r3*2], mm1
    psllq       mm1, 8
    psrlq       mm0, 56
    por         mm1, mm0
    movq  [r0+r3*1], mm1
    RET
1475
; Down-left diagonal prediction, SSE2/SSSE3 variant: the edge setup is done
; in mmx registers, then the 16-pixel diagonal is assembled in xmm registers
; and rows are produced with psrldq.
; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
%macro PRED8x8L_DOWN_LEFT 1
cglobal pred8x8l_down_left_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]            ; top edge t0..t7
    movq        mm1, [r0+8]          ; top-right
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0     ; edge shifted right
    PALIGNR     mm1, mm4, 1, mm4     ; edge shifted left
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    ; no top-left: replace low byte of mm2 with t0
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    ; no top-right: replace high byte of mm1 with t7
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    ; no top-right at all: extend with t7 replicated
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm3, mm4             ; filtered top edge -> low half of xmm3
    test         r2, r2 ; top_right
    jz .fix_tr_2
    ; filter the real top-right 8 pixels into mm1
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    ; assemble the 16-byte diagonal: filtered top in low half,
    ; filtered top-right in high half, last byte saved separately
    movq2dq    xmm4, mm1
    psrlq       mm1, 56
    movq2dq    xmm5, mm1
    lea         r1, [r0+r3*2]
    pslldq    xmm4, 8
    por       xmm3, xmm4
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1
    pslldq    xmm5, 15
    por       xmm2, xmm5
    lea         r2, [r1+r3*2]
    movdqa    xmm1, xmm3
    pslldq    xmm1, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    ; each row is the previous one shifted by one byte
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    psrldq    xmm0, 1
    lea         r0, [r2+r3*2]
    movq [r1+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r1+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    RET
%endmacro
1558
; Instantiate pred8x8l_down_left for SSE2 and SSSE3; INIT_MMX is re-issued
; because the macro switches to INIT_XMM internally.
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_LEFT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT ssse3
1565
1566;-----------------------------------------------------------------------------
; void pred8x8l_down_right(uint8_t *src, int has_topleft, int has_topright, int stride)
1568;-----------------------------------------------------------------------------
1569
INIT_MMX
%define PALIGNR PALIGNR_MMX
; Down-right diagonal prediction, MMX-only variant: the diagonal source is
; built from the filtered left column (mm6/mm7) and the filtered top edge
; (mm5), then rows are produced bottom-up by byte shifts.
; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
cglobal pred8x8l_down_right_mmxext, 4,5
    sub          r0, r3
    ; gather the 8 left-neighbor bytes into mm3 via punpck transposition
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0     ; column shifted right
    PALIGNR     mm1, mm2, 1, mm2     ; column shifted left
    test        r1, r1 ; top_left
    jz .fix_lt_1
.do_left:
    ; lowpass-filter the left column; mm6/mm7 keep the results for .body
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq        mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    ; load the top edge plus shifted copies
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1 ; top_left
    jz .fix_lt_2
    test         r2, r2 ; top_right
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq        mm5, mm4             ; mm5 = filtered top edge
    jmp .body
.fix_lt_1:
    ; no top-left while filtering the left column: patch byte 6 of mm1
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    ; no top-left while filtering the top edge: patch low byte of mm2 with t0
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2 ; top_right
    jnz .do_top
.fix_tr_1:
    ; no top-right: patch high byte of mm1 with t7
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.body:
    lea         r1, [r0+r3*2]
    movq       mm1, mm7
    movq       mm7, mm5
    movq       mm5, mm6
    movq       mm2, mm7
    lea         r2, [r1+r3*2]
    PALIGNR    mm2, mm6, 1, mm0
    movq       mm3, mm7
    PALIGNR    mm3, mm6, 7, mm0
    movq       mm4, mm7
    lea         r4, [r2+r3*2]
    psrlq      mm4, 8
    ; second lowpass pass along the diagonal; mm0/mm1 are the output halves
    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
    ; emit rows bottom-up; each step shifts one byte of mm1 into mm0
    movq [r4+r3*2], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r4+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r2+r3*2], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r2+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r1+r3*2], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r1+r3*1], mm0
    movq       mm2, mm1
    psrlq      mm0, 8
    psllq      mm2, 56
    psrlq      mm1, 8
    por        mm0, mm2
    movq [r0+r3*2], mm0
    psrlq      mm0, 8
    psllq      mm1, 56
    por        mm0, mm1
    movq [r0+r3*1], mm0
    RET
1702
; Down-right diagonal prediction, SSE2/SSSE3 variant: edges are filtered in
; mmx registers, moved into xmm registers, and the 8 rows are produced from
; one 16-byte diagonal with psrldq.
; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
%macro PRED8x8L_DOWN_RIGHT 1
cglobal pred8x8l_down_right_%1, 4,5
    sub          r0, r3
    ; gather the 8 left-neighbor bytes into mm3 via punpck transposition
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0     ; column shifted right
    PALIGNR     mm1, mm2, 1, mm2     ; column shifted left
    test        r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    ; no top-left while filtering the left column: patch byte 6 of mm1
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    ; no top-left while filtering the top edge: patch low byte of mm2 with t0
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    ; no top-right: patch high byte of mm1 with t7
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    ; filter the left column; stash results in xmm3/xmm1 for the xmm stage
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq2dq    xmm3, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq2dq    xmm1, mm7
    ; load the top edge plus shifted copies
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq   xmm4, mm4
    ; assemble the full 16-byte diagonal (left column low, top edge high)
    lea         r1, [r0+r3*2]
    movdqa    xmm0, xmm3
    pslldq    xmm4, 8
    por       xmm3, xmm4
    lea         r2, [r1+r3*2]
    pslldq    xmm4, 1
    por       xmm1, xmm4
    psrldq    xmm0, 7
    pslldq    xmm0, 15
    psrldq    xmm0, 7
    por       xmm1, xmm0
    lea         r0, [r2+r3*2]
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
    ; write rows bottom-up, two at a time, shifting by one byte per row
    movdqa    xmm1, xmm0
    psrldq    xmm1, 1
    movq [r0+r3*2], xmm0
    movq [r0+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r2+r3*2], xmm0
    movq [r2+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r1+r3*2], xmm0
    movq [r1+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r4+r3*2], xmm0
    movq [r4+r3*1], xmm1
    RET
%endmacro
1812
; Instantiate pred8x8l_down_right for SSE2 and SSSE3; INIT_MMX is re-issued
; because the macro switches to INIT_XMM internally.
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_DOWN_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT ssse3
1819
1820;-----------------------------------------------------------------------------
1821; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
1822;-----------------------------------------------------------------------------
1823
INIT_MMX
%define PALIGNR PALIGNR_MMX
; Vertical-right prediction, MMX-only variant: rows alternate between a
; pavgb-averaged and a lowpass-filtered version of the top edge, shifted
; further into the left column for each lower row pair.
; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
cglobal pred8x8l_vertical_right_mmxext, 4,5
    sub          r0, r3
    ; gather the 8 left-neighbor bytes into mm3 via punpck transposition
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0     ; column shifted right
    PALIGNR     mm1, mm2, 1, mm2     ; column shifted left
    test        r1, r1
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:
    ; no top-left while filtering the left column: patch byte 6 of mm1
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    ; no top-left while filtering the top edge: patch low byte of mm2 with t0
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    ; no top-right: patch high byte of mm1 with t7
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    ; filter the left column; result kept in mm7
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm7, mm2
    ; load the top edge plus shifted copies
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 ; mm6 = filtered top edge
    lea         r1, [r0+r3*2]
    ; rows 0/1: average and lowpass of the edge aligned with the left column
    movq       mm2, mm6
    movq       mm3, mm6
    PALIGNR    mm3, mm7, 7, mm0
    PALIGNR    mm6, mm7, 6, mm1
    movq       mm4, mm3
    pavgb      mm3, mm2
    lea         r2, [r1+r3*2]
    PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
    movq [r0+r3*1], mm3
    movq [r0+r3*2], mm0
    movq       mm5, mm0
    movq       mm6, mm3
    ; derive the per-row left-column bytes, then shift them in row by row
    movq       mm1, mm7
    movq       mm2, mm1
    psllq      mm2, 8
    movq       mm3, mm1
    psllq      mm3, 16
    lea         r4, [r2+r3*2]
    PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
    PALIGNR    mm6, mm0, 7, mm2
    movq [r1+r3*1], mm6
    psllq      mm0, 8
    PALIGNR    mm5, mm0, 7, mm1
    movq [r1+r3*2], mm5
    psllq      mm0, 8
    PALIGNR    mm6, mm0, 7, mm2
    movq [r2+r3*1], mm6
    psllq      mm0, 8
    PALIGNR    mm5, mm0, 7, mm1
    movq [r2+r3*2], mm5
    psllq      mm0, 8
    PALIGNR    mm6, mm0, 7, mm2
    movq [r4+r3*1], mm6
    psllq      mm0, 8
    PALIGNR    mm5, mm0, 7, mm1
    movq [r4+r3*2], mm5
    RET
1931
; Vertical-right prediction, SSE2/SSSE3 variant: the left column and top
; edge are filtered in mmx registers, combined into one xmm register, and
; the averaged/filtered row pairs are produced with xmm shifts.
; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
%macro PRED8x8L_VERTICAL_RIGHT 1
cglobal pred8x8l_vertical_right_%1, 4,5,7
    sub          r0, r3
    ; gather the 8 left-neighbor bytes into mm3 via punpck transposition
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]
    mov          r0, r4
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0     ; column shifted right
    PALIGNR     mm1, mm2, 1, mm2     ; column shifted left
    test        r1, r1
    jnz .do_left
.fix_lt_1:
    ; no top-left while filtering the left column: patch byte 6 of mm1
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    ; no top-left while filtering the top edge: patch low byte of mm2 with t0
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    ; no top-right: patch high byte of mm1 with t7
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    ; filter the left column; stash in low half of xmm0
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq    xmm0, mm2
    ; load the top edge plus shifted copies
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 ; mm6 = filtered top edge
    lea           r1, [r0+r3*2]
    ; combine: left column in low half, top edge in high half of xmm0
    movq2dq     xmm4, mm6
    pslldq      xmm4, 8
    por         xmm0, xmm4
    movdqa      xmm6, [pw_ff00]
    movdqa      xmm1, xmm0
    lea           r2, [r1+r3*2]
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    pslldq      xmm0, 1
    pslldq      xmm1, 2
    pavgb       xmm2, xmm0           ; averaged rows
INIT_XMM
    PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5 ; filtered rows
    ; split odd/even bytes of the filtered result for the shifted-in column
    pandn       xmm6, xmm4
    movdqa      xmm5, xmm4
    psrlw       xmm4, 8
    packuswb    xmm6, xmm4
    movhlps     xmm4, xmm6
    ; write rows: high halves first, then shift one byte per row pair
    movhps [r0+r3*2], xmm5
    movhps [r0+r3*1], xmm2
    psrldq      xmm5, 4
    movss       xmm5, xmm6
    psrldq      xmm2, 4
    movss       xmm2, xmm4
    lea           r0, [r2+r3*2]
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r0+r3*2], xmm5
    movq        [r0+r3*1], xmm2
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r2+r3*2], xmm5
    movq        [r2+r3*1], xmm2
    psrldq      xmm5, 1
    psrldq      xmm2, 1
    movq        [r1+r3*2], xmm5
    movq        [r1+r3*1], xmm2
    RET
%endmacro
2039
; Instantiate pred8x8l_vertical_right for SSE2 and SSSE3; INIT_MMX is
; re-issued because the macro switches to INIT_XMM internally.
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_RIGHT sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_RIGHT ssse3
2046
2047;-----------------------------------------------------------------------------
; void pred8x8l_vertical_left(uint8_t *src, int has_topleft, int has_topright, int stride)
2049;-----------------------------------------------------------------------------
2050
; Vertical-left prediction, SSE2/SSSE3 variant: the filtered top edge and
; filtered top-right are combined into one 16-byte xmm register; even rows
; use a pavgb average, odd rows a lowpass filter, each shifted per row pair.
; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride
%macro PRED8x8L_VERTICAL_LEFT 1
cglobal pred8x8l_vertical_left_%1, 4,4
    sub          r0, r3
    movq        mm0, [r0-8]
    movq        mm3, [r0]            ; top edge t0..t7
    movq        mm1, [r0+8]          ; top-right
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0     ; edge shifted right
    PALIGNR     mm1, mm4, 1, mm4     ; edge shifted left
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    ; no top-left: replace low byte of mm2 with t0
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2
    jnz .do_top
.fix_tr_1:
    ; no top-right: replace high byte of mm1 with t7
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    ; no top-right at all: extend with t7 replicated
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm4, mm4             ; filtered top edge -> low half of xmm4
    test         r2, r2
    jz .fix_tr_2
    ; filter the real top-right 8 pixels into mm1
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq   xmm3, mm1
    lea         r1, [r0+r3*2]
    pslldq    xmm3, 8
    por       xmm4, xmm3             ; full 16-byte diagonal in xmm4
    movdqa    xmm2, xmm4
    movdqa    xmm1, xmm4
    movdqa    xmm3, xmm4
    psrldq    xmm2, 1
    pslldq    xmm1, 1
    pavgb     xmm3, xmm2             ; even rows: byte average
    lea         r2, [r1+r3*2]
INIT_XMM
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5 ; odd rows: lowpass
    psrldq    xmm0, 1
    ; interleave the averaged and filtered rows, shifting per row pair
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    lea         r0, [r2+r3*2]
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r1+r3*1], xmm3
    movq [r1+r3*2], xmm0
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r2+r3*1], xmm3
    movq [r2+r3*2], xmm0
    psrldq    xmm3, 1
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    RET
%endmacro
2130
; Instantiate pred8x8l_vertical_left for SSE2 and SSSE3; INIT_MMX is
; re-issued because the macro switches to INIT_XMM internally.
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL_LEFT sse2
%define PALIGNR PALIGNR_SSSE3
INIT_MMX
PRED8x8L_VERTICAL_LEFT ssse3
2137
2138;-----------------------------------------------------------------------------
2139; void pred8x8l_horizontal_up(uint8_t *src, int has_topleft, int has_topright, int stride)
2140;-----------------------------------------------------------------------------
2141
; Horizontal-up prediction: rows are built from pairwise averages and
; lowpass-filtered values of the (reversed) filtered left column.
; In: r0 = src, r1 = has_topleft, r2 = has_topright (unused here), r3 = stride
%macro PRED8x8L_HORIZONTAL_UP 1
cglobal pred8x8l_horizontal_up_%1, 4,4
    sub          r0, r3
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    test         r1, r1
    ; if has_topleft, start gathering one row higher (r1 = r0);
    ; lea preserves flags, so the cmovnz still sees the test result
    lea          r1, [r0+r3]
    cmovnz       r1, r0
    ; transpose the 8 left-neighbor bytes into mm3 via punpck chains
    punpckhbw   mm0, [r1+r3*0-8]
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r1+r3*0-8]
    mov          r0, r2
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0    ; column shifted right
    PALIGNR     mm1, mm2, 1, mm2    ; column shifted left
    ; lowpass-filter the left column; result in mm7
    movq       mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq       mm4, mm0
    movq       mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq      mm1, 56
    PALIGNR    mm7, mm1, 7, mm3
    lea         r1, [r0+r3*2]
    pshufw     mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq      mm7, 56             ; l7 .. .. .. .. .. .. ..
    movq       mm2, mm0
    psllw      mm0, 8
    psrlw      mm2, 8
    por        mm2, mm0            ; l7 l6 l5 l4 l3 l2 l1 l0
    movq       mm3, mm2
    movq       mm4, mm2
    movq       mm5, mm2
    psrlq      mm2, 8
    psrlq      mm3, 16
    lea         r2, [r1+r3*2]
    por        mm2, mm7            ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw  mm7, mm7
    por        mm3, mm7            ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb      mm4, mm2            ; pairwise averages (even predictions)
    PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6 ; lowpass values (odd predictions)
    ; interleave averages and filtered values into the 8 output rows
    movq       mm5, mm4
    punpcklbw  mm4, mm1            ; p4 p3 p2 p1
    punpckhbw  mm5, mm1            ; p8 p7 p6 p5
    movq       mm6, mm5
    movq       mm7, mm5
    movq       mm0, mm5
    PALIGNR    mm5, mm4, 2, mm1
    pshufw     mm1, mm6, 11111001b
    PALIGNR    mm6, mm4, 4, mm2
    pshufw     mm2, mm7, 11111110b
    PALIGNR    mm7, mm4, 6, mm3
    pshufw     mm3, mm0, 11111111b
    movq [r0+r3*1], mm4
    movq [r0+r3*2], mm5
    lea         r0, [r2+r3*2]
    movq [r1+r3*1], mm6
    movq [r1+r3*2], mm7
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm1
    movq [r0+r3*1], mm2
    movq [r0+r3*2], mm3
    RET
%endmacro
2219
; Instantiate pred8x8l_horizontal_up for each instruction set: the mmxext
; build uses the MMX two-register PALIGNR emulation, the ssse3 build the
; native palignr instruction. Both run in MMX register mode.
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_UP mmxext
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_UP ssse3
2225
2226;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_down(uint8_t *src, int has_topleft, int has_topright, int stride)
2228;-----------------------------------------------------------------------------
2229
; void pred8x8l_horizontal_down(uint8_t *src, int has_topleft,
;                               int has_topright, int stride)
; r0 = src, r1 = has_topleft flag, r2 = has_topright flag, r3 = stride,
; r4 = scratch/saved pointer.  MMXEXT-only variant.
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred8x8l_horizontal_down_mmxext, 4,5
    sub          r0, r3                 ; r0 -> row above the block
    lea          r4, [r0+r3*2]
    ; Gather the eight left-edge samples (src[-1] of every row) into mm3.
    ; Loading at offset -8 and merging with punpckhbw/hwd/hdq keeps only
    ; the highest byte of each load, i.e. the column at x == -1.
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0                 ; r4 = saved pointer to the top row
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1                ; mm3 = the eight left-edge samples
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]               ; top row
    mov          r0, r4                 ; restore r0 -> row above the block
    ; Shifted copies of the left column used as neighbours for the filter.
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1                  ; top-left sample available?
    jnz .do_left
.fix_lt_1:
    ; No top-left: substitute an available neighbour byte into mm1 via an
    ; XOR patch (flips only the affected byte position).
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    ; No top-left: patch the low byte of the shifted top row (mm2).
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2                 ; top-right sample available?
    jnz .do_top
.fix_tr_1:
    ; No top-right: patch the high byte of the shifted top row (mm1).
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.do_left:
    ; 3-tap smoothing of the left edge (PRED4x4_LOWPASS is a helper macro
    ; defined earlier in this file).
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2                ; mm7/mm6 = filtered left column
    movq        mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    ; Load top-left/top/top-right qwords and build shifted neighbours.
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0        ; top row shifted, top-left fed in
    PALIGNR     mm1, mm4, 1, mm4        ; top row shifted, top-right fed in
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5  ; mm4 = filtered top row
    movq       mm5, mm4
    lea         r1, [r0+r3*2]
    psllq      mm7, 56
    ; Combine filtered left and top edges into the diagonal source vectors.
    movq       mm2, mm5
    movq       mm3, mm6
    movq       mm4, mm2
    PALIGNR    mm2, mm6, 7, mm5
    PALIGNR    mm6, mm7, 7, mm0
    lea         r2, [r1+r3*2]
    PALIGNR    mm4, mm3, 1, mm7
    movq       mm5, mm3
    pavgb      mm3, mm6                 ; half-step averages along the edge
    PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
    movq       mm4, mm2
    movq       mm1, mm2
    lea         r4, [r2+r3*2]
    psrlq      mm4, 16
    psrlq      mm1, 8
    PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
    ; Interleave average/lowpass pairs; each output row is the previous
    ; one shifted by one pixel pair (PALIGNR by 2/4/6 bytes).
    movq       mm7, mm3
    punpcklbw  mm3, mm0
    punpckhbw  mm7, mm0
    movq       mm1, mm7
    movq       mm0, mm7
    movq       mm4, mm7
    movq [r4+r3*2], mm3
    PALIGNR    mm7, mm3, 2, mm5
    movq [r4+r3*1], mm7
    PALIGNR    mm1, mm3, 4, mm5
    movq [r2+r3*2], mm1
    PALIGNR    mm0, mm3, 6, mm3
    movq [r2+r3*1], mm0
    movq       mm2, mm6
    movq       mm3, mm6
    movq [r1+r3*2], mm4
    PALIGNR    mm6, mm4, 2, mm5
    movq [r1+r3*1], mm6
    PALIGNR    mm2, mm4, 4, mm5
    movq [r0+r3*2], mm2
    PALIGNR    mm3, mm4, 6, mm4
    movq [r0+r3*1], mm3
    RET
2345
; Shared body for the sse2/ssse3 variants of pred8x8l_horizontal_down.
; %1 = instruction-set suffix.  The edge gathering and filtering run on
; MMX registers; the two filtered 8-byte edges are then packed into XMM
; registers and the final diagonal shifting runs in XMM mode (INIT_XMM
; below).  r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
%macro PRED8x8L_HORIZONTAL_DOWN 1
cglobal pred8x8l_horizontal_down_%1, 4,5
    sub          r0, r3                 ; r0 -> row above the block
    lea          r4, [r0+r3*2]
    ; Gather the eight left-edge samples (src[-1] of every row) into mm3
    ; via punpckhbw/hwd/hdq merges of the high bytes of -8 loads.
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0                 ; r4 = saved pointer to the top row
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1                ; mm3 = the eight left-edge samples
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]               ; top row
    mov          r0, r4                 ; restore r0 -> row above the block
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0        ; shifted copies of the left column
    PALIGNR     mm1, mm2, 1, mm2
    test        r1, r1                  ; top-left sample available?
    jnz .do_left
.fix_lt_1:
    ; No top-left: XOR-patch the affected byte of mm1 with a neighbour.
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    ; No top-left: patch the low byte of the shifted top row (mm2).
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test         r2, r2                 ; top-right sample available?
    jnz .do_top
.fix_tr_1:
    ; No top-right: patch the high byte of the shifted top row (mm1).
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    ; No top-right block at all: replicate the last top sample.
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_left:
    ; Filter the left edge (PRED4x4_LOWPASS is defined earlier in this
    ; file) and pack the result into the high half of xmm0.
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq    xmm0, mm2
    pslldq     xmm0, 8
    movq        mm4, mm0
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    movq2dq    xmm2, mm1
    pslldq     xmm2, 15                 ; isolate one byte ...
    psrldq     xmm2, 8                  ; ... at position 7
    por        xmm0, xmm2               ; xmm0 = packed filtered left edge
    ; Load top-left/top/top-right qwords and build shifted neighbours.
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0        ; top row shifted, top-left fed in
    PALIGNR     mm1, mm4, 1, mm4        ; top row shifted, top-right fed in
    test         r1, r1
    jz .fix_lt_2
    test         r2, r2
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5  ; filtered top row
    movq2dq    xmm1, mm4
    test         r2, r2
    jz .fix_tr_2
    ; Top-right present: filter it the same way.
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    ; xmm1 = filtered top | top-right; xmm0 holds the filtered left edge.
    movq2dq    xmm5, mm1
    pslldq     xmm5, 8
    por        xmm1, xmm5
INIT_XMM
    lea         r2, [r4+r3*2]
    movdqa    xmm2, xmm1
    movdqa    xmm3, xmm1
    ; Three byte-offset views of the 16-byte edge feed the filter.
    PALIGNR   xmm1, xmm0, 7, xmm4
    PALIGNR   xmm2, xmm0, 9, xmm5
    lea         r1, [r2+r3*2]
    PALIGNR   xmm3, xmm0, 8, xmm0
    movdqa    xmm4, xmm1
    pavgb     xmm4, xmm3                ; half-step averages
    lea         r0, [r1+r3*2]
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
    punpcklbw xmm4, xmm0                ; interleave average/lowpass pairs
    movhlps   xmm0, xmm4
    ; Each row is the previous one advanced by one pixel pair (2 bytes).
    movq   [r0+r3*2], xmm4
    movq   [r2+r3*2], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r0+r3*1], xmm4
    movq   [r2+r3*1], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r1+r3*2], xmm4
    movq   [r4+r3*2], xmm0
    psrldq xmm4, 2
    psrldq xmm0, 2
    movq   [r1+r3*1], xmm4
    movq   [r4+r3*1], xmm0
    RET
%endmacro
2469
; Instantiate pred8x8l_horizontal_down: both builds start in MMX register
; mode (the macro switches to XMM internally via INIT_XMM) and differ
; only in the PALIGNR flavour used.
INIT_MMX
%define PALIGNR PALIGNR_MMX
PRED8x8L_HORIZONTAL_DOWN sse2
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_DOWN ssse3
2476
2477;-----------------------------------------------------------------------------
2478; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2479;-----------------------------------------------------------------------------
2480
; 4x4 DC prediction: the rounded average of the 4 top and 4 left
; neighbour samples is replicated over the whole block.
; r0 = src, r2 = stride; r1/r3/r4 are scratch.
cglobal pred4x4_dc_mmxext, 3,5
    pxor   mm7, mm7
    mov     r4, r0                      ; r4 = src (destination row 0)
    sub     r0, r2                      ; r0 -> row above the block
    movd   mm0, [r0]
    psadbw mm0, mm7                     ; mm0 = sum of the 4 top bytes
    movzx  r1d, byte [r0+r2*1-1]        ; left sample, row 0
    movd   r3d, mm0
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]        ; left sample, row 1
    lea     r0, [r0+r2*2]               ; r0 -> src + stride (row 1)
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*1-1]        ; left sample, row 2
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]        ; left sample, row 3
    add    r3d, r1d
    add    r3d, 4                       ; DC = (sum + 4) >> 3
    shr    r3d, 3
    imul   r3d, 0x01010101              ; splat the DC byte across the dword
    mov   [r4+r2*0], r3d                ; row 0
    mov   [r0+r2*0], r3d                ; row 1
    mov   [r0+r2*1], r3d                ; row 2
    mov   [r0+r2*2], r3d                ; row 3
    RET
2505
2506;-----------------------------------------------------------------------------
2507; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2508;-----------------------------------------------------------------------------
2509
; VP8 TrueMotion 4x4 prediction: dst[x,y] = top[x] + left[y] - topleft,
; clamped to 0..255 by packuswb.  %1 = mmx/mmxext suffix; it only selects
; the word-broadcast idiom below.  Processes two rows per loop iteration.
%macro PRED4x4_TM_MMX 1
cglobal pred4x4_tm_vp8_%1, 3,6
    sub        r0, r2                   ; r0 -> row above the block
    pxor      mm7, mm7
    movd      mm0, [r0]
    punpcklbw mm0, mm7                  ; mm0 = top row widened to 4 words
    movzx     r4d, byte [r0-1]          ; r4d = top-left sample
    mov       r5d, 2                    ; 2 iterations x 2 rows = 4 rows
.loop:
    movzx     r1d, byte [r0+r2*1-1]     ; left sample, first row of pair
    movzx     r3d, byte [r0+r2*2-1]     ; left sample, second row of pair
    sub       r1d, r4d                  ; left - topleft
    sub       r3d, r4d
    movd      mm2, r1d
    movd      mm4, r3d
%ifidn %1, mmx
    ; plain MMX has no pshufw: broadcast the low word with two unpacks
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    paddw     mm2, mm0                  ; top + (left - topleft)
    paddw     mm4, mm0
    packuswb  mm2, mm2                  ; clamp to unsigned bytes
    packuswb  mm4, mm4
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm4
    lea        r0, [r0+r2*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro
2545
; Instantiate the TrueMotion predictor for plain MMX (punpck broadcast)
; and MMXEXT (pshufw broadcast).
PRED4x4_TM_MMX mmx
PRED4x4_TM_MMX mmxext
2548
; SSSE3 TrueMotion: pshufb with tm_shuf (pairs {0x03, 0x80}, see rodata)
; broadcasts byte 3 of a dword into the low byte of every word while
; zeroing the high byte, yielding the left/topleft samples already
; widened to words.  Computes all four rows branch-free.
cglobal pred4x4_tm_vp8_ssse3, 3,3
    sub         r0, r2                  ; r0 -> row above the block
    movq       mm6, [tm_shuf]
    pxor       mm1, mm1
    movd       mm0, [r0]
    punpcklbw  mm0, mm1                 ; mm0 = top row widened to 4 words
    movd       mm7, [r0-4]
    pshufb     mm7, mm6                 ; broadcast topleft (byte 3 of load)
    lea         r1, [r0+r2*2]
    movd       mm2, [r0+r2*1-4]
    movd       mm3, [r0+r2*2-4]
    movd       mm4, [r1+r2*1-4]
    movd       mm5, [r1+r2*2-4]
    pshufb     mm2, mm6                 ; broadcast each row's left sample
    pshufb     mm3, mm6
    pshufb     mm4, mm6
    pshufb     mm5, mm6
    psubw      mm2, mm7                 ; left - topleft
    psubw      mm3, mm7
    psubw      mm4, mm7
    psubw      mm5, mm7
    paddw      mm2, mm0                 ; + top
    paddw      mm3, mm0
    paddw      mm4, mm0
    paddw      mm5, mm0
    packuswb   mm2, mm2                 ; clamp to bytes and store rows 0..3
    packuswb   mm3, mm3
    packuswb   mm4, mm4
    packuswb   mm5, mm5
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm3
    movd [r1+r2*1], mm4
    movd [r1+r2*2], mm5
    RET
2583
2584;-----------------------------------------------------------------------------
2585; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2586;-----------------------------------------------------------------------------
2587
; VP8 4x4 vertical prediction: lowpass-filter the top row (using the
; top-left and top-right neighbours) and write it to all four rows.
; r0 = src, r1 = topright, r2 = stride.
INIT_MMX
cglobal pred4x4_vertical_vp8_mmxext, 3,3
    sub       r0, r2                    ; r0 -> row above the block
    movd      m1, [r0-1]                ; lt t0 t1 t2 (left-shifted view)
    movd      m0, [r0]
    mova      m2, m0   ;t0 t1 t2 t3
    punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
    lea       r1, [r0+r2*2]
    psrlq     m0, 8    ;t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4  ; 3-tap smooth of the top row
    movd [r0+r2*1], m3
    movd [r0+r2*2], m3
    movd [r1+r2*1], m3
    movd [r1+r2*2], m3
    RET
2603
2604;-----------------------------------------------------------------------------
2605; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2606;-----------------------------------------------------------------------------
; 4x4 down-left: diagonal prediction from the 8 top/top-right samples.
; r0 = src, r1 = topright, r2 = stride.
INIT_MMX
cglobal pred4x4_down_left_mmxext, 3,3
    sub       r0, r2                    ; r0 -> row above the block
    movq      m1, [r0]
    punpckldq m1, [r1]                  ; m1 = t0..t7 (top + topright)
    movq      m2, m1
    movq      m3, m1
    psllq     m1, 8                     ; m1 = predecessor bytes t[i-1]
    ; Build the successor vector t[i+1] with t7 duplicated in the last
    ; lane via two XORs against the shifted copy.
    pxor      m2, m1
    psrlq     m2, 8
    pxor      m2, m3                    ; m2 = t1..t7, t7
    PRED4x4_LOWPASS m0, m1, m2, m3, m4
    lea       r1, [r0+r2*2]
    ; Row y is the filtered vector shifted down by y+1 bytes.
    psrlq     m0, 8
    movd      [r0+r2*1], m0
    psrlq     m0, 8
    movd      [r0+r2*2], m0
    psrlq     m0, 8
    movd      [r1+r2*1], m0
    psrlq     m0, 8
    movd      [r1+r2*2], m0
    RET
2629
2630;-----------------------------------------------------------------------------
2631; void pred4x4_vertical_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2632;-----------------------------------------------------------------------------
2633
; 4x4 vertical-left: even rows are pairwise averages of adjacent top
; samples, odd rows the 3-tap lowpass; each row pair advances the source
; window by one byte.  r0 = src, r1 = topright, r2 = stride.
INIT_MMX
cglobal pred4x4_vertical_left_mmxext, 3,3
    sub       r0, r2                    ; r0 -> row above the block
    movq      m1, [r0]
    punpckldq m1, [r1]                  ; m1 = t0..t7 (top + topright)
    movq      m3, m1
    movq      m2, m1
    psrlq     m3, 8                     ; t1..t7
    psrlq     m2, 16                    ; t2..t7
    movq      m4, m3
    pavgb     m4, m1                    ; avg(t[i], t[i+1])
    PRED4x4_LOWPASS m0, m1, m2, m3, m5  ; 3-tap lowpass
    lea       r1, [r0+r2*2]
    movh      [r0+r2*1], m4
    movh      [r0+r2*2], m0
    psrlq     m4, 8                     ; advance the window for rows 2/3
    psrlq     m0, 8
    movh      [r1+r2*1], m4
    movh      [r1+r2*2], m0
    RET
2654
2655;-----------------------------------------------------------------------------
2656; void pred4x4_horizontal_up_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2657;-----------------------------------------------------------------------------
2658
; 4x4 horizontal-up: interpolate down the left edge (l0..l3), with the
; bottom sample l3 replicated past the end.  r0 = src, r2 = stride.
INIT_MMX
cglobal pred4x4_horizontal_up_mmxext, 3,3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    ; Gather l0..l3 (src[-1] of each row) via high-byte unpacks.
    movd      m0, [r0+r2*1-4]
    punpcklbw m0, [r0+r2*2-4]
    movd      m1, [r1+r2*1-4]
    punpcklbw m1, [r1+r2*2-4]
    punpckhwd m0, m1
    movq      m1, m0
    punpckhbw m1, m1
    pshufw    m1, m1, 0xFF              ; m1 = l3 replicated in every byte
    punpckhdq m0, m1                    ; m0 = l0 l1 l2 l3 l3 l3 l3 l3
    movq      m2, m0
    movq      m3, m0
    movq      m7, m0
    psrlq     m2, 16
    psrlq     m3, 8
    pavgb     m7, m3                    ; half-step averages
    PRED4x4_LOWPASS m4, m0, m2, m3, m5  ; full-step 3-tap interpolation
    punpcklbw m7, m4                    ; interleave avg/lowpass pairs
    movd    [r0+r2*1], m7
    psrlq    m7, 16                     ; advance one pixel pair per row
    movd    [r0+r2*2], m7
    psrlq    m7, 16
    movd    [r1+r2*1], m7
    movd    [r1+r2*2], m1               ; last row is all l3
    RET
2687
2688;-----------------------------------------------------------------------------
2689; void pred4x4_horizontal_down_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2690;-----------------------------------------------------------------------------
2691
; 4x4 horizontal-down: diagonal prediction built from the left column,
; the top-left corner and the top row packed into one 8-byte edge vector.
; r0 = src, r2 = stride.
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_horizontal_down_mmxext, 3,3
    sub       r0, r2                    ; r0 -> row above the block
    lea       r1, [r0+r2*2]
    movh      m0, [r0-4]      ; lt ..
    punpckldq m0, [r0]        ; t3 t2 t1 t0 lt .. .. ..
    psllq     m0, 8           ; t2 t1 t0 lt .. .. .. ..
    movd      m1, [r1+r2*2-4] ; l3
    punpcklbw m1, [r1+r2*1-4] ; l2 l3
    movd      m2, [r0+r2*2-4] ; l1
    punpcklbw m2, [r0+r2*1-4] ; l0 l1
    punpckhwd m1, m2          ; l0 l1 l2 l3
    punpckhdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
    movq      m0, m1
    movq      m2, m1
    movq      m5, m1
    psrlq     m0, 16          ; .. .. t2 t1 t0 lt l0 l1
    psrlq     m2, 8           ; .. t2 t1 t0 lt l0 l1 l2
    pavgb     m5, m2          ; half-step averages along the edge
    PRED4x4_LOWPASS m3, m1, m0, m2, m4  ; full-step 3-tap interpolation
    punpcklbw m5, m3          ; interleave avg/lowpass pairs
    psrlq     m3, 32
    PALIGNR   m3, m5, 6, m4   ; assemble the top output row
    movh      [r1+r2*2], m5
    psrlq     m5, 16          ; shift one pixel pair per row, bottom-up
    movh      [r1+r2*1], m5
    psrlq     m5, 16
    movh      [r0+r2*2], m5
    movh      [r0+r2*1], m3
    RET
2723
2724;-----------------------------------------------------------------------------
2725; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2726;-----------------------------------------------------------------------------
2727
; 4x4 vertical-right: rows 0/1 come from averages and lowpass of the top
; edge (with the top-left corner); rows 2/3 are the same vectors shifted
; with left-column lowpass bytes fed in.  r0 = src, r2 = stride.
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_vertical_right_mmxext, 3,3
    sub     r0, r2                      ; r0 -> row above the block
    lea     r1, [r0+r2*2]
    movh    m0, [r0]                    ; ........t3t2t1t0
    movq    m5, m0
    PALIGNR m0, [r0-8], 7, m1           ; ......t3t2t1t0lt
    pavgb   m5, m0                      ; avg of adjacent top samples
    PALIGNR m0, [r0+r2*1-8], 7, m1      ; ....t3t2t1t0ltl0
    movq    m1, m0
    PALIGNR m0, [r0+r2*2-8], 7, m2      ; ..t3t2t1t0ltl0l1
    movq    m2, m0
    PALIGNR m0, [r1+r2*1-8], 7, m3      ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4  ; lowpass of the whole edge
    movq    m1, m3
    psrlq   m3, 16
    psllq   m1, 48                      ; keep the left-edge lowpass bytes
    movh    [r0+r2*1], m5
    movh    [r0+r2*2], m3
    PALIGNR m5, m1, 7, m2               ; shift a left byte into row 2
    psllq   m1, 8
    movh    [r1+r2*1], m5
    PALIGNR m3, m1, 7, m1               ; and the next one into row 3
    movh    [r1+r2*2], m3
    RET
2754
2755;-----------------------------------------------------------------------------
2756; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
2757;-----------------------------------------------------------------------------
2758
; 4x4 down-right: diagonal prediction from the left column, the top-left
; corner and the top row, packed into one 8-byte edge vector and
; lowpass-filtered.  r0 = src, r2 = stride.
INIT_MMX
%define PALIGNR PALIGNR_MMX
cglobal pred4x4_down_right_mmxext, 3,3
    sub       r0, r2                    ; r0 -> row above the block
    lea       r1, [r0+r2*2]
    ; Gather left samples (high bytes of the -8 loads) and the corner.
    movq      m1, [r1-8]
    movq      m2, [r0+r2*1-8]
    punpckhbw m2, [r0-8]                ; pair l0 with the top-left corner
    movh      m3, [r0]                  ; top row t0..t3
    punpckhwd m1, m2
    PALIGNR   m3, m1, 5, m1             ; fuse top row onto the left edge
    movq      m1, m3
    PALIGNR   m3, [r1+r2*1-8], 7, m4    ; shift in the next left sample
    movq      m2, m3
    PALIGNR   m3, [r1+r2*2-8], 7, m4    ; shift in the bottom left sample
    PRED4x4_LOWPASS m0, m3, m1, m2, m4
    ; Bottom row uses the lowest 4 bytes; each row above shifts by one.
    movh      [r1+r2*2], m0
    psrlq     m0, 8
    movh      [r1+r2*1], m0
    psrlq     m0, 8
    movh      [r0+r2*2], m0
    psrlq     m0, 8
    movh      [r0+r2*1], m0
    RET
2783