;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

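; VP8 subpel filter coefficients. The fourtap tables hold the filters for the
; odd fractional positions 1/3/5/7 (whose two outer taps are zero), the sixtap
; tables those for the even positions 2/4/6. _hw pairs taps as words for
; pmaddwd, _hb pairs them as bytes for pmaddubsw (ordered to match the
; filter_h*_shuf patterns below), and _v stores one tap per row for pmullw in
; the vertical filters.
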
fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

fourtap_filter_hb_m: times 8 db  -6, 123
                     times 8 db  12,  -1
                     times 8 db  -9,  93
                     times 8 db  50,  -6
                     times 8 db  -6,  50
                     times 8 db  93,  -9
                     times 8 db  -1,  12
                     times 8 db 123,  -6

sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7
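
; bilinear weights for fractional position a=1..7: _vw stores the word factor
; a (the callers index both a and 8-a), _vb stores the byte pair (8-a, a) so
; a single pmaddubsw computes (8-a)*x0 + a*x1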

%ifdef PIC
%define fourtap_filter_hw    r11
%define sixtap_filter_hw     r11
%define fourtap_filter_hb    r11
%define sixtap_filter_hb     r11
%define fourtap_filter_v     r11
%define sixtap_filter_v      r11
%define bilinear_filter_vw   r11
%define bilinear_filter_vb   r11
%else
%define fourtap_filter_hw fourtap_filter_hw_m
%define sixtap_filter_hw  sixtap_filter_hw_m
%define fourtap_filter_hb fourtap_filter_hb_m
%define sixtap_filter_hb  sixtap_filter_hb_m
%define fourtap_filter_v  fourtap_filter_v_m
%define sixtap_filter_v   sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%endif

filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11

pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734
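; (20091 = round(2^16 * (sqrt(2)*cos(pi/8) - 1)) and 17734 = 35468/2 with
;  35468 = round(2^16 * sqrt(2)*sin(pi/8)); see VP8_MULTIPLY_SUMSUB below)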

pb_27_63: times 8 db 27, 63
pb_18_63: times 8 db 18, 63
pb_9_63:  times 8 db  9, 63

cextern pb_1
cextern pw_3
cextern pb_3
cextern pw_4
cextern pb_4
cextern pw_9
cextern pw_18
cextern pw_27
cextern pw_63
cextern pw_64
cextern pb_80
cextern pb_F8
cextern pb_FE

SECTION .text

;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
;                                              uint8_t *src, int srcstride,
;                                              int height,   int mx, int my);
;-----------------------------------------------------------------------------
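
; A plain-C sketch (not part of the build) of what one horizontal or vertical
; filter pass below computes; "filter" stands for one coefficient row from
; the tables above and "taps" is 4 or 6:
;
;   for (y = 0; y < height; y++, dst += deststride, src += srcstride)
;       for (x = 0; x < size; x++) {
;           int i, sum = 0;
;           for (i = 0; i < taps; i++)
;               sum += filter[i] * src[x + i - taps / 2 + 1];
;           dst[x] = av_clip_uint8((sum + 64) >> 7);
;       }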

%macro FILTER_SSSE3 3
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
    lea      r5d, [r5*3]
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+r5*8-32]
    mova      m7, [sixtap_filter_hb+r5*8-16]

.nextrow
    movu      m0, [r2-2]
    mova      m1, m0
    mova      m2, m0
%ifidn %1, 4
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [r2+3]
%else
    pshufb    m0, [filter_h6_shuf1]
%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    paddsw    m0, [pw_64]
    psraw     m0, 7
    packuswb  m0, m0
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
    shl      r5d, 4
    mova      m2, [pw_64]
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+r5]

.nextrow
    movu      m0, [r2-1]
    mova      m1, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m2
    paddsw    m0, m1
    psraw     m0, 7
    packuswb  m0, m0
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
    shl      r6d, 4
%ifdef PIC
    lea      r11, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+r6-16]
    mova      m6, [fourtap_filter_hb+r6]
    mova      m7, [pw_64]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3

.nextrow
    movh      m3, [r2+2*r3]                ; read new row
    mova      m4, m0
    mova      m0, m1
    punpcklbw m4, m1
    mova      m1, m2
    punpcklbw m2, m3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    paddsw    m4, m7
    psraw     m4, 7
    packuswb  m4, m4
    movh    [r0], m4

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
    lea      r6d, [r6*3]
%ifdef PIC
    lea      r11, [sixtap_filter_hb_m]
%endif
    lea       r6, [sixtap_filter_hb+r6*8]

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]

.nextrow
    movh      m5, [r2+2*r3]                ; read new row
    mova      m6, m0
    punpcklbw m6, m5
    mova      m0, m1
    punpcklbw m1, m2
    mova      m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [r6-48]
    pmaddubsw m1, [r6-32]
    pmaddubsw m7, [r6-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    paddsw    m6, [pw_64]
    mova      m2, m3
    psraw     m6, 7
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh    [r0], m6

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_SSSE3 4, 0, 0
INIT_XMM
FILTER_SSSE3 8, 8, 7

; 4x4 block, H-only 4-tap filter
cglobal put_vp8_epel4_h4_mmxext, 6, 6
    shl       r5d, 4
%ifdef PIC
    lea       r11, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+r5]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow
    movq      mm1, [r2-1]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm6                     ; byte->word ABCD
    pshufw    mm0, mm2, 9                  ; byte CDEF..
    punpcklbw mm0, mm6                     ; byte->word CDEF
    pshufw    mm3, mm1, 0x94               ; word ABBC
    pshufw    mm1, mm0, 0x94               ; word CDDE
    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
    movq      mm0, mm1                     ; backup for second set of pixels
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm3, mm1                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                     ; byte->word EFGH
    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    paddd     mm0, mm1                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                     ; merge dword->word (4px)
    paddsw    mm3, mm7                     ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                     ; clip and word->bytes
    movd     [r0], mm3                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET

; 4x4 block, H-only 6-tap filter
cglobal put_vp8_epel4_h6_mmxext, 6, 6
    lea       r5d, [r5*3]
%ifdef PIC
    lea       r11, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+r5*8-32]
    movq      mm6, [sixtap_filter_hw+r5*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow
    movq      mm1, [r2-2]                  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm3                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpckhbw mm2, mm3                     ; byte->word EFGH
    punpcklbw mm0, mm3                     ; byte->word CDEF
    pshufw    mm1, mm1, 0x94               ; word ABBC
    pshufw    mm2, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94               ; word CDDE
    movq      mm0, mm3                     ; backup for second set of pixels
    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    paddd     mm1, mm3                     ; add to 1st 2px cache
    movq      mm3, mm2                     ; backup for second set of pixels
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm1, mm2                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [r2+3]                  ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                     ; add to 2nd 2px cache
    pxor      mm3, mm3
    punpcklbw mm2, mm3                     ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9               ; word GHHI
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm0, mm2                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                     ; merge dword->word (4px)
    paddsw    mm1, mm7                     ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                     ; clip and word->bytes
    movd     [r0], mm1                     ; store

    ; go to next line
    add        r0, r1
    add        r2, r3
    dec       r4d                          ; next row
    jg .nextrow
    REP_RET

INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
    shl      r5d, 5
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r5, [fourtap_filter_v+r5-32]
    pxor      m7, m7
    mova      m4, [pw_64]
    mova      m5, [r5+ 0]
    mova      m6, [r5+16]
%ifdef m8
    mova      m8, [r5+32]
    mova      m9, [r5+48]
%endif
.nextrow
    movq      m0, [r2-1]
    movq      m1, [r2-0]
    movq      m2, [r2+1]
    movq      m3, [r2+2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, m5
    pmullw    m1, m6
%ifdef m8
    pmullw    m2, m8
    pmullw    m3, m9
%else
    pmullw    m2, [r5+32]
    pmullw    m3, [r5+48]
%endif
    paddsw    m0, m1
    paddsw    m2, m3
    paddsw    m0, m2
    paddsw    m0, m4
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
    lea      r5d, [r5*3]
    shl      r5d, 4
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r5, [sixtap_filter_v+r5-96]
    pxor      m7, m7
    mova      m6, [pw_64]
%ifdef m8
    mova      m8, [r5+ 0]
    mova      m9, [r5+16]
    mova     m10, [r5+32]
    mova     m11, [r5+48]
    mova     m12, [r5+64]
    mova     m13, [r5+80]
%endif
.nextrow
    movq      m0, [r2-2]
    movq      m1, [r2-1]
    movq      m2, [r2-0]
    movq      m3, [r2+1]
    movq      m4, [r2+2]
    movq      m5, [r2+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7
%ifdef m8
    pmullw    m0, m8
    pmullw    m1, m9
    pmullw    m2, m10
    pmullw    m3, m11
    pmullw    m4, m12
    pmullw    m5, m13
%else
    pmullw    m0, [r5+ 0]
    pmullw    m1, [r5+16]
    pmullw    m2, [r5+32]
    pmullw    m3, [r5+48]
    pmullw    m4, [r5+64]
    pmullw    m5, [r5+80]
%endif
    paddsw    m1, m4
    paddsw    m0, m5
    paddsw    m1, m2
    paddsw    m0, m3
    paddsw    m0, m1
    paddsw    m0, m6
    psraw     m0, 7
    packuswb  m0, m7
    movh    [r0], m0        ; store

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d            ; next row
    jg .nextrow
    REP_RET

%macro FILTER_V 3
; 4- or 8-pixel wide blocks, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
    shl      r6d, 5
%ifdef PIC
    lea      r11, [fourtap_filter_v_m]
%endif
    lea       r6, [fourtap_filter_v+r6-32]
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [r6+48]

    ; read 3 lines
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+  r3]
    movh      m2, [r2+2*r3]
    add       r2, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [r2+2*r3]                ; read new row
    punpcklbw m4, m7
    mova      m3, m4
    pmullw    m0, [r6+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1
    pmullw    m1, [r6+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh    [r0], m4

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; next row
    jg .nextrow
    REP_RET


; 4- or 8-pixel wide blocks, V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
    shl      r6d, 4
    lea       r6, [r6*3]
%ifdef PIC
    lea      r11, [sixtap_filter_v_m]
%endif
    lea       r6, [sixtap_filter_v+r6-96]
    pxor      m7, m7

    ; read 5 lines
    sub       r2, r3
    sub       r2, r3
    movh      m0, [r2]
    movh      m1, [r2+r3]
    movh      m2, [r2+r3*2]
    lea       r2, [r2+r3*2]
    add       r2, r3
    movh      m3, [r2]
    movh      m4, [r2+r3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [r6+16]
    mova      m6, m4
    pmullw    m6, [r6+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [r2+2*r3]                ; read new row
    punpcklbw m5, m7
    pmullw    m0, [r6+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [r6+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [r6+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [r6+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh    [r0], m6

    ; go to next line
    add       r0, r1
    add       r2, r3
    dec      r4d                           ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2,   8, 8

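;-----------------------------------------------------------------------------
; bilinear MC functions, same argument order as the epel functions above:
;
; void put_vp8_bilinear<size>_<h/v>_<opt>(uint8_t *dst, int deststride,
;                                         uint8_t *src, int srcstride,
;                                         int height,   int mx, int my);
;
; each output pixel is the weighted average (8-a)*x0 + a*x1, rounded as
; (sum + 4) >> 3 (implemented below as psraw 2 followed by pavgw with zero)
;-----------------------------------------------------------------------------
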
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
    mov      r5d, 8*16
    shl      r6d, 4
    sub      r5d, r6d
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r5-16]
    mova      m5, [bilinear_filter_vw+r6-16]
.nextrow
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m3, [r2+r3*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
    mov      r6d, 8*16
    shl      r5d, 4
    sub      r6d, r5d
%ifdef PIC
    lea      r11, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m4, [bilinear_filter_vw+r6-16]
    mova      m5, [bilinear_filter_vw+r5-16]
.nextrow
    movh      m0, [r2+r3*0+0]
    movh      m1, [r2+r3*0+1]
    movh      m2, [r2+r3*1+0]
    movh      m3, [r2+r3*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%ifidn %1, mmxext
    packuswb  m0, m0
    packuswb  m2, m2
    movh [r0+r1*0], m0
    movh [r0+r1*1], m2
%else
    packuswb  m0, m2
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR mmxext, 4, 0
INIT_XMM
FILTER_BILINEAR   sse2, 8, 7

%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v_ssse3, 7,7
    shl      r6d, 4
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+r6-16]
.nextrow
    movh      m0, [r2+r3*0]
    movh      m1, [r2+r3*1]
    movh      m2, [r2+r3*2]
    punpcklbw m0, m1
    punpcklbw m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h_ssse3, 7,7
    shl      r5d, 4
%ifdef PIC
    lea      r11, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+r5-16]
.nextrow
    movu      m0, [r2+r3*0]
    movu      m1, [r2+r3*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [r0+r1*0], m0
    movh [r0+r1*1], m1
%else
    packuswb  m0, m1
    movh   [r0+r1*0], m0
    movhps [r0+r1*1], m0
%endif

    lea       r0, [r0+r1*2]
    lea       r2, [r2+r3*2]
    sub      r4d, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX
FILTER_BILINEAR_SSSE3 4
INIT_XMM
FILTER_BILINEAR_SSSE3 8

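;-----------------------------------------------------------------------------
; full-pel copy functions; same prototype as the subpel functions above, but
; only the first five arguments (dst/deststride/src/srcstride/height) are used
;-----------------------------------------------------------------------------
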
cglobal put_vp8_pixels8_mmx, 5,5
.nextrow:
    movq  mm0, [r2+r3*0]
    movq  mm1, [r2+r3*1]
    lea    r2, [r2+r3*2]
    movq [r0+r1*0], mm0
    movq [r0+r1*1], mm1
    lea    r0, [r0+r1*2]
    sub   r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_pixels16_mmx, 5,5
.nextrow:
    movq  mm0, [r2+r3*0+0]
    movq  mm1, [r2+r3*0+8]
    movq  mm2, [r2+r3*1+0]
    movq  mm3, [r2+r3*1+8]
    lea    r2, [r2+r3*2]
    movq [r0+r1*0+0], mm0
    movq [r0+r1*0+8], mm1
    movq [r0+r1*1+0], mm2
    movq [r0+r1*1+8], mm3
    lea    r0, [r0+r1*2]
    sub   r4d, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_pixels16_sse, 5,5,2
.nextrow:
    movups xmm0, [r2+r3*0]
    movups xmm1, [r2+r3*1]
    lea     r2, [r2+r3*2]
    movaps [r0+r1*0], xmm0
    movaps [r0+r1*1], xmm1
    lea     r0, [r0+r1*2]
    sub    r4d, 2
    jg .nextrow
    REP_RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------
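
; In C terms, roughly (a sketch; like the code below, it clears the DC
; coefficient after reading it):
;
;   int x, y, dc = (block[0] + 4) >> 3;
;   block[0] = 0;
;   for (y = 0; y < 4; y++, dst += stride)
;       for (x = 0; x < 4; x++)
;           dst[x] = av_clip_uint8(dst[x] + dc);
;
; the ADD_DC macro below applies a signed DC delta with unsigned saturating
; byte ops: %1 holds max(dc, 0) and %2 holds max(-dc, 0) splatted as bytes,
; so the paddusb/psubusb pair adds dc with proper clipping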

%macro ADD_DC 4
    %4        m2, [r0+%3]
    %4        m3, [r0+r2+%3]
    %4        m4, [r1+%3]
    %4        m5, [r1+r2+%3]
    paddusb   m2, %1
    paddusb   m3, %1
    paddusb   m4, %1
    paddusb   m5, %1
    psubusb   m2, %2
    psubusb   m3, %2
    psubusb   m4, %2
    psubusb   m5, %2
    %4    [r0+%3], m2
    %4 [r0+r2+%3], m3
    %4    [r1+%3], m4
    %4 [r1+r2+%3], m5
%endmacro

INIT_MMX
cglobal vp8_idct_dc_add_mmx, 3, 3
    ; load data
    movd       m0, [r1]

    ; calculate DC
    paddw      m0, [pw_4]
    pxor       m1, m1
    psraw      m0, 3
    movd      [r1], m1
    psubw      m1, m0
    packuswb   m0, m0
    packuswb   m1, m1
    punpcklbw  m0, m0
    punpcklbw  m1, m1
    punpcklwd  m0, m0
    punpcklwd  m1, m1

    ; add DC
    lea        r1, [r0+r2*2]
    ADD_DC     m0, m1, 0, movh
    RET

INIT_XMM
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
    ; load data
    movd       m0, [r1]
    pxor       m1, m1

    ; calculate DC
    paddw      m0, [pw_4]
    movd     [r1], m1
    lea        r1, [r0+r2*2]
    movd       m2, [r0]
    movd       m3, [r0+r2]
    movd       m4, [r1]
    movd       m5, [r1+r2]
    psraw      m0, 3
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0
    punpckldq  m2, m3
    punpckldq  m4, m5
    punpcklbw  m2, m1
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4
    movd      [r0], m2
    pextrd [r0+r2], m2, 1
    pextrd    [r1], m2, 2
    pextrd [r1+r2], m2, 3
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------
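
; (the same DC computation as above, applied to four horizontally adjacent
; luma blocks at once: block[n][0] supplies the DC for columns 4*n..4*n+3)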

INIT_MMX
cglobal vp8_idct_dc_add4y_mmx, 3, 3
    ; load data
    movd      m0, [r1+32*0] ; A
    movd      m1, [r1+32*2] ; C
    punpcklwd m0, [r1+32*1] ; A B
    punpcklwd m1, [r1+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [r1+32*0], m6
    movd [r1+32*1], m6
    movd [r1+32*2], m6
    movd [r1+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    lea       r1, [r0+r2*2]
    ADD_DC    m0, m6, 0, mova
    ADD_DC    m1, m7, 8, mova
    RET

INIT_XMM
cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
    ; load data
    movd      m0, [r1+32*0] ; A
    movd      m1, [r1+32*2] ; C
    punpcklwd m0, [r1+32*1] ; A B
    punpcklwd m1, [r1+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m1, m1

    ; calculate DC
    paddw     m0, [pw_4]
    movd [r1+32*0], m1
    movd [r1+32*1], m1
    movd [r1+32*2], m1
    movd [r1+32*3], m1
    psraw     m0, 3
    psubw     m1, m0
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1

    ; add DC
    lea       r1, [r0+r2*2]
    ADD_DC    m0, m1, 0, mova
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------
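
; (as above, but the four chroma blocks form an 8x8 area: blocks 0/1 cover
; the top 8x4 half, blocks 2/3 the bottom half, hence the extra advance by
; 4*stride between the two ADD_DC calls)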

INIT_MMX
cglobal vp8_idct_dc_add4uv_mmx, 3, 3
    ; load data
    movd      m0, [r1+32*0] ; A
    movd      m1, [r1+32*2] ; C
    punpcklwd m0, [r1+32*1] ; A B
    punpcklwd m1, [r1+32*3] ; C D
    punpckldq m0, m1        ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [r1+32*0], m6
    movd [r1+32*1], m6
    movd [r1+32*2], m6
    movd [r1+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    lea       r1, [r0+r2*2]
    ADD_DC    m0, m6, 0, mova
    lea       r0, [r0+r2*4]
    lea       r1, [r1+r2*4]
    ADD_DC    m1, m7, 0, mova
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------

; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
;           this macro assumes that m6/m7 have words for 20091/17734 loaded;
;           17734 = 35468/2: since 35468 does not fit in a signed 16-bit word,
;           mul_35468 is done as pmulhw by 17734 on a doubled input
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6 ;20091(1)
    pmulhw    %4, m6 ;20091(2)
    paddw     %3, %1
    paddw     %4, %2
    paddw     %1, %1
    paddw     %2, %2
    pmulhw    %1, m7 ;35468(1)
    pmulhw    %2, m7 ;35468(2)
    psubw     %1, %4
    paddw     %2, %3
%endmacro

; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
;           %5/%6 are temporary registers
;           we assume m6/m7 have constant words 20091/17734 loaded in them
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA         w, %3,  %1,  %5     ;t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
    SUMSUB_BA         w, %4,  %3,  %5     ;tmp0, tmp3
    SUMSUB_BA         w, %2,  %1,  %5     ;tmp1, tmp2
    SWAP                 %4,  %1
    SWAP                 %4,  %3
%endmacro

INIT_MMX
%macro VP8_IDCT_ADD 1
cglobal vp8_idct_add_%1, 3, 3
    ; load block data
    movq         m0, [r1+ 0]
    movq         m1, [r1+ 8]
    movq         m2, [r1+16]
    movq         m3, [r1+24]
    movq         m6, [pw_20091]
    movq         m7, [pw_17734]
%ifidn %1, sse
    xorps      xmm0, xmm0
    movaps  [r1+ 0], xmm0
    movaps  [r1+16], xmm0
%else
    pxor         m4, m4
    movq    [r1+ 0], m4
    movq    [r1+ 8], m4
    movq    [r1+16], m4
    movq    [r1+24], m4
%endif

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw        m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    pxor         m4, m4
    lea          r1, [r0+2*r2]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2

    RET
%endmacro

VP8_IDCT_ADD mmx
VP8_IDCT_ADD sse

;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_<opt>(DCTELEM block[4][4][16], DCTELEM dc[16]);
;-----------------------------------------------------------------------------
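
; a 4x4 inverse Walsh-Hadamard transform on dc[] (one butterfly pass, a +3
; rounding bias, a second pass, then >>3), with the 16 results scattered to
; the DC coefficient of each of the 16 luma subblocks; dc[] is cleared in
; the process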

%macro SCATTER_WHT 3
    movd  r1d, m%1
    movd  r2d, m%2
    mov [r0+2*16*(0+%3)], r1w
    mov [r0+2*16*(1+%3)], r2w
    shr   r1d, 16
    shr   r2d, 16
    psrlq m%1, 32
    psrlq m%2, 32
    mov [r0+2*16*(4+%3)], r1w
    mov [r0+2*16*(5+%3)], r2w
    movd  r1d, m%1
    movd  r2d, m%2
    mov [r0+2*16*(8+%3)], r1w
    mov [r0+2*16*(9+%3)], r2w
    shr   r1d, 16
    shr   r2d, 16
    mov [r0+2*16*(12+%3)], r1w
    mov [r0+2*16*(13+%3)], r2w
%endmacro

%macro HADAMARD4_1D 4
    SUMSUB_BADC w, %2, %1, %4, %3
    SUMSUB_BADC w, %4, %2, %3, %1
    SWAP %1, %4, %3
%endmacro

%macro VP8_DC_WHT 1
cglobal vp8_luma_dc_wht_%1, 2,3
    movq          m0, [r1]
    movq          m1, [r1+8]
    movq          m2, [r1+16]
    movq          m3, [r1+24]
%ifidn %1, sse
    xorps      xmm0, xmm0
    movaps  [r1+ 0], xmm0
    movaps  [r1+16], xmm0
%else
    pxor         m4, m4
    movq    [r1+ 0], m4
    movq    [r1+ 8], m4
    movq    [r1+16], m4
    movq    [r1+24], m4
%endif
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3
    SCATTER_WHT   0, 1, 0
    SCATTER_WHT   2, 3, 2
    RET
%endmacro

INIT_MMX
VP8_DC_WHT mmx
VP8_DC_WHT sse

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
;-----------------------------------------------------------------------------
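
; filter math per pixel along the edge, in C-like terms (a sketch; the code
; below does all of this with signed saturating byte arithmetic):
;
;   if (2*abs(p0-q0) + abs(p1-q1)/2 <= flim) {
;       int a  = (p1 - q1) + 3 * (q0 - p0);
;       int f1 = (a + 4) >> 3, f2 = (a + 3) >> 3;
;       q0 = av_clip_uint8(q0 - f1);
;       p0 = av_clip_uint8(p0 + f2);
;   }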

; macro called with 7 mm register indexes as arguments, and 4 regular registers
;
; the first 4 mm registers will carry the transposed pixel data
; the other three are scratchspace (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on out-of-order CPUs)
;
; the first two regular registers are buf+4*stride and buf+5*stride
; the third is -stride, the fourth is +stride
%macro READ_8x4_INTERLEAVED 11
    ; interleave 8 (A-H) rows of 4 pixels each
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%5, [%9+%10*4]   ; B0-3
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%6, [%8+%10]     ; D0-3
    movd          m%3, [%8]         ; E0-3
    movd          m%7, [%9]         ; F0-3
    movd          m%4, [%9+%11]     ; G0-3
    punpcklbw     m%1, m%5          ; A/B interleaved
    movd          m%5, [%9+%11*2]   ; H0-3
    punpcklbw     m%2, m%6          ; C/D interleaved
    punpcklbw     m%3, m%7          ; E/F interleaved
    punpcklbw     m%4, m%5          ; G/H interleaved
%endmacro

; macro called with 7 mm register indexes as arguments, and 5 regular registers
; the first 11 arguments mean the same as for READ_8x4_INTERLEAVED above
; the fifth regular register is scratchspace to reach the bottom 8 rows; it
; will be set to the second regular register + 8*stride at the end
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each
    lea           %12, [r0+8*r2]

    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd          m%1, [%8+%10*4]   ; A0-3
    movd          m%3, [%12+%10*4]  ; I0-3
    movd          m%2, [%8+%10*2]   ; C0-3
    movd          m%4, [%12+%10*2]  ; K0-3
    movd          m%6, [%8+%10]     ; D0-3
    movd          m%5, [%12+%10]    ; L0-3
    movd          m%7, [%12]        ; M0-3
    add           %12, %11
    punpcklbw     m%1, m%3          ; A/I
    movd          m%3, [%8]         ; E0-3
    punpcklbw     m%2, m%4          ; C/K
    punpcklbw     m%6, m%5          ; D/L
    punpcklbw     m%3, m%7          ; E/M
    punpcklbw     m%2, m%6          ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd         m%5, [%9+%10*4]   ; B0-3
    movd         m%4, [%12+%10*4]  ; J0-3
    movd         m%7, [%9]         ; F0-3
    movd         m%6, [%12]        ; N0-3
    punpcklbw    m%5, m%4          ; B/J
    punpcklbw    m%7, m%6          ; F/N
    punpcklbw    m%1, m%5          ; A/B/I/J interleaved
    punpcklbw    m%3, m%7          ; E/F/M/N interleaved
    movd         m%4, [%9+%11]     ; G0-3
    movd         m%6, [%12+%11]    ; O0-3
    movd         m%5, [%9+%11*2]   ; H0-3
    movd         m%7, [%12+%11*2]  ; P0-3
    punpcklbw    m%4, m%6          ; G/O
    punpcklbw    m%5, m%7          ; H/P
    punpcklbw    m%4, m%5          ; G/H/O/P interleaved
%endmacro

; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
%macro WRITE_4x2D 8
    ; write out (2 dwords per register)
    movd    [%5+%7*4], m%1
    movd    [%5+%7*2], m%2
    movd         [%5], m%3
    movd      [%6+%8], m%4
    punpckhdq     m%1, m%1
    punpckhdq     m%2, m%2
    punpckhdq     m%3, m%3
    punpckhdq     m%4, m%4
    movd    [%6+%7*4], m%1
    movd      [%5+%7], m%2
    movd         [%6], m%3
    movd    [%6+%8*2], m%4
%endmacro

; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular register in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (the third one
; points to a different memory region than the first two), allowing for more
; optimal code for the 16-width case
%macro WRITE_4x4D 10
    ; write out (4 dwords per register), start with dwords zero
    movd    [%5+%8*4], m%1
    movd         [%5], m%2
    movd    [%7+%8*4], m%3
    movd         [%7], m%4

    ; store dwords 1
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
    movd    [%6+%8*4], m%1
    movd         [%6], m%2
%if %10 == 16
    movd    [%6+%9*4], m%3
%endif
    movd      [%7+%9], m%4

    ; write dwords 2
    psrldq        m%1, 4
    psrldq        m%2, 4
%if %10 == 8
    movd    [%5+%8*2], m%1
    movd          %5d, m%3
%endif
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 16
    movd    [%5+%8*2], m%1
%endif
    movd      [%6+%9], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
    add            %7, %9

    ; store dwords 3
    psrldq        m%1, 4
    psrldq        m%2, 4
    psrldq        m%3, 4
    psrldq        m%4, 4
%if %10 == 8
    mov     [%7+%8*4], %5d
    movd    [%6+%8*2], m%1
%else
    movd      [%5+%8], m%1
%endif
    movd    [%6+%9*2], m%2
    movd    [%7+%8*2], m%3
    movd    [%7+%9*2], m%4
%endmacro

; write 4 or 8 words in the mmx/xmm registers as 8 lines
; 1 and 2 are the registers to write, this can be the same (for SSE2)
; for pre-SSE4:
; 3 is a general-purpose register that we will clobber
; for SSE4:
; 3 is a pointer to the destination's 5th line
; 4 is a pointer to the destination's 4th line
; 5/6 is -stride and +stride
%macro WRITE_2x4W 6
    movd            %3d, %1
    punpckhdq        %1, %1
    mov       [%4+%5*4], %3w
    shr              %3, 16
    add              %4, %6
    mov       [%4+%5*4], %3w

    movd            %3d, %1
    add              %4, %5
    mov       [%4+%5*2], %3w
    shr              %3, 16
    mov       [%4+%5  ], %3w

    movd            %3d, %2
    punpckhdq        %2, %2
    mov       [%4     ], %3w
    shr              %3, 16
    mov       [%4+%6  ], %3w

    movd            %3d, %2
    add              %4, %6
    mov       [%4+%6  ], %3w
    shr              %3, 16
    mov       [%4+%6*2], %3w
    add              %4, %5
%endmacro

%macro WRITE_8W_SSE2 5
    movd            %2d, %1
    psrldq           %1, 4
    mov       [%3+%4*4], %2w
    shr              %2, 16
    add              %3, %5
    mov       [%3+%4*4], %2w

    movd            %2d, %1
    psrldq           %1, 4
    add              %3, %4
    mov       [%3+%4*2], %2w
    shr              %2, 16
    mov       [%3+%4  ], %2w

    movd            %2d, %1
    psrldq           %1, 4
    mov       [%3     ], %2w
    shr              %2, 16
    mov       [%3+%5  ], %2w

    movd            %2d, %1
    add              %3, %5
    mov       [%3+%5  ], %2w
    shr              %2, 16
    mov       [%3+%5*2], %2w
%endmacro

%macro WRITE_8W_SSE4 5
    pextrw    [%3+%4*4], %1, 0
    pextrw    [%2+%4*4], %1, 1
    pextrw    [%3+%4*2], %1, 2
    pextrw    [%3+%4  ], %1, 3
    pextrw    [%3     ], %1, 4
    pextrw    [%2     ], %1, 5
    pextrw    [%2+%5  ], %1, 6
    pextrw    [%2+%5*2], %1, 7
%endmacro

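; splat the low byte of GPR %2 into every byte of %1; the SSSE3 variant
; broadcasts via pshufb and expects an all-zero register as %3 (the optional
; third argument is ignored by the other variants)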
%macro SPLATB_REG_MMX 2-3
    movd           %1, %2d
    punpcklbw      %1, %1
    punpcklwd      %1, %1
    punpckldq      %1, %1
%endmacro

%macro SPLATB_REG_MMXEXT 2-3
    movd           %1, %2d
    punpcklbw      %1, %1
    pshufw         %1, %1, 0x0
%endmacro

%macro SPLATB_REG_SSE2 2-3
    movd           %1, %2d
    punpcklbw      %1, %1
    pshuflw        %1, %1, 0x0
    punpcklqdq     %1, %1
%endmacro

%macro SPLATB_REG_SSSE3 3
    movd           %1, %2d
    pshufb         %1, %3
%endmacro

%macro SIMPLE_LOOPFILTER 4
cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
%if mmsize == 8 ; mmx/mmxext
    mov            r3, 2
%endif
%ifnidn %1, sse2
%if mmsize == 16
    pxor           m0, m0
%endif
%endif
    SPLATB_REG     m7, r2, m0       ; splat "flim" into register

    ; set up indexes to address 4 rows
    mov            r2, r1
    neg            r1
%ifidn %2, h
    lea            r0, [r0+4*r2-2]
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px
%endif
%ifidn %2, v
    ; read 4 half/full rows of pixels
    mova           m0, [r0+r1*2]    ; p1
    mova           m1, [r0+r1]      ; p0
    mova           m2, [r0]         ; q0
    mova           m3, [r0+r2]      ; q1
%else ; h
    lea            r4, [r0+r2]

%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
%endif
    TRANSPOSE4x4W         0, 1, 2, 3, 4
%endif

    ; simple_limit
    mova           m5, m2           ; m5=backup of q0
    mova           m6, m1           ; m6=backup of p0
    psubusb        m1, m2           ; p0-q0
    psubusb        m2, m6           ; q0-p0
    por            m1, m2           ; FFABS(p0-q0)
    paddusb        m1, m1           ; m1=FFABS(p0-q0)*2

    mova           m4, m3
    mova           m2, m0
    psubusb        m3, m0           ; q1-p1
    psubusb        m0, m4           ; p1-q1
    por            m3, m0           ; FFABS(p1-q1)
    mova           m0, [pb_80]
    pxor           m2, m0
    pxor           m4, m0
    psubsb         m2, m4           ; m2=p1-q1 (signed) backup for below
    pand           m3, [pb_FE]
    psrlq          m3, 1            ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb        m3, m1
    psubusb        m3, m7
    pxor           m1, m1
    pcmpeqb        m3, m1           ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova           m4, m5
    pxor           m5, m0
    pxor           m0, m6
    psubsb         m5, m0           ; q0-p0 (signed)
    paddsb         m2, m5
    paddsb         m2, m5
    paddsb         m2, m5           ; a=(p1-q1) + 3*(q0-p0)
    pand           m2, m3           ; apply filter mask (m3)

    mova           m3, [pb_F8]
    mova           m1, m2
    paddsb         m2, [pb_4]       ; f1<<3=a+4
    paddsb         m1, [pb_3]       ; f2<<3=a+3
    pand           m2, m3
    pand           m1, m3           ; cache f2<<3

    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m2           ; which values are <0?
    psubb          m3, m2           ; -f1<<3
    psrlq          m2, 3            ; +f1
    psrlq          m3, 3            ; -f1
    pand           m3, m0
    pandn          m0, m2
    psubusb        m4, m0
    paddusb        m4, m3           ; q0-f1

    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m1           ; which values are <0?
    psubb          m3, m1           ; -f2<<3
    psrlq          m1, 3            ; +f2
    psrlq          m3, 3            ; -f2
    pand           m3, m0
    pandn          m0, m1
    paddusb        m6, m0
    psubusb        m6, m3           ; p0+f2

    ; store
%ifidn %2, v
    mova         [r0], m4
    mova      [r0+r1], m6
%else ; h
    inc           r0
    SBUTTERFLY    bw, 6, 4, 0

%if mmsize == 16 ; sse2
%ifidn %1, sse4
    inc            r4
%endif
    WRITE_8W       m6, r4, r0, r1, r2
    lea            r4, [r3+r1+1]
%ifidn %1, sse4
    inc            r3
%endif
    WRITE_8W       m4, r3, r4, r1, r2
%else ; mmx/mmxext
    WRITE_2x4W     m6, m4, r4, r0, r1, r2
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %2, v
    add            r0, 8            ; advance 8 cols = pixels
%else ; h
    lea            r0, [r0+r2*8-1]  ; advance 8 rows = lines
%endif
    dec            r3
    jg .next8px
    REP_RET
%else ; sse2
    RET
%endif
%endmacro

INIT_MMX
%define SPLATB_REG SPLATB_REG_MMX
SIMPLE_LOOPFILTER mmx,    v, 4, 0
SIMPLE_LOOPFILTER mmx,    h, 5, 0
%define SPLATB_REG SPLATB_REG_MMXEXT
SIMPLE_LOOPFILTER mmxext, v, 4, 0
SIMPLE_LOOPFILTER mmxext, h, 5, 0
INIT_XMM
%define SPLATB_REG SPLATB_REG_SSE2
%define WRITE_8W   WRITE_8W_SSE2
SIMPLE_LOOPFILTER sse2,   v, 3, 8
SIMPLE_LOOPFILTER sse2,   h, 5, 8
%define SPLATB_REG SPLATB_REG_SSSE3
SIMPLE_LOOPFILTER ssse3,  v, 3, 8
SIMPLE_LOOPFILTER ssse3,  h, 5, 8
%define WRITE_8W   WRITE_8W_SSE4
SIMPLE_LOOPFILTER sse4,   h, 5, 8

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                            int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
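
; the filter mask computed below combines normal_limit (abs(p3-p2),
; abs(p2-p1), abs(p1-p0), abs(q1-q0), abs(q2-q1) and abs(q3-q2) all <= I,
; plus the simple-filter edge test 2*abs(p0-q0) + abs(p1-q1)/2 <= E) with
; the high-edge-variance flag
; hev = abs(p1-p0) > hev_thresh || abs(q1-q0) > hev_thresh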
1635
1636%macro INNER_LOOPFILTER 5
1637%if %4 == 8 ; chroma
1638cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
1639%define dst8_reg    r1
1640%define mstride_reg r2
1641%define E_reg       r3
1642%define I_reg       r4
1643%define hev_thr_reg r5
1644%else ; luma
1645cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
1646%define mstride_reg r1
1647%define E_reg       r2
1648%define I_reg       r3
1649%define hev_thr_reg r4
1650%ifdef m8 ; x86-64, sse2
1651%define dst8_reg    r4
1652%elif mmsize == 16 ; x86-32, sse2
1653%define dst8_reg    r5
1654%else ; x86-32, mmx/mmxext
1655%define cnt_reg     r5
1656%endif
1657%endif
1658%define dst_reg     r0
1659%define stride_reg  E_reg
1660%define dst2_reg    I_reg
1661%ifndef m8
1662%define stack_reg   hev_thr_reg
1663%endif
1664
1665%ifnidn %1, sse2
1666%if mmsize == 16
1667    pxor             m7, m7
1668%endif
1669%endif
1670
1671%ifndef m8 ; mmx/mmxext or sse2 on x86-32
1672    ; splat function arguments
1673    SPLATB_REG       m0, E_reg, m7   ; E
1674    SPLATB_REG       m1, I_reg, m7   ; I
1675    SPLATB_REG       m2, hev_thr_reg, m7 ; hev_thresh
1676
1677    ; align stack
1678    mov       stack_reg, rsp         ; backup stack pointer
1679    and             rsp, ~(mmsize-1) ; align stack
1680%ifidn %2, v
1681    sub             rsp, mmsize * 4  ; stack layout: [0]=E, [1]=I, [2]=hev_thr
1682                                     ;               [3]=hev() result
1683%else ; h
1684    sub             rsp, mmsize * 5  ; extra storage space for transposes
1685%endif
1686
1687%define flim_E   [rsp]
1688%define flim_I   [rsp+mmsize]
1689%define hev_thr  [rsp+mmsize*2]
1690%define mask_res [rsp+mmsize*3]
1691%define p0backup [rsp+mmsize*3]
1692%define q0backup [rsp+mmsize*4]
1693
1694    mova         flim_E, m0
1695    mova         flim_I, m1
1696    mova        hev_thr, m2
1697
1698%else ; sse2 on x86-64
1699
1700%define flim_E   m9
1701%define flim_I   m10
1702%define hev_thr  m11
1703%define mask_res m12
1704%define p0backup m12
1705%define q0backup m8
1706
1707    ; splat function arguments
1708    SPLATB_REG   flim_E, E_reg, m7   ; E
1709    SPLATB_REG   flim_I, I_reg, m7   ; I
1710    SPLATB_REG  hev_thr, hev_thr_reg, m7 ; hev_thresh
1711%endif
1712
1713%if mmsize == 8 && %4 == 16 ; mmx/mmxext
1714    mov         cnt_reg, 2
1715%endif
1716    mov      stride_reg, mstride_reg
1717    neg     mstride_reg
1718%ifidn %2, h
1719    lea         dst_reg, [dst_reg + stride_reg*4-4]
1720%if %4 == 8
1721    lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
1722%endif
1723%endif
1724
1725%if mmsize == 8
1726.next8px
1727%endif
1728    ; read
1729    lea        dst2_reg, [dst_reg + stride_reg]
1730%ifidn %2, v
1731%if %4 == 8 && mmsize == 16
1732%define movrow movh
1733%else
1734%define movrow mova
1735%endif
1736    movrow           m0, [dst_reg +mstride_reg*4] ; p3
1737    movrow           m1, [dst2_reg+mstride_reg*4] ; p2
1738    movrow           m2, [dst_reg +mstride_reg*2] ; p1
1739    movrow           m5, [dst2_reg]               ; q1
1740    movrow           m6, [dst2_reg+ stride_reg]   ; q2
1741    movrow           m7, [dst2_reg+ stride_reg*2] ; q3
1742%if mmsize == 16 && %4 == 8
1743    movhps           m0, [dst8_reg+mstride_reg*4]
1744    movhps           m2, [dst8_reg+mstride_reg*2]
1745    add        dst8_reg, stride_reg
1746    movhps           m1, [dst8_reg+mstride_reg*4]
1747    movhps           m5, [dst8_reg]
1748    movhps           m6, [dst8_reg+ stride_reg]
1749    movhps           m7, [dst8_reg+ stride_reg*2]
1750    add        dst8_reg, mstride_reg
1751%endif
1752%elif mmsize == 8 ; mmx/mmxext (h)
1753    ; read 8 rows of 8px each
1754    movu             m0, [dst_reg +mstride_reg*4]
1755    movu             m1, [dst2_reg+mstride_reg*4]
1756    movu             m2, [dst_reg +mstride_reg*2]
1757    movu             m3, [dst_reg +mstride_reg]
1758    movu             m4, [dst_reg]
1759    movu             m5, [dst2_reg]
1760    movu             m6, [dst2_reg+ stride_reg]
1761
1762    ; 8x8 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
    mova       q0backup, m1
    movu             m7, [dst2_reg+ stride_reg*2]
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
    mova       p0backup, m5          ; store p0
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%else ; sse2 (h)
%if %4 == 16
    lea        dst8_reg, [dst_reg + stride_reg*8]
%endif

    ; read 16 rows of 8px each, interleave
    movh             m0, [dst_reg +mstride_reg*4]
    movh             m1, [dst8_reg+mstride_reg*4]
    movh             m2, [dst_reg +mstride_reg*2]
    movh             m5, [dst8_reg+mstride_reg*2]
    movh             m3, [dst_reg +mstride_reg]
    movh             m6, [dst8_reg+mstride_reg]
    movh             m4, [dst_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m0, m1          ; A/I
    punpcklbw        m2, m5          ; C/K
    punpcklbw        m3, m6          ; D/L
    punpcklbw        m4, m7          ; E/M

    add        dst8_reg, stride_reg
    movh             m1, [dst2_reg+mstride_reg*4]
    movh             m6, [dst8_reg+mstride_reg*4]
    movh             m5, [dst2_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m1, m6          ; B/J
    punpcklbw        m5, m7          ; F/N
    movh             m6, [dst2_reg+ stride_reg]
    movh             m7, [dst8_reg+ stride_reg]
    punpcklbw        m6, m7          ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
%ifdef m8
    SWAP              1, 8
%else
    mova       q0backup, m1
%endif
    movh             m7, [dst2_reg+ stride_reg*2]
    movh             m1, [dst8_reg+ stride_reg*2]
    punpcklbw        m7, m1          ; H/P
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
%ifdef m8
    SWAP              1, 8
    SWAP              2, 8
%else
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
%endif
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
%ifdef m12
    SWAP              5, 12
%else
    mova       p0backup, m5          ; store p0
%endif
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%endif

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
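    ; there is no unsigned per-byte abs-difference instruction, so each
    ; abs() below is built from two saturating subtractions (one of which
    ; clamps to zero) OR'd together; roughly, in C:
    ;
    ;   uint8_t absdiff(uint8_t a, uint8_t b)
    ;   {
    ;       uint8_t d0 = a > b ? a - b : 0;  /* psubusb a, b */
    ;       uint8_t d1 = b > a ? b - a : 0;  /* psubusb b, a */
    ;       return d0 | d1;                  /* por */
    ;   }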
    mova             m4, m1
    SWAP              4, 1
    psubusb          m4, m0          ; p2-p3
    psubusb          m0, m1          ; p3-p2
    por              m0, m4          ; abs(p3-p2)

    mova             m4, m2
    SWAP              4, 2
    psubusb          m4, m1          ; p1-p2
    psubusb          m1, m2          ; p2-p1
    por              m1, m4          ; abs(p2-p1)

    mova             m4, m6
    SWAP              4, 6
    psubusb          m4, m7          ; q2-q3
    psubusb          m7, m6          ; q3-q2
    por              m7, m4          ; abs(q3-q2)

    mova             m4, m5
    SWAP              4, 5
    psubusb          m4, m6          ; q1-q2
    psubusb          m6, m5          ; q2-q1
    por              m6, m4          ; abs(q2-q1)

%ifidn %1, mmx
    mova             m4, flim_I
    pxor             m3, m3
    psubusb          m0, m4
    psubusb          m1, m4
    psubusb          m7, m4
    psubusb          m6, m4
    pcmpeqb          m0, m3          ; abs(p3-p2) <= I
    pcmpeqb          m1, m3          ; abs(p2-p1) <= I
    pcmpeqb          m7, m3          ; abs(q3-q2) <= I
    pcmpeqb          m6, m3          ; abs(q2-q1) <= I
    pand             m0, m1
    pand             m7, m6
    pand             m0, m7
%else ; mmxext/sse2
    pmaxub           m0, m1
    pmaxub           m6, m7
    pmaxub           m0, m6
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
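    ; the unsigned compare x <= t is done as psubusb x, t followed by a
    ; pcmpeqb against zero; as a minimal C sketch, the two results kept
    ; from this block (folded into m0 together with the p3..q2 checks
    ; above) are:
    ;
    ;   int in_limit = abs(p1 - p0) <= I && abs(q1 - q0) <= I;  /* -> m0  */
    ;   int hev      = abs(p1 - p0) > hev_thresh ||
    ;                  abs(q1 - q0) > hev_thresh;          /* -> !mask_res */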
    SWAP              7, 3           ; now m7 is zero
%ifidn %2, v
    movrow           m3, [dst_reg +mstride_reg] ; p0
%if mmsize == 16 && %4 == 8
    movhps           m3, [dst8_reg+mstride_reg]
%endif
%elifdef m12
    SWAP              3, 12
%else
    mova             m3, p0backup
%endif

    mova             m1, m2
    SWAP              1, 2
    mova             m6, m3
    SWAP              3, 6
    psubusb          m1, m3          ; p1-p0
    psubusb          m6, m2          ; p0-p1
    por              m1, m6          ; abs(p1-p0)
%ifidn %1, mmx
    mova             m6, m1
    psubusb          m1, m4
    psubusb          m6, hev_thr
    pcmpeqb          m1, m7          ; abs(p1-p0) <= I
    pcmpeqb          m6, m7          ; abs(p1-p0) <= hev_thresh
    pand             m0, m1
    mova       mask_res, m6
%else ; mmxext/sse2
    pmaxub           m0, m1          ; max_I
    SWAP              1, 4           ; max_hev_thresh
%endif

    SWAP              6, 4           ; now m6 is I
%ifidn %2, v
    movrow           m4, [dst_reg]   ; q0
%if mmsize == 16 && %4 == 8
    movhps           m4, [dst8_reg]
%endif
%elifdef m8
    SWAP              4, 8
%else
    mova             m4, q0backup
%endif
    mova             m1, m4
    SWAP              1, 4
    mova             m7, m5
    SWAP              7, 5
    psubusb          m1, m5          ; q0-q1
    psubusb          m7, m4          ; q1-q0
    por              m1, m7          ; abs(q1-q0)
%ifidn %1, mmx
    mova             m7, m1
    psubusb          m1, m6
    psubusb          m7, hev_thr
    pxor             m6, m6
    pcmpeqb          m1, m6          ; abs(q1-q0) <= I
    pcmpeqb          m7, m6          ; abs(q1-q0) <= hev_thresh
    mova             m6, mask_res
    pand             m0, m1          ; abs([pq][321]-[pq][210]) <= I
    pand             m6, m7
%else ; mmxext/sse2
    pxor             m7, m7
    pmaxub           m0, m1
    pmaxub           m6, m1
    psubusb          m0, flim_I
    psubusb          m6, hev_thr
    pcmpeqb          m0, m7          ; max(abs(..)) <= I
    pcmpeqb          m6, m7          ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP              6, 12
%else
    mova       mask_res, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit
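    ; per-byte, this evaluates the spec's edge strength test, in C:
    ;
    ;   int simple_limit = 2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= E;
    ;
    ; pb_FE clears each byte's low bit before the halving, so the quadword
    ; shift psrlq cannot leak a bit into the byte below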
    mova             m1, m3
    SWAP              1, 3
    mova             m6, m4          ; keep copies of p0/q0 around for later use
    SWAP              6, 4
    psubusb          m1, m4          ; p0-q0
    psubusb          m6, m3          ; q0-p0
    por              m1, m6          ; abs(q0-p0)
    paddusb          m1, m1          ; m1=2*abs(q0-p0)

    mova             m7, m2
    SWAP              7, 2
    mova             m6, m5
    SWAP              6, 5
    psubusb          m7, m5          ; p1-q1
    psubusb          m6, m2          ; q1-p1
    por              m7, m6          ; abs(q1-p1)
    pxor             m6, m6
    pand             m7, [pb_FE]
    psrlq            m7, 1           ; abs(q1-p1)/2
    paddusb          m7, m1          ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb          m7, flim_E
    pcmpeqb          m7, m6          ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand             m0, m7          ; normal_limit result

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
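    ; a minimal C sketch of the arithmetic below, where clamp8() stands for
    ; the signed-byte saturation performed by paddsb/psubsb:
    ;
    ;   int w  = clamp8(3 * (q0 - p0) + (hev ? p1 - q1 : 0)) & filter_mask;
    ;   int f1 = clamp8(w + 4) >> 3;   /* q0 -= f1 */
    ;   int f2 = clamp8(w + 3) >> 3;   /* p0 += f2 */
    ;   int a  = (f1 + 1) >> 1;        /* non-hev only: p1 += a, q1 -= a */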
%ifdef m8 ; x86-64 && sse2
    mova             m8, [pb_80]
%define pb_80_var m8
%else ; x86-32 or mmx/mmxext
%define pb_80_var [pb_80]
%endif
    mova             m1, m4
    mova             m7, m3
    pxor             m1, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m1, m7          ; (signed) q0-p0
    mova             m6, m2
    mova             m7, m5
    pxor             m6, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m6, m7          ; (signed) p1-q1
    mova             m7, mask_res
    pandn            m7, m6
    paddsb           m7, m1
    paddsb           m7, m1
    paddsb           m7, m1          ; 3*(q0-p0)+is4tap?(p1-q1)

    pand             m7, m0
    mova             m1, [pb_F8]
    mova             m6, m7
    paddsb           m7, [pb_3]
    paddsb           m6, [pb_4]
    pand             m7, m1
    pand             m6, m1

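    ; there is no per-byte arithmetic shift, so the signed >>3 is done in
    ; sign/magnitude form: pb_F8 already cleared the low 3 bits of each
    ; byte (keeping the quadword psrlq from leaking bits between bytes),
    ; and negative lanes are recovered from the negated copy selected by
    ; the pcmpgtb sign mask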
    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m1, m7
    psubb            m0, m7
    psrlq            m7, 3           ; +f2
    psrlq            m0, 3           ; -f2
    pand             m0, m1
    pandn            m1, m7
    psubusb          m3, m0
    paddusb          m3, m1          ; p0+f2

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m0, m6
    psubb            m1, m6
    psrlq            m6, 3           ; +f1
    psrlq            m1, 3           ; -f1
    pand             m1, m0
    pandn            m0, m6
    psubusb          m4, m0
    paddusb          m4, m1          ; q0-f1

%ifdef m12
    SWAP              6, 12
%else
    mova             m6, mask_res
%endif
%ifidn %1, mmx
    mova             m7, [pb_1]
%else ; mmxext/sse2
    pxor             m7, m7
%endif
    pand             m0, m6
    pand             m1, m6
%ifidn %1, mmx
    paddusb          m0, m7
    pand             m1, [pb_FE]
    pandn            m7, m0
    psrlq            m1, 1
    psrlq            m7, 1
    SWAP              0, 7
%else ; mmxext/sse2
    psubusb          m1, [pb_1]
    pavgb            m0, m7          ; a
    pavgb            m1, m7          ; -a
%endif
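    ; pavgb against a zero register computes (x + 1) >> 1, i.e. the rounded
    ; halving a = (f1 + 1) >> 1 in one instruction; the negated half is
    ; biased down by 1 first so its rounding matches, and the mmx path
    ; above emulates the same thing with paddusb/pand/psrlq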
    psubusb          m5, m0
    psubusb          m2, m1
    paddusb          m5, m1          ; q1-a
    paddusb          m2, m0          ; p1+a

    ; store
%ifidn %2, v
    movrow [dst_reg +mstride_reg*2], m2
    movrow [dst_reg +mstride_reg  ], m3
    movrow    [dst_reg], m4
    movrow [dst_reg + stride_reg  ], m5
%if mmsize == 16 && %4 == 8
    movhps [dst8_reg+mstride_reg*2], m2
    movhps [dst8_reg+mstride_reg  ], m3
    movhps   [dst8_reg], m4
    movhps [dst8_reg+ stride_reg  ], m5
%endif
%else ; h
    add         dst_reg, 2
    add        dst2_reg, 2

    ; 4x8/16 transpose
    TRANSPOSE4x4B     2, 3, 4, 5, 6

%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D        2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
%else ; sse2 (h)
    lea        dst8_reg, [dst8_reg+mstride_reg+2]
    WRITE_4x4D        2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
%endif
%endif

%if mmsize == 8
%if %4 == 8 ; chroma
%ifidn %2, h
    sub         dst_reg, 2
%endif
    cmp         dst_reg, dst8_reg
    mov         dst_reg, dst8_reg
    jnz .next8px
%else
%ifidn %2, h
    lea         dst_reg, [dst_reg + stride_reg*8-2]
%else ; v
    add         dst_reg, 8
%endif
    dec         cnt_reg
    jg .next8px
%endif
%endif

%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    mov             rsp, stack_reg   ; restore stack pointer
%endif
    RET
%endmacro

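; INNER_LOOPFILTER arguments: %1 = instruction set suffix, %2 = filter
; direction (v/h), %3 = number of GP registers for cglobal, %4 = block
; size (16 = luma, 8 = chroma), %5 = number of XMM registers for cglobal;
; the luma h variant needs one GP register more on x86-32 (no m8), where
; dst8_reg cannot reuse an argument register (see MBEDGE_LOOPFILTER below)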
INIT_MMX
%define SPLATB_REG SPLATB_REG_MMX
INNER_LOOPFILTER mmx,    v, 6, 16, 0
INNER_LOOPFILTER mmx,    h, 6, 16, 0
INNER_LOOPFILTER mmx,    v, 6,  8, 0
INNER_LOOPFILTER mmx,    h, 6,  8, 0

%define SPLATB_REG SPLATB_REG_MMXEXT
INNER_LOOPFILTER mmxext, v, 6, 16, 0
INNER_LOOPFILTER mmxext, h, 6, 16, 0
INNER_LOOPFILTER mmxext, v, 6,  8, 0
INNER_LOOPFILTER mmxext, h, 6,  8, 0

INIT_XMM
%define SPLATB_REG SPLATB_REG_SSE2
INNER_LOOPFILTER sse2,   v, 5, 16, 13
%ifdef m8
INNER_LOOPFILTER sse2,   h, 5, 16, 13
%else
INNER_LOOPFILTER sse2,   h, 6, 16, 13
%endif
INNER_LOOPFILTER sse2,   v, 6,  8, 13
INNER_LOOPFILTER sse2,   h, 6,  8, 13

%define SPLATB_REG SPLATB_REG_SSSE3
INNER_LOOPFILTER ssse3,  v, 5, 16, 13
%ifdef m8
INNER_LOOPFILTER ssse3,  h, 5, 16, 13
%else
INNER_LOOPFILTER ssse3,  h, 6, 16, 13
%endif
INNER_LOOPFILTER ssse3,  v, 6,  8, 13
INNER_LOOPFILTER ssse3,  h, 6,  8, 13

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                            int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------

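; macro arguments, as for INNER_LOOPFILTER above: %1 = instruction set
; suffix, %2 = filter direction (v/h), %3 = number of GP registers for
; cglobal, %4 = block size (16 = luma, 8 = chroma; for chroma, r1 holds
; the second (V) plane pointer), %5 = number of XMM registers for cglobal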
%macro MBEDGE_LOOPFILTER 5
%if %4 == 8 ; chroma
cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
%define dst8_reg    r1
%define mstride_reg r2
%define E_reg       r3
%define I_reg       r4
%define hev_thr_reg r5
%else ; luma
cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%define mstride_reg r1
%define E_reg       r2
%define I_reg       r3
%define hev_thr_reg r4
%ifdef m8 ; x86-64, sse2
%define dst8_reg    r4
%elif mmsize == 16 ; x86-32, sse2
%define dst8_reg    r5
%else ; x86-32, mmx/mmxext
%define cnt_reg     r5
%endif
%endif
%define dst_reg     r0
%define stride_reg  E_reg
%define dst2_reg    I_reg
%ifndef m8
%define stack_reg   hev_thr_reg
%endif

%define ssse3_or_higher 0
%ifnidn %1, sse2
%if mmsize == 16
%define ssse3_or_higher 1
%endif
%endif

%if ssse3_or_higher
    pxor             m7, m7
%endif

%ifndef m8 ; mmx/mmxext or sse2 on x86-32
    ; splat function arguments
    SPLATB_REG       m0, E_reg, m7   ; E
    SPLATB_REG       m1, I_reg, m7   ; I
    SPLATB_REG       m2, hev_thr_reg, m7 ; hev_thresh

    ; align stack
    mov       stack_reg, rsp         ; backup stack pointer
    and             rsp, ~(mmsize-1) ; align stack
%if mmsize == 16
    sub             rsp, mmsize * 7
%else
    sub             rsp, mmsize * 8  ; stack layout: [0]=E, [1]=I, [2]=hev_thr
                                     ;               [3]=hev() result
                                     ;               [4]=filter tmp result
                                     ;               [5]/[6] = p2/q2 backup
                                     ;               [7]=lim_res sign result
%endif

%define flim_E   [rsp]
%define flim_I   [rsp+mmsize]
%define hev_thr  [rsp+mmsize*2]
%define mask_res [rsp+mmsize*3]
%define lim_res  [rsp+mmsize*4]
%define p0backup [rsp+mmsize*3]
%define q0backup [rsp+mmsize*4]
%define p2backup [rsp+mmsize*5]
%define q2backup [rsp+mmsize*6]
%if mmsize == 16
%define lim_sign [rsp]
%else
%define lim_sign [rsp+mmsize*7]
%endif
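; several of these slots deliberately alias: mask_res/p0backup and
; lim_res/q0backup share storage (the p0/q0 backups, used by the h filter
; only, are consumed before the mask and limit results are stored), and
; on sse2 lim_sign reuses the flim_E slot once the E test is done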

    mova         flim_E, m0
    mova         flim_I, m1
    mova        hev_thr, m2

%else ; sse2 on x86-64

%define flim_E   m9
%define flim_I   m10
%define hev_thr  m11
%define mask_res m12
%define lim_res  m8
%define p0backup m12
%define q0backup m8
%define p2backup m13
%define q2backup m14
%define lim_sign m9

    ; splat function arguments
    SPLATB_REG   flim_E, E_reg, m7   ; E
    SPLATB_REG   flim_I, I_reg, m7   ; I
    SPLATB_REG  hev_thr, hev_thr_reg, m7 ; hev_thresh
%endif

%if mmsize == 8 && %4 == 16 ; mmx/mmxext
    mov         cnt_reg, 2
%endif
    mov      stride_reg, mstride_reg
    neg     mstride_reg
%ifidn %2, h
    lea         dst_reg, [dst_reg + stride_reg*4-4]
%if %4 == 8
    lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
%endif
%endif

%if mmsize == 8
.next8px
%endif
    ; read
    lea        dst2_reg, [dst_reg + stride_reg]
%ifidn %2, v
%if %4 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
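    ; movrow loads one row of the edge: a full mova in the 8-row-wide
    ; cases, but movh for sse2 chroma, where the 8px U row fills the low
    ; half of the register and the matching V row is packed into the high
    ; half with movhps, filtering both chroma planes in a single pass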
    movrow           m0, [dst_reg +mstride_reg*4] ; p3
    movrow           m1, [dst2_reg+mstride_reg*4] ; p2
    movrow           m2, [dst_reg +mstride_reg*2] ; p1
    movrow           m5, [dst2_reg]               ; q1
    movrow           m6, [dst2_reg+ stride_reg]   ; q2
    movrow           m7, [dst2_reg+ stride_reg*2] ; q3
%if mmsize == 16 && %4 == 8
    movhps           m0, [dst8_reg+mstride_reg*4]
    movhps           m2, [dst8_reg+mstride_reg*2]
    add        dst8_reg, stride_reg
    movhps           m1, [dst8_reg+mstride_reg*4]
    movhps           m5, [dst8_reg]
    movhps           m6, [dst8_reg+ stride_reg]
    movhps           m7, [dst8_reg+ stride_reg*2]
    add        dst8_reg, mstride_reg
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu             m0, [dst_reg +mstride_reg*4]
    movu             m1, [dst2_reg+mstride_reg*4]
    movu             m2, [dst_reg +mstride_reg*2]
    movu             m3, [dst_reg +mstride_reg]
    movu             m4, [dst_reg]
    movu             m5, [dst2_reg]
    movu             m6, [dst2_reg+ stride_reg]

    ; 8x8 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
    mova       q0backup, m1
    movu             m7, [dst2_reg+ stride_reg*2]
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
    mova       p0backup, m5          ; store p0
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%else ; sse2 (h)
%if %4 == 16
    lea        dst8_reg, [dst_reg + stride_reg*8]
%endif

    ; read 16 rows of 8px each, interleave
    movh             m0, [dst_reg +mstride_reg*4]
    movh             m1, [dst8_reg+mstride_reg*4]
    movh             m2, [dst_reg +mstride_reg*2]
    movh             m5, [dst8_reg+mstride_reg*2]
    movh             m3, [dst_reg +mstride_reg]
    movh             m6, [dst8_reg+mstride_reg]
    movh             m4, [dst_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m0, m1          ; A/I
    punpcklbw        m2, m5          ; C/K
    punpcklbw        m3, m6          ; D/L
    punpcklbw        m4, m7          ; E/M

    add        dst8_reg, stride_reg
    movh             m1, [dst2_reg+mstride_reg*4]
    movh             m6, [dst8_reg+mstride_reg*4]
    movh             m5, [dst2_reg]
    movh             m7, [dst8_reg]
    punpcklbw        m1, m6          ; B/J
    punpcklbw        m5, m7          ; F/N
    movh             m6, [dst2_reg+ stride_reg]
    movh             m7, [dst8_reg+ stride_reg]
    punpcklbw        m6, m7          ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B     0, 1, 2, 3, 7
%ifdef m8
    SWAP              1, 8
%else
    mova       q0backup, m1
%endif
    movh             m7, [dst2_reg+ stride_reg*2]
    movh             m1, [dst8_reg+ stride_reg*2]
    punpcklbw        m7, m1          ; H/P
    TRANSPOSE4x4B     4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
%ifdef m8
    SWAP              1, 8
    SWAP              2, 8
%else
    mova             m1, q0backup
    mova       q0backup, m2          ; store q0
%endif
    SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
%ifdef m12
    SWAP              5, 12
%else
    mova       p0backup, m5          ; store p0
%endif
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%endif

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova             m4, m1
    SWAP              4, 1
    psubusb          m4, m0          ; p2-p3
    psubusb          m0, m1          ; p3-p2
    por              m0, m4          ; abs(p3-p2)

    mova             m4, m2
    SWAP              4, 2
    psubusb          m4, m1          ; p1-p2
    mova       p2backup, m1
    psubusb          m1, m2          ; p2-p1
    por              m1, m4          ; abs(p2-p1)

    mova             m4, m6
    SWAP              4, 6
    psubusb          m4, m7          ; q2-q3
    psubusb          m7, m6          ; q3-q2
    por              m7, m4          ; abs(q3-q2)

    mova             m4, m5
    SWAP              4, 5
    psubusb          m4, m6          ; q1-q2
    mova       q2backup, m6
    psubusb          m6, m5          ; q2-q1
    por              m6, m4          ; abs(q2-q1)

%ifidn %1, mmx
    mova             m4, flim_I
    pxor             m3, m3
    psubusb          m0, m4
    psubusb          m1, m4
    psubusb          m7, m4
    psubusb          m6, m4
    pcmpeqb          m0, m3          ; abs(p3-p2) <= I
    pcmpeqb          m1, m3          ; abs(p2-p1) <= I
    pcmpeqb          m7, m3          ; abs(q3-q2) <= I
    pcmpeqb          m6, m3          ; abs(q2-q1) <= I
    pand             m0, m1
    pand             m7, m6
    pand             m0, m7
%else ; mmxext/sse2
    pmaxub           m0, m1
    pmaxub           m6, m7
    pmaxub           m0, m6
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP              7, 3           ; now m7 is zero
%ifidn %2, v
    movrow           m3, [dst_reg +mstride_reg] ; p0
%if mmsize == 16 && %4 == 8
    movhps           m3, [dst8_reg+mstride_reg]
%endif
%elifdef m12
    SWAP              3, 12
%else
    mova             m3, p0backup
%endif

    mova             m1, m2
    SWAP              1, 2
    mova             m6, m3
    SWAP              3, 6
    psubusb          m1, m3          ; p1-p0
    psubusb          m6, m2          ; p0-p1
    por              m1, m6          ; abs(p1-p0)
%ifidn %1, mmx
    mova             m6, m1
    psubusb          m1, m4
    psubusb          m6, hev_thr
    pcmpeqb          m1, m7          ; abs(p1-p0) <= I
    pcmpeqb          m6, m7          ; abs(p1-p0) <= hev_thresh
    pand             m0, m1
    mova       mask_res, m6
%else ; mmxext/sse2
    pmaxub           m0, m1          ; max_I
    SWAP              1, 4           ; max_hev_thresh
%endif

    SWAP              6, 4           ; now m6 is I
%ifidn %2, v
    movrow           m4, [dst_reg]   ; q0
%if mmsize == 16 && %4 == 8
    movhps           m4, [dst8_reg]
%endif
%elifdef m8
    SWAP              4, 8
%else
    mova             m4, q0backup
%endif
    mova             m1, m4
    SWAP              1, 4
    mova             m7, m5
    SWAP              7, 5
    psubusb          m1, m5          ; q0-q1
    psubusb          m7, m4          ; q1-q0
    por              m1, m7          ; abs(q1-q0)
%ifidn %1, mmx
    mova             m7, m1
    psubusb          m1, m6
    psubusb          m7, hev_thr
    pxor             m6, m6
    pcmpeqb          m1, m6          ; abs(q1-q0) <= I
    pcmpeqb          m7, m6          ; abs(q1-q0) <= hev_thresh
    mova             m6, mask_res
    pand             m0, m1          ; abs([pq][321]-[pq][210]) <= I
    pand             m6, m7
%else ; mmxext/sse2
    pxor             m7, m7
    pmaxub           m0, m1
    pmaxub           m6, m1
    psubusb          m0, flim_I
    psubusb          m6, hev_thr
    pcmpeqb          m0, m7          ; max(abs(..)) <= I
    pcmpeqb          m6, m7          ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP              6, 12
%else
    mova       mask_res, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit
    mova             m1, m3
    SWAP              1, 3
    mova             m6, m4          ; keep copies of p0/q0 around for later use
    SWAP              6, 4
    psubusb          m1, m4          ; p0-q0
    psubusb          m6, m3          ; q0-p0
    por              m1, m6          ; abs(q0-p0)
    paddusb          m1, m1          ; m1=2*abs(q0-p0)

    mova             m7, m2
    SWAP              7, 2
    mova             m6, m5
    SWAP              6, 5
    psubusb          m7, m5          ; p1-q1
    psubusb          m6, m2          ; q1-p1
    por              m7, m6          ; abs(q1-p1)
    pxor             m6, m6
    pand             m7, [pb_FE]
    psrlq            m7, 1           ; abs(q1-p1)/2
    paddusb          m7, m1          ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb          m7, flim_E
    pcmpeqb          m7, m6          ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand             m0, m7          ; normal_limit result

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova             m8, [pb_80]
%define pb_80_var m8
%else ; x86-32 or mmx/mmxext
%define pb_80_var [pb_80]
%endif
    mova             m1, m4
    mova             m7, m3
    pxor             m1, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m1, m7          ; (signed) q0-p0
    mova             m6, m2
    mova             m7, m5
    pxor             m6, pb_80_var
    pxor             m7, pb_80_var
    psubsb           m6, m7          ; (signed) p1-q1
    mova             m7, mask_res
    paddsb           m6, m1
    paddsb           m6, m1
    paddsb           m6, m1
    pand             m6, m0
%ifdef m8
    mova        lim_res, m6          ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
    pand        lim_res, m7
%else
    mova             m0, m6
    pand             m0, m7
    mova        lim_res, m0
%endif
    pandn            m7, m6          ; 3*(q0-p0)+(p1-q1) masked for filter_common

    mova             m1, [pb_F8]
    mova             m6, m7
    paddsb           m7, [pb_3]
    paddsb           m6, [pb_4]
    pand             m7, m1
    pand             m6, m1

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m1, m7
    psubb            m0, m7
    psrlq            m7, 3           ; +f2
    psrlq            m0, 3           ; -f2
    pand             m0, m1
    pandn            m1, m7
    psubusb          m3, m0
    paddusb          m3, m1          ; p0+f2

    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m0, m6
    psubb            m1, m6
    psrlq            m6, 3           ; +f1
    psrlq            m1, 3           ; -f1
    pand             m1, m0
    pandn            m0, m6
    psubusb          m4, m0
    paddusb          m4, m1          ; q0-f1

    ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
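    ; filter_mbedge applies three taps derived from w; in C, with the same
    ; rounding as the code below:
    ;
    ;   int a0 = (27 * w + 63) >> 7;   /* p0 += a0, q0 -= a0 */
    ;   int a1 = (18 * w + 63) >> 7;   /* p1 += a1, q1 -= a1 */
    ;   int a2 = ( 9 * w + 63) >> 7;   /* p2 += a2, q2 -= a2 */
    ;
    ; on ssse3, one pmaddubsw does the multiply and the +63 bias at once:
    ; w is interleaved with the constant 1 (pb_1), and pb_27_63/pb_18_63/
    ; pb_9_63 pair each tap with 63, so every word lane computes
    ; tap*w + 63*1 directly; pre-ssse3 builds the same value with pmullw
    ; by pw_27/pw_18/pw_9 followed by paddw with pw_63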
%if ssse3_or_higher
    mova             m7, [pb_1]
%else
    mova             m7, [pw_63]
%endif
%ifdef m8
    SWAP              1, 8
%else
    mova             m1, lim_res
%endif
    pxor             m0, m0
    mova             m6, m1
    pcmpgtb          m0, m1         ; which are negative
%if ssse3_or_higher
    punpcklbw        m6, m7         ; interleave with "1" for rounding
    punpckhbw        m1, m7
%else
    punpcklbw        m6, m0         ; signed byte->word
    punpckhbw        m1, m0
%endif
    mova       lim_sign, m0
%if ssse3_or_higher
    mova             m7, [pb_27_63]
%ifndef m8
    mova        lim_res, m1
%endif
%ifdef m10
    SWAP              0, 10         ; don't lose lim_sign copy
%endif
    mova             m0, m7
    pmaddubsw        m7, m6
    SWAP              6, 7
    pmaddubsw        m0, m1
    SWAP              1, 0
%ifdef m10
    SWAP              0, 10
%else
    mova             m0, lim_sign
%endif
%else
    mova       mask_res, m6         ; backup for later in filter
    mova        lim_res, m1
    pmullw          m6, [pw_27]
    pmullw          m1, [pw_27]
    paddw           m6, m7
    paddw           m1, m7
%endif
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a0
    pxor            m1, m1
    psubb           m1, m6
    pand            m1, m0          ; -a0
    pandn           m0, m6          ; +a0
%if ssse3_or_higher
    mova            m6, [pb_18_63]  ; pipelining
%endif
    psubusb         m3, m1
    paddusb         m4, m1
    paddusb         m3, m0          ; p0+a0
    psubusb         m4, m0          ; q0-a0

%if ssse3_or_higher
    SWAP             6, 7
%ifdef m10
    SWAP             1, 10
%else
    mova            m1, lim_res
%endif
    mova            m0, m7
    pmaddubsw       m7, m6
    SWAP             6, 7
    pmaddubsw       m0, m1
    SWAP             1, 0
%ifdef m10
    SWAP             0, 10
%endif
    mova            m0, lim_sign
%else
    mova            m6, mask_res
    mova            m1, lim_res
    pmullw          m6, [pw_18]
    pmullw          m1, [pw_18]
    paddw           m6, m7
    paddw           m1, m7
%endif
    mova            m0, lim_sign
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a1
    pxor            m1, m1
    psubb           m1, m6
    pand            m1, m0          ; -a1
    pandn           m0, m6          ; +a1
%if ssse3_or_higher
    mova            m6, [pb_9_63]
%endif
    psubusb         m2, m1
    paddusb         m5, m1
    paddusb         m2, m0          ; p1+a1
    psubusb         m5, m0          ; q1-a1

%if ssse3_or_higher
    SWAP             6, 7
%ifdef m10
    SWAP             1, 10
%else
    mova            m1, lim_res
%endif
    mova            m0, m7
    pmaddubsw       m7, m6
    SWAP             6, 7
    pmaddubsw       m0, m1
    SWAP             1, 0
%else
%ifdef m8
    SWAP             6, 12
    SWAP             1, 8
%else
    mova            m6, mask_res
    mova            m1, lim_res
%endif
    pmullw          m6, [pw_9]
    pmullw          m1, [pw_9]
    paddw           m6, m7
    paddw           m1, m7
%endif
%ifdef m9
    SWAP             7, 9
%else
    mova            m7, lim_sign
%endif
    psraw           m6, 7
    psraw           m1, 7
    packsswb        m6, m1          ; a2
    pxor            m0, m0
    psubb           m0, m6
    pand            m0, m7          ; -a2
    pandn           m7, m6          ; +a2
%ifdef m8
    SWAP             1, 13
    SWAP             6, 14
%else
    mova            m1, p2backup
    mova            m6, q2backup
%endif
    psubusb         m1, m0
    paddusb         m6, m0
    paddusb         m1, m7          ; p2+a2
    psubusb         m6, m7          ; q2-a2

    ; store
%ifidn %2, v
    movrow [dst2_reg+mstride_reg*4], m1
    movrow [dst_reg +mstride_reg*2], m2
    movrow [dst_reg +mstride_reg  ], m3
    movrow    [dst_reg], m4
    movrow   [dst2_reg], m5
    movrow [dst2_reg+ stride_reg  ], m6
%if mmsize == 16 && %4 == 8
    add        dst8_reg, mstride_reg
    movhps [dst8_reg+mstride_reg*2], m1
    movhps [dst8_reg+mstride_reg  ], m2
    movhps   [dst8_reg], m3
    add        dst8_reg, stride_reg
    movhps   [dst8_reg], m4
    movhps [dst8_reg+ stride_reg  ], m5
    movhps [dst8_reg+ stride_reg*2], m6
%endif
%else ; h
    inc         dst_reg
    inc        dst2_reg

    ; 4x8/16 transpose
    TRANSPOSE4x4B     1, 2, 3, 4, 0
    SBUTTERFLY       bw, 5, 6, 0
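    ; the mbedge filter writes six columns per row (p2..q2): the transpose
    ; arranges p2/p1/p0/q0 as 4x4 blocks for WRITE_4x2D/WRITE_4x4D, while
    ; the SBUTTERFLY interleaves the new q1/q2 bytes into 16-bit pairs so
    ; WRITE_2x4W/WRITE_8W can store them as words at column +4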

%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D        1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
    add         dst_reg, 4
    WRITE_2x4W       m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg
%else ; sse2 (h)
    lea        dst8_reg, [dst8_reg+mstride_reg+1]
    WRITE_4x4D        1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
    lea         dst_reg, [dst2_reg+mstride_reg+4]
    lea        dst8_reg, [dst8_reg+mstride_reg+4]
%ifidn %1, sse4
    add        dst2_reg, 4
%endif
    WRITE_8W         m5, dst2_reg, dst_reg,  mstride_reg, stride_reg
%ifidn %1, sse4
    lea        dst2_reg, [dst8_reg+ stride_reg]
%endif
    WRITE_8W         m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
%endif
%endif

%if mmsize == 8
%if %4 == 8 ; chroma
%ifidn %2, h
    sub         dst_reg, 5
%endif
    cmp         dst_reg, dst8_reg
    mov         dst_reg, dst8_reg
    jnz .next8px
%else
%ifidn %2, h
    lea         dst_reg, [dst_reg + stride_reg*8-5]
%else ; v
    add         dst_reg, 8
%endif
    dec         cnt_reg
    jg .next8px
%endif
%endif

%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    mov             rsp, stack_reg   ; restore stack pointer
%endif
    RET
%endmacro

INIT_MMX
%define SPLATB_REG SPLATB_REG_MMX
MBEDGE_LOOPFILTER mmx,    v, 6, 16, 0
MBEDGE_LOOPFILTER mmx,    h, 6, 16, 0
MBEDGE_LOOPFILTER mmx,    v, 6,  8, 0
MBEDGE_LOOPFILTER mmx,    h, 6,  8, 0

%define SPLATB_REG SPLATB_REG_MMXEXT
MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0
MBEDGE_LOOPFILTER mmxext, v, 6,  8, 0
MBEDGE_LOOPFILTER mmxext, h, 6,  8, 0

INIT_XMM
%define SPLATB_REG SPLATB_REG_SSE2
%define WRITE_8W   WRITE_8W_SSE2
MBEDGE_LOOPFILTER sse2,   v, 5, 16, 15
%ifdef m8
MBEDGE_LOOPFILTER sse2,   h, 5, 16, 15
%else
MBEDGE_LOOPFILTER sse2,   h, 6, 16, 15
%endif
MBEDGE_LOOPFILTER sse2,   v, 6,  8, 15
MBEDGE_LOOPFILTER sse2,   h, 6,  8, 15

%define SPLATB_REG SPLATB_REG_SSSE3
MBEDGE_LOOPFILTER ssse3,  v, 5, 16, 15
%ifdef m8
MBEDGE_LOOPFILTER ssse3,  h, 5, 16, 15
%else
MBEDGE_LOOPFILTER ssse3,  h, 6, 16, 15
%endif
MBEDGE_LOOPFILTER ssse3,  v, 6,  8, 15
MBEDGE_LOOPFILTER ssse3,  h, 6,  8, 15

%define WRITE_8W   WRITE_8W_SSE4
%ifdef m8
MBEDGE_LOOPFILTER sse4,   h, 5, 16, 15
%else
MBEDGE_LOOPFILTER sse4,   h, 6, 16, 15
%endif
MBEDGE_LOOPFILTER sse4,   h, 6,  8, 15