1;*****************************************************************************
2;* x86-optimized functions for pullup filter
3;*
4;* This file is part of FFmpeg.
5;*
6;* FFmpeg is free software; you can redistribute it and/or modify
7;* it under the terms of the GNU General Public License as published by
8;* the Free Software Foundation; either version 2 of the License, or
9;* (at your option) any later version.
10;*
11;* FFmpeg is distributed in the hope that it will be useful,
12;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14;* GNU General Public License for more details.
15;*
16;* You should have received a copy of the GNU General Public License along
17;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
18;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19;******************************************************************************
20
21%include "libavutil/x86/x86util.asm"
22
23SECTION_TEXT
24
;-----------------------------------------------------------------------------
; pullup_filter_diff(first, second, size)
;
; Sums |first[x] - second[x]| over a block of 4 rows x 8 bytes.
;   first, second : row pointers, both advanced by `size` bytes per row
;   size          : row stride in bytes, used at full pointer width
;                   NOTE(review): assumes the C prototype passes a
;                   pointer-sized stride -- confirm against the declaration.
; Returns the sum in eax.
; Scratch: r3 (row counter), r4d, m0-m3, m6, m7.
; NOTE(review): no emms is issued in this function; presumably the caller is
; responsible for clearing the MMX state -- confirm.
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal pullup_filter_diff, 3, 5, 8, first, second, size
    pxor         m7, m7                 ; m7 = 0, used to widen bytes to words
    pxor         m6, m6                 ; m6 = 4 x u16 running sums
    mov         r3d, 4                  ; rows left to process

.row:
    mova         m0, [firstq]
    mova         m1, [secondq]
    mova         m2, m0                 ; keep an untouched copy of `first`
    add      firstq, sizeq
    add     secondq, sizeq

    ; Unsigned saturating subtraction in both directions: for every byte one
    ; of the two results is the absolute difference and the other one is 0.
    psubusb      m0, m1                 ; max(first - second, 0)
    psubusb      m1, m2                 ; max(second - first, 0)

    ; Widen all 16 bytes to words and fold them into the accumulator.
    ; A lane grows by at most 2 * 255 per row, so 16 bits cannot overflow.
    mova         m2, m0
    mova         m3, m1
    punpcklbw    m0, m7
    punpckhbw    m2, m7
    punpcklbw    m1, m7
    punpckhbw    m3, m7
    paddw        m0, m2
    paddw        m1, m3
    paddw        m6, m0
    paddw        m6, m1

    dec         r3d
    jnz .row

    ; Horizontal reduction: 4 words -> 2 dwords -> scalar in eax.
    mova         m0, m6
    punpcklwd    m6, m7
    punpckhwd    m0, m7
    paddd        m0, m6
    movd        eax, m0
    psrlq        m0, 32
    movd        r4d, m0
    add         eax, r4d
    RET
62
;-----------------------------------------------------------------------------
; PULLUP_COMB_ACC unpack_op, centre, neighbour0, neighbour1
;
; Widens one half (selected by unpack_op: punpcklbw or punpckhbw) of the three
; 8-byte rows to words and adds |2*centre - (neighbour0 + neighbour1)| for
; each of the 4 resulting lanes to the accumulator m6.
; Expects m7 == 0. Clobbers m0-m2.
;-----------------------------------------------------------------------------
%macro PULLUP_COMB_ACC 4
    mova         m0, %2
    mova         m1, %3
    mova         m2, %4
    %1           m0, m7
    %1           m1, m7
    %1           m2, m7
    paddw        m0, m0                 ; 2 * centre
    paddw        m1, m2                 ; neighbour0 + neighbour1
    mova         m2, m0
    ; Saturating subtraction both ways; the sum of both is the absolute value.
    psubusw      m0, m1
    psubusw      m1, m2
    paddw        m6, m0
    paddw        m6, m1
%endmacro

;-----------------------------------------------------------------------------
; pullup_filter_comb(first, second, size)
;
; For 4 rows of 8 bytes accumulates, per byte column x:
;   |2*first[x]  - second[x - size] - second[x]      | +
;   |2*second[x] - first[x]         - first[x + size]|
; and returns the total in eax.
;   first, second : row pointers, both advanced by `size` bytes per row
;   size          : row stride in bytes, used at full pointer width
;                   NOTE(review): assumes the C prototype passes a
;                   pointer-sized stride -- confirm against the declaration.
; Reads one row before `second` and one row past the last `first` row.
; Scratch: r3 (row counter), r4d, m0-m2, m5-m7.
; NOTE(review): no emms is issued in this function; presumably the caller is
; responsible for clearing the MMX state -- confirm.
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal pullup_filter_comb, 3, 5, 8, first, second, size
    pxor         m7, m7                 ; m7 = 0, used to widen bytes to words
    pxor         m6, m6                 ; m6 = 4 x u16 running sums
    sub     secondq, sizeq              ; secondq now addresses the row above,
                                        ; [secondq+sizeq] the current row
    mov         r3d, 4                  ; rows left to process

.row:
    ; current `first` row against the `second` rows above and at this row
    PULLUP_COMB_ACC punpcklbw, [firstq], [secondq], [secondq+sizeq]
    PULLUP_COMB_ACC punpckhbw, [firstq], [secondq], [secondq+sizeq]
    ; current `second` row against the `first` rows at this row and below
    PULLUP_COMB_ACC punpcklbw, [secondq+sizeq], [firstq], [firstq+sizeq]
    PULLUP_COMB_ACC punpckhbw, [secondq+sizeq], [firstq], [firstq+sizeq]

    add      firstq, sizeq
    add     secondq, sizeq
    dec         r3d
    jnz .row

    ; Horizontal reduction: 4 words -> 2 dwords -> scalar in eax.
    mova         m5, m6
    punpcklwd    m6, m7
    punpckhwd    m5, m7
    paddd        m5, m6
    movd        eax, m5
    psrlq        m5, 32
    movd        r4d, m5
    add         eax, r4d
    RET
141
;-----------------------------------------------------------------------------
; pullup_filter_var(first, second, size)
;
; Sums |first[x] - first[x + size]| over 3 vertically adjacent row pairs of
; 8 bytes each (rows 0..3 of `first` are read) and returns 4 times that sum
; in eax.
;   first  : pointer to the top row
;   second : declared so the argument list matches the sibling functions,
;            but never read here
;   size   : row stride in bytes, used at full pointer width
;            NOTE(review): assumes the C prototype passes a pointer-sized
;            stride -- confirm against the declaration.
; Scratch: r3 (row counter), r4d, m0-m3, m6, m7.
; NOTE(review): no emms is issued in this function; presumably the caller is
; responsible for clearing the MMX state -- confirm.
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal pullup_filter_var, 3, 5, 8, first, second, size
    pxor         m7, m7                 ; m7 = 0, used to widen bytes to words
    pxor         m6, m6                 ; m6 = 4 x u16 running sums
    mov         r3d, 3                  ; row pairs left to process

.row:
    mova         m0, [firstq]           ; current row
    mova         m1, [firstq+sizeq]     ; row below
    mova         m2, m0                 ; keep an untouched copy of the current row
    add      firstq, sizeq

    ; Unsigned saturating subtraction in both directions: for every byte one
    ; of the two results is the absolute difference and the other one is 0.
    psubusb      m0, m1                 ; max(cur - below, 0)
    psubusb      m1, m2                 ; max(below - cur, 0)

    ; Widen all 16 bytes to words and fold them into the accumulator.
    ; A lane grows by at most 2 * 255 per row pair, so 16 bits cannot overflow.
    mova         m2, m0
    mova         m3, m1
    punpcklbw    m0, m7
    punpckhbw    m2, m7
    punpcklbw    m1, m7
    punpckhbw    m3, m7
    paddw        m0, m2
    paddw        m1, m3
    paddw        m6, m0
    paddw        m6, m1

    dec         r3d
    jnz .row

    ; Horizontal reduction: 4 words -> 2 dwords -> scalar in eax.
    mova         m0, m6
    punpcklwd    m6, m7
    punpckhwd    m0, m7
    paddd        m0, m6
    movd        eax, m0
    psrlq        m0, 32
    movd        r4d, m0
    add         eax, r4d
    shl         eax, 2                  ; result is scaled by 4
    RET
179