;******************************************************************************
;* VP9 MC SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; FIXME share with vp8dsp.asm
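; pw_256 pairs with pmulhrsw to perform the rounding shift of the filter sums:
; pmulhrsw(x, 256) == (x * 256 + 0x4000) >> 15 == (x + 64) >> 7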
pw_256:   times 8 dw 256

%macro F8_TAPS 8
times 8 db %1, %2
times 8 db %3, %4
times 8 db %5, %6
times 8 db %7, %8
%endmacro
; int8_t ff_filters_ssse3[3][15][4][16]
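; three filter banks (smooth, regular, sharp) with 15 subpel positions each
; (fullpel is handled by the fpel functions at the end of this file); each
; position stores its 8 taps as 4 byte-interleaved pairs, every pair repeated
; 8 times so a single pmaddubsw per pair covers a whole register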
const filters_ssse3 ; smooth
                    F8_TAPS -3, -1,  32,  64,  38,   1, -3,  0
                    F8_TAPS -2, -2,  29,  63,  41,   2, -3,  0
                    F8_TAPS -2, -2,  26,  63,  43,   4, -4,  0
                    F8_TAPS -2, -3,  24,  62,  46,   5, -4,  0
                    F8_TAPS -2, -3,  21,  60,  49,   7, -4,  0
                    F8_TAPS -1, -4,  18,  59,  51,   9, -4,  0
                    F8_TAPS -1, -4,  16,  57,  53,  12, -4, -1
                    F8_TAPS -1, -4,  14,  55,  55,  14, -4, -1
                    F8_TAPS -1, -4,  12,  53,  57,  16, -4, -1
                    F8_TAPS  0, -4,   9,  51,  59,  18, -4, -1
                    F8_TAPS  0, -4,   7,  49,  60,  21, -3, -2
                    F8_TAPS  0, -4,   5,  46,  62,  24, -3, -2
                    F8_TAPS  0, -4,   4,  43,  63,  26, -2, -2
                    F8_TAPS  0, -3,   2,  41,  63,  29, -2, -2
                    F8_TAPS  0, -3,   1,  38,  64,  32, -1, -3
                    ; regular
                    F8_TAPS  0,  1,  -5, 126,   8,  -3,  1,  0
                    F8_TAPS -1,  3, -10, 122,  18,  -6,  2,  0
                    F8_TAPS -1,  4, -13, 118,  27,  -9,  3, -1
                    F8_TAPS -1,  4, -16, 112,  37, -11,  4, -1
                    F8_TAPS -1,  5, -18, 105,  48, -14,  4, -1
                    F8_TAPS -1,  5, -19,  97,  58, -16,  5, -1
                    F8_TAPS -1,  6, -19,  88,  68, -18,  5, -1
                    F8_TAPS -1,  6, -19,  78,  78, -19,  6, -1
                    F8_TAPS -1,  5, -18,  68,  88, -19,  6, -1
                    F8_TAPS -1,  5, -16,  58,  97, -19,  5, -1
                    F8_TAPS -1,  4, -14,  48, 105, -18,  5, -1
                    F8_TAPS -1,  4, -11,  37, 112, -16,  4, -1
                    F8_TAPS -1,  3,  -9,  27, 118, -13,  4, -1
                    F8_TAPS  0,  2,  -6,  18, 122, -10,  3, -1
                    F8_TAPS  0,  1,  -3,   8, 126,  -5,  1,  0
                    ; sharp
                    F8_TAPS -1,  3,  -7, 127,   8,  -3,  1,  0
                    F8_TAPS -2,  5, -13, 125,  17,  -6,  3, -1
                    F8_TAPS -3,  7, -17, 121,  27, -10,  5, -2
                    F8_TAPS -4,  9, -20, 115,  37, -13,  6, -2
                    F8_TAPS -4, 10, -23, 108,  48, -16,  8, -3
                    F8_TAPS -4, 10, -24, 100,  59, -19,  9, -3
                    F8_TAPS -4, 11, -24,  90,  70, -21, 10, -4
                    F8_TAPS -4, 11, -23,  80,  80, -23, 11, -4
                    F8_TAPS -4, 10, -21,  70,  90, -24, 11, -4
                    F8_TAPS -3,  9, -19,  59, 100, -24, 10, -4
                    F8_TAPS -3,  8, -16,  48, 108, -23, 10, -4
                    F8_TAPS -2,  6, -13,  37, 115, -20,  9, -4
                    F8_TAPS -2,  5, -10,  27, 121, -17,  7, -3
                    F8_TAPS -1,  3,  -6,  17, 125, -13,  5, -2
                    F8_TAPS  0,  1,  -3,   8, 127,  -7,  3, -1

SECTION .text

%macro filter_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery
    mova        m6, [pw_256]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+16]
    mova        m9, [filteryq+32]
    mova       m10, [filteryq+48]
%endif
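    ; each iteration filters %%px pixels of one row:
    ; dst[x] = clip8((sum(filter[k] * src[x - 3 + k], k = 0..7) + 64) >> 7)
    ; the 8 shifted loads are interleaved pairwise so pmaddubsw yields the 4
    ; tap-pair partial sums; the last accumulation uses paddsw so any overflow
    ; saturates, and pmulhrsw with pw_256 does the rounding shift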
.loop:
    movh        m0, [srcq-3]
    movh        m1, [srcq-2]
    movh        m2, [srcq-1]
    movh        m3, [srcq+0]
    movh        m4, [srcq+1]
    movh        m5, [srcq+2]
    punpcklbw   m0, m1
    punpcklbw   m2, m3
    movh        m1, [srcq+3]
    movh        m3, [srcq+4]
    add       srcq, sstrideq
    punpcklbw   m4, m5
    punpcklbw   m1, m3
    pmaddubsw   m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw   m2, m8
    pmaddubsw   m4, m9
    pmaddubsw   m1, m10
%else
    pmaddubsw   m2, [filteryq+16]
    pmaddubsw   m4, [filteryq+32]
    pmaddubsw   m1, [filteryq+48]
%endif
    paddw       m0, m2
    paddw       m4, m1
    paddsw      m0, m4
    pmulhrsw    m0, m6
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_MMX ssse3
filter_h_fn put
filter_h_fn avg

INIT_XMM ssse3
filter_h_fn put
filter_h_fn avg
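; the MMX instantiations provide the 4-pixel-wide functions (%%px = mmsize/2),
; the XMM ones the 8-pixel-wide functions; full 16-pixel rows are handled by
; the x2 variants below, which are x86-64 only since they need 14 XMM registers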

%if ARCH_X86_64
%macro filter_hx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, dstride, src, sstride, h, filtery
    mova       m13, [pw_256]
    mova        m8, [filteryq+ 0]
    mova        m9, [filteryq+16]
    mova       m10, [filteryq+32]
    mova       m11, [filteryq+48]
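    ; same horizontal filter as filter_h_fn, but covering a full 16-pixel row
    ; per iteration: SBUTTERFLY splits each interleaved pair into low and high
    ; halves, which are filtered in parallel and repacked with packuswb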
.loop:
    movu        m0, [srcq-3]
    movu        m1, [srcq-2]
    movu        m2, [srcq-1]
    movu        m3, [srcq+0]
    movu        m4, [srcq+1]
    movu        m5, [srcq+2]
    movu        m6, [srcq+3]
    movu        m7, [srcq+4]
    add       srcq, sstrideq
    SBUTTERFLY  bw, 0, 1, 12
    SBUTTERFLY  bw, 2, 3, 12
    SBUTTERFLY  bw, 4, 5, 12
    SBUTTERFLY  bw, 6, 7, 12
    pmaddubsw   m0, m8
    pmaddubsw   m1, m8
    pmaddubsw   m2, m9
    pmaddubsw   m3, m9
    pmaddubsw   m4, m10
    pmaddubsw   m5, m10
    pmaddubsw   m6, m11
    pmaddubsw   m7, m11
    paddw       m0, m2
    paddw       m1, m3
    paddw       m4, m6
    paddw       m5, m7
    paddsw      m0, m4
    paddsw      m1, m5
    pmulhrsw    m0, m13
    pmulhrsw    m1, m13
    packuswb    m0, m1
%ifidn %1, avg
    pavgb       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_hx2_fn put
filter_hx2_fn avg

%endif ; ARCH_X86_64

%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m6, [pw_256]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+16]
    mova        m9, [filteryq+32]
    mova       m10, [filteryq+48]
%endif
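    ; after the setup above, srcq points 3 rows above the output row and
    ; src4q = srcq + 4 * sstride, so all 8 tap rows are reachable from the two
    ; pointers with only sstrideq, sstrideq*2 and sstride3q offsets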
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    movh        m4, [src4q]
    movh        m5, [src4q+sstrideq]
    punpcklbw   m0, m1
    punpcklbw   m2, m3
    movh        m1, [src4q+sstrideq*2]
    movh        m3, [src4q+sstride3q]
    add       srcq, sstrideq
    add      src4q, sstrideq
    punpcklbw   m4, m5
    punpcklbw   m1, m3
    pmaddubsw   m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw   m2, m8
    pmaddubsw   m4, m9
    pmaddubsw   m1, m10
%else
    pmaddubsw   m2, [filteryq+16]
    pmaddubsw   m4, [filteryq+32]
    pmaddubsw   m1, [filteryq+48]
%endif
    paddw       m0, m2
    paddw       m4, m1
    paddsw      m0, m4
    pmulhrsw    m0, m6
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_MMX ssse3
filter_v_fn put
filter_v_fn avg

INIT_XMM ssse3
filter_v_fn put
filter_v_fn avg

%if ARCH_X86_64

%macro filter_vx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
    mova       m13, [pw_256]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m8, [filteryq+ 0]
    mova        m9, [filteryq+16]
    mova       m10, [filteryq+32]
    mova       m11, [filteryq+48]
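    ; 16-pixel-wide version of filter_v_fn, using the same SBUTTERFLY
    ; low/high split as filter_hx2_fn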
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movu        m0, [srcq]
    movu        m1, [srcq+sstrideq]
    movu        m2, [srcq+sstrideq*2]
    movu        m3, [srcq+sstride3q]
    movu        m4, [src4q]
    movu        m5, [src4q+sstrideq]
    movu        m6, [src4q+sstrideq*2]
    movu        m7, [src4q+sstride3q]
    add       srcq, sstrideq
    add      src4q, sstrideq
    SBUTTERFLY  bw, 0, 1, 12
    SBUTTERFLY  bw, 2, 3, 12
    SBUTTERFLY  bw, 4, 5, 12
    SBUTTERFLY  bw, 6, 7, 12
    pmaddubsw   m0, m8
    pmaddubsw   m1, m8
    pmaddubsw   m2, m9
    pmaddubsw   m3, m9
    pmaddubsw   m4, m10
    pmaddubsw   m5, m10
    pmaddubsw   m6, m11
    pmaddubsw   m7, m11
    paddw       m0, m2
    paddw       m1, m3
    paddw       m4, m6
    paddw       m5, m7
    paddsw      m0, m4
    paddsw      m1, m5
    pmulhrsw    m0, m13
    pmulhrsw    m1, m13
    packuswb    m0, m1
%ifidn %1, avg
    pavgb       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_vx2_fn put
filter_vx2_fn avg

%endif ; ARCH_X86_64

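; fpel_fn: fullpel (unfiltered) put/avg block copies.
; %1 = put/avg, %2 = block width, %3-%5 = offsets (in stride and/or mmsize
; units) of the 2nd-4th load/store, %6 = number of rows handled per iteration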
%macro fpel_fn 6
%if %2 == 4
%define %%srcfn movh
%define %%dstfn movh
%else
%define %%srcfn movu
%define %%dstfn mova
%endif

%if %2 <= 16
cglobal vp9_%1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
    lea  sstride3q, [sstrideq*3]
    lea  dstride3q, [dstrideq*3]
%else
cglobal vp9_%1%2, 5, 5, 4, dst, dstride, src, sstride, h
%endif
.loop:
    %%srcfn     m0, [srcq]
    %%srcfn     m1, [srcq+s%3]
    %%srcfn     m2, [srcq+s%4]
    %%srcfn     m3, [srcq+s%5]
    lea       srcq, [srcq+sstrideq*%6]
%ifidn %1, avg
    pavgb       m0, [dstq]
    pavgb       m1, [dstq+d%3]
    pavgb       m2, [dstq+d%4]
    pavgb       m3, [dstq+d%5]
%endif
    %%dstfn [dstq], m0
    %%dstfn [dstq+d%3], m1
    %%dstfn [dstq+d%4], m2
    %%dstfn [dstq+d%5], m3
    lea       dstq, [dstq+dstrideq*%6]
    sub         hd, %6
    jnz .loop
    RET
%endmacro

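; mmsize evaluates to 16 on the XMM instantiation lines below, so the
; token-pasted s%3/d%3 offsets become s16/d16; these defines map them back to
; the literal byte offset 16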
%define d16 16
%define s16 16
INIT_MMX mmx
fpel_fn put, 4,  strideq, strideq*2, stride3q, 4
fpel_fn put, 8,  strideq, strideq*2, stride3q, 4
INIT_MMX mmxext
fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4
fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4
INIT_XMM sse
fpel_fn put, 16, strideq, strideq*2, stride3q, 4
fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2
fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1
INIT_XMM sse2
fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2
fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1
%undef s16
%undef d16