1;******************************************************************************
2;* MMX/SSE2-optimized functions for the RV40 decoder
3;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
5;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
26SECTION_RODATA
27
; Every table in this block is a multiple of 16 bytes long, so the alignment
; requested here carries over to each label below.
28align 16
; Eight words of 1 << 10. Loaded as the pmulhrsw multiplier by the SSSE3
; weight functions, where pmulhrsw(x, 1024) evaluates to (x + 16) >> 5.
29pw_1024:   times 8 dw 1 << (16 - 6) ; pw_1024
30
; Signed byte coefficient pairs for the pmaddubsw-based (SSSE3) 6-tap filters.
; Each filter takes two 16-byte rows (32 bytes): the row at +0 is applied to
; taps 0/1 and, with the pixel order swapped, to taps 5/4; the row at +16 is
; applied to taps 2/3. The mx/my argument is added to the table address as a
; byte offset (see LOAD), so the three filters sit at +0, +32 and +64.
31sixtap_filter_hb_m:  times 8 db   1, -5
32                     times 8 db  52, 20
33                     ; multiplied by 2 to have the same shift
34                     times 8 db   2, -10
35                     times 8 db  40,  40
36                     ; back to normal
37                     times 8 db   1, -5
38                     times 8 db  20, 52
39
; Word coefficients for the pmullw-based (MMX/SSE2) 6-tap filters.
; Each filter takes four 16-byte rows (64 bytes), read by FILTER_V/FILTER_H
; as COEFF05 (+0), COEFF14 (+16), COEFF2 (+32) and COEFF3 (+48); the three
; filters therefore sit at byte offsets +0, +64 and +128.
40sixtap_filter_v_m:   times 8 dw   1
41                     times 8 dw  -5
42                     times 8 dw  52
43                     times 8 dw  20
44                     ; multiplied by 2 to have the same shift
45                     times 8 dw   2
46                     times 8 dw -10
47                     times 8 dw  40
48                     times 8 dw  40
49                     ; back to normal
50                     times 8 dw   1
51                     times 8 dw  -5
52                     times 8 dw  20
53                     times 8 dw  52
54
; Table base used by the LOAD macro. In PIC builds each function first
; materialises the table address into picregq (lea picregq, [..._m]) and the
; aliases resolve to that register; otherwise they resolve to the table label
; itself. npicregs is added to the GPR count of every cglobal declaration so
; the extra register is only reserved when it is needed.
; NOTE(review): sixtap_filter_hw is never referenced in the visible source
; and no sixtap_filter_hw_m label is defined here - looks like a leftover;
; confirm before removing.
55%ifdef PIC
56%define sixtap_filter_hw   picregq
57%define sixtap_filter_hb   picregq
58%define sixtap_filter_v    picregq
59%define npicregs 1
60%else
61%define sixtap_filter_hw   sixtap_filter_hw_m
62%define sixtap_filter_hb   sixtap_filter_hb_m
63%define sixtap_filter_v    sixtap_filter_v_m
64%define npicregs 0
65%endif
66
; pshufb masks for the SSSE3 horizontal filter. The source vector is loaded
; from [srcq-2], so byte k of it is pixel src[k-2]. For output pixel i:
;   shuf1 pairs bytes (i,   i+1) -> src[i-2], src[i-1]   (taps 0, 1)
;   shuf2 pairs bytes (i+2, i+3) -> src[i],   src[i+1]   (taps 2, 3)
;   shuf3 pairs bytes (i+5, i+4) -> src[i+3], src[i+2]   (taps 5, 4; swapped
;         so the same coefficient pair as taps 0/1 can be reused)
67filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,  5, 6,  6,  7,  7,  8
68filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7,  7, 8,  8,  9,  9, 10
69filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
70
; Constants defined outside this file. pw_32 is added before the ">> 6" in the
; MMX/SSE2 filters, pw_512 is the pmulhrsw multiplier of the SSSE3 filters and
; pw_16 is added before the ">> 5" in the non-SSSE3 weight functions.
; NOTE(review): presumably vectors of words holding 32, 16 and 512 - confirm
; against their definition.
71cextern  pw_32
72cextern  pw_16
73cextern  pw_512
74
75SECTION .text
76
77;-----------------------------------------------------------------------------
78; subpel MC functions:
79;
80; void ff_[put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
81;                                         uint8_t *src, int srcstride,
82;                                         int len, int m);
83;----------------------------------------------------------------------
; LOAD %1=name of the mx/my argument, %2=coefficient table base
; Turns the byte offset passed in the int argument %1 into a pointer to the
; selected coefficient set: %1q = table base + offset. In PIC builds the
; base is expected to be in picregq already (the caller does the lea) and %2
; is not used.
%macro LOAD  2
%if ARCH_X86_64
    ; A 32-bit int argument only defines the low half of its register (or
    ; stack slot); sign-extend it before using it in 64-bit address
    ; arithmetic. This applies to every 64-bit ABI, not just WIN64.
    movsxd   %1q, %1d
%endif
%ifdef PIC
    add      %1q, picregq
%else
    add      %1q, %2
%endif
%endmacro
94
; STORE %1=register holding the filtered words, %2=scratch register,
;       %3=put|avg
; Narrows the words in %1 to bytes with unsigned saturation and writes the
; low half of the register to [dstq]. For avg the bytes currently at [dstq]
; are fetched into %2 first and byte-averaged with the new pixels.
%macro STORE 3
%ifidn %3, avg
    movh      %2, [dstq]        ; existing destination pixels
    packuswb  %1, %1
    PAVGB     %1, %2
%else
    packuswb  %1, %1
%endif
    movh  [dstq], %1
%endmacro
105
; FILTER_V %1=put|avg
; Emits %1_rv40_qpel_v: vertical 6-tap filter using word multiplies.
; my is a byte offset into sixtap_filter_v_m selecting four coefficient
; vectors (COEFF05, COEFF14, COEFF2, COEFF3). With r0..r5 the source rows at
; src-2*stride .. src+3*stride, each output row is
;   (COEFF05*(r0+r5) + COEFF14*(r1+r4) + COEFF2*r2 + COEFF3*r3 + pw_32) >> 6
; saturated to bytes and stored (put) or averaged with dst (avg) by STORE.
; m0-m4 hold the five most recent rows as words and are rotated every
; iteration so only one new row is loaded per output row.
106%macro FILTER_V 1
107cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
108%ifdef PIC
109    lea  picregq, [sixtap_filter_v_m]
110%endif
111    pxor      m7, m7                       ; m7 = 0, used to widen bytes to words
112    LOAD      my, sixtap_filter_v          ; myq = address of the selected coefficients
113
114    ; read 5 lines
115    sub     srcq, srcstrideq
116    sub     srcq, srcstrideq               ; srcq = src - 2*stride
117    movh      m0, [srcq]                   ; row -2
118    movh      m1, [srcq+srcstrideq]        ; row -1
119    movh      m2, [srcq+srcstrideq*2]      ; row  0
120    lea     srcq, [srcq+srcstrideq*2]
121    add     srcq, srcstrideq               ; srcq = src + stride
122    movh      m3, [srcq]                   ; row +1
123    movh      m4, [srcq+srcstrideq]        ; row +2
124    punpcklbw m0, m7
125    punpcklbw m1, m7
126    punpcklbw m2, m7
127    punpcklbw m3, m7
128    punpcklbw m4, m7
129
; Keep the coefficients in m8-m11 when those registers exist; otherwise use
; them straight from memory.
130%ifdef m8
131    mova      m8, [myq+ 0]
132    mova      m9, [myq+16]
133    mova     m10, [myq+32]
134    mova     m11, [myq+48]
135%define COEFF05  m8
136%define COEFF14  m9
137%define COEFF2   m10
138%define COEFF3   m11
139%else
140%define COEFF05  [myq+ 0]
141%define COEFF14  [myq+16]
142%define COEFF2   [myq+32]
143%define COEFF3   [myq+48]
144%endif
145.nextrow:
146    mova      m6, m1
147    movh      m5, [srcq+2*srcstrideq]      ; read new row (r5)
148    paddw     m6, m4                       ; r1 + r4
149    punpcklbw m5, m7
150    pmullw    m6, COEFF14
151    paddw     m0, m5                       ; r0 + r5
152    pmullw    m0, COEFF05
153    paddw     m6, m0
154    mova      m0, m1                       ; rotate: next r0 = r1
155    paddw     m6, [pw_32]                  ; bias added before the >> 6
156    mova      m1, m2                       ; rotate: next r1 = r2
157    pmullw    m2, COEFF2
158    paddw     m6, m2
159    mova      m2, m3                       ; rotate: next r2 = r3
160    pmullw    m3, COEFF3
161    paddw     m6, m3
162
163    ; round/clip/store
164    mova      m3, m4                       ; rotate: next r3 = r4
165    psraw     m6, 6
166    mova      m4, m5                       ; rotate: next r4 = r5
167    STORE     m6, m5, %1                   ; m5 is free again, used as scratch
168
169    ; go to next line
170    add     dstq, dststrideq
171    add     srcq, srcstrideq
172    dec  heightd                           ; next row
173    jg .nextrow                            ; loop while rows remain
174    REP_RET
175%endmacro
176
; FILTER_H %1=put|avg
; Emits %1_rv40_qpel_h: horizontal 6-tap filter using word multiplies.
; mx is a byte offset into sixtap_filter_v_m (the word table is shared with
; the vertical filter). With p[k] the pixels loaded from src+k, each output
; row is
;   (COEFF05*(p[-2]+p[3]) + COEFF14*(p[-1]+p[2]) + COEFF2*p[0] + COEFF3*p[1]
;    + pw_32) >> 6
; saturated to bytes and stored (put) or averaged with dst (avg) by STORE.
177%macro FILTER_H  1
178cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
179%ifdef PIC
180    lea  picregq, [sixtap_filter_v_m]
181%endif
182    pxor      m7, m7                ; m7 = 0, used to widen bytes to words
183    LOAD      mx, sixtap_filter_v   ; mxq = address of the selected coefficients
184    mova      m6, [pw_32]           ; bias added before the >> 6, kept in a register
; Keep the coefficients in m8-m11 when those registers exist; otherwise use
; them straight from memory.
185%ifdef m8
186    mova      m8, [mxq+ 0]
187    mova      m9, [mxq+16]
188    mova     m10, [mxq+32]
189    mova     m11, [mxq+48]
190%define COEFF05  m8
191%define COEFF14  m9
192%define COEFF2   m10
193%define COEFF3   m11
194%else
195%define COEFF05  [mxq+ 0]
196%define COEFF14  [mxq+16]
197%define COEFF2   [mxq+32]
198%define COEFF3   [mxq+48]
199%endif
200.nextrow:
201    movq      m0, [srcq-2]          ; six overlapping 8-byte windows, one per tap
202    movq      m5, [srcq+3]
203    movq      m1, [srcq-1]
204    movq      m4, [srcq+2]
205    punpcklbw m0, m7
206    punpcklbw m5, m7
207    punpcklbw m1, m7
208    punpcklbw m4, m7
209    movq      m2, [srcq-0]
210    movq      m3, [srcq+1]
211    paddw     m0, m5                ; p[-2] + p[3]
212    paddw     m1, m4                ; p[-1] + p[2]
213    punpcklbw m2, m7
214    punpcklbw m3, m7
215    pmullw    m0, COEFF05
216    pmullw    m1, COEFF14
217    pmullw    m2, COEFF2
218    pmullw    m3, COEFF3
219    paddw     m0, m6
220    paddw     m1, m2
221    paddw     m0, m3
222    paddw     m0, m1                ; m0 = full 6-tap sum + bias
223    psraw     m0, 6
224    STORE     m0, m1, %1            ; m1 is free here, used as scratch
225
226    ; go to next line
227    add     dstq, dststrideq
228    add     srcq, srcstrideq
229    dec  heightd            ; next row
230    jg .nextrow             ; loop while rows remain
231    REP_RET
232%endmacro
233
; Instantiate the word-multiply filters. The MMX-register versions are only
; built for 32-bit x86: put for plain mmx, avg for both mmxext and 3dnow.
; NOTE(review): the two avg builds presumably differ only in the byte-average
; instruction that PAVGB expands to - confirm against its definition.
234%if ARCH_X86_32
235INIT_MMX  mmx
236FILTER_V  put
237FILTER_H  put
238
239INIT_MMX  mmxext
240FILTER_V  avg
241FILTER_H  avg
242
243INIT_MMX  3dnow
244FILTER_V  avg
245FILTER_H  avg
246%endif
247
; The SSE2 versions are built on every architecture.
248INIT_XMM  sse2
249FILTER_H  put
250FILTER_H  avg
251FILTER_V  put
252FILTER_V  avg
253
; FILTER_SSSE3 %1=put|avg
; Emits %1_rv40_qpel_v and %1_rv40_qpel_h built on pmaddubsw: two source
; bytes are interleaved and multiplied by a pair of signed byte coefficients
; from sixtap_filter_hb_m, so three multiply-adds cover all six taps.
; my/mx is a byte offset into that table; the row at +0 is applied to taps
; 0/1 and (with the pixels swapped) 5/4, the row at +16 to taps 2/3.
; The sum is scaled with pmulhrsw by the external pw_512 constant.
; NOTE(review): if pw_512 holds words of 512 this is (x + 32) >> 6 - confirm.
254%macro FILTER_SSSE3 1
255cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
256%ifdef PIC
257    lea  picregq, [sixtap_filter_hb_m]
258%endif
259
260    ; read 5 lines
261    sub     srcq, srcstrideq
262    LOAD      my, sixtap_filter_hb          ; myq = address of the selected coefficients
263    sub     srcq, srcstrideq                ; srcq = src - 2*stride
264    movh      m0, [srcq]                    ; row -2
265    movh      m1, [srcq+srcstrideq]         ; row -1
266    movh      m2, [srcq+srcstrideq*2]       ; row  0
267    lea     srcq, [srcq+srcstrideq*2]
268    add     srcq, srcstrideq                ; srcq = src + stride
269    mova      m5, [myq]                     ; coefficient pair for taps 0/1 and 5/4
270    movh      m3, [srcq]                    ; row +1
271    movh      m4, [srcq+srcstrideq]         ; row +2
272    lea     srcq, [srcq+2*srcstrideq]       ; srcq = src + 3*stride, the next row to read
273
274.nextrow:
275    mova      m6, m2
276    punpcklbw m0, m1                        ; interleave rows 0/1 of the window
277    punpcklbw m6, m3                        ; interleave rows 2/3 of the window
278    pmaddubsw m0, m5
279    pmaddubsw m6, [myq+16]
280    movh      m7, [srcq]      ; read new row
281    paddw     m6, m0
282    mova      m0, m1                        ; rotate the 5-row window down by one
283    mova      m1, m2
284    mova      m2, m3
285    mova      m3, m4
286    mova      m4, m7
287    punpcklbw m7, m3                        ; interleave rows 5/4 (m3 now holds row 4)
288    pmaddubsw m7, m5
289    paddw     m6, m7                        ; m6 = full 6-tap sum
290    pmulhrsw  m6, [pw_512]
291    STORE     m6, m7, %1                    ; m7 is free here, used as scratch
292
293    ; go to next line
294    add     dstq, dststrideq
295    add     srcq, srcstrideq
296    dec       heightd                          ; next row
297    jg       .nextrow                          ; loop while rows remain
298    REP_RET
299
; Horizontal variant: one unaligned load starting at src-2 provides every
; pixel needed; the three pshufb masks build the byte pairs for taps 0/1,
; 2/3 and 5/4.
300cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
301%ifdef PIC
302    lea  picregq, [sixtap_filter_hb_m]
303%endif
304    mova      m3, [filter_h6_shuf2]
305    mova      m4, [filter_h6_shuf3]
306    LOAD      mx, sixtap_filter_hb
307    mova      m5, [mxq] ; set up 6tap filter in bytes
308    mova      m6, [mxq+16]
309    mova      m7, [filter_h6_shuf1]
310
311.nextrow:
312    movu      m0, [srcq-2]
313    mova      m1, m0
314    mova      m2, m0
315    pshufb    m0, m7          ; pairs for taps 0/1
316    pshufb    m1, m3          ; pairs for taps 2/3
317    pshufb    m2, m4          ; pairs for taps 5/4
318    pmaddubsw m0, m5
319    pmaddubsw m1, m6
320    pmaddubsw m2, m5
321    paddw     m0, m1
322    paddw     m0, m2          ; m0 = full 6-tap sum
323    pmulhrsw  m0, [pw_512]
324    STORE     m0, m1, %1      ; m1 is free here, used as scratch
325
326    ; go to next line
327    add     dstq, dststrideq
328    add     srcq, srcstrideq
329    dec  heightd            ; next row
330    jg .nextrow             ; loop while rows remain
331    REP_RET
332%endmacro
333
; Build the put and avg flavours of the pmaddubsw-based filters.
334INIT_XMM ssse3
335FILTER_SSSE3  put
336FILTER_SSSE3  avg
337
338; %1=5bits weights? (0 selects the 14bits path), %2=dst %3=src1 %4=src2 %5=stride if sse2
; Weights one group of pixels from src1 and src2 and stores it to dst.
; Expected on entry (set up by RV40_WEIGHT): m0 = 0, m1 = rounding constant,
; m2/m3 = weights, r6 = current byte offset, r5 = stride.
; With 4 arguments the two halves of one row are processed (offsets r6 and
; r6 + mmsize/2) and stored with a single mova. With 5 arguments two
; consecutive rows are processed; r6 is advanced by one stride inside the
; macro and is left pointing at the second row.
339%macro RV40_WCORE  4-5
340    movh       m4, [%3 + r6 + 0]
341    movh       m5, [%4 + r6 + 0]
342%if %0 == 4
343%define OFFSET r6 + mmsize / 2
344%else
345    ; 8x8 block and sse2, stride was provided
346%define OFFSET r6
347    add        r6, r5
348%endif
349    movh       m6, [%3 + OFFSET]
350    movh       m7, [%4 + OFFSET]
351
352%if %1 == 0
353    ; 14bits weights
354    punpcklbw  m4, m0
355    punpcklbw  m5, m0
356    punpcklbw  m6, m0
357    punpcklbw  m7, m0
358
359    psllw      m4, 7                ; << 7 followed by pmulhw (>> 16) nets (pixel * weight) >> 9
360    psllw      m5, 7
361    psllw      m6, 7
362    psllw      m7, 7
363    pmulhw     m4, m3               ; src1 pixels are scaled by m3
364    pmulhw     m5, m2               ; src2 pixels are scaled by m2
365    pmulhw     m6, m3
366    pmulhw     m7, m2
367
368    paddw      m4, m5
369    paddw      m6, m7
370%else
371    ; 5bits weights
372%if cpuflag(ssse3)
373    punpcklbw  m4, m5               ; (src1, src2) byte pairs, matched by the weight byte pairs in m3
374    punpcklbw  m6, m7
375
376    pmaddubsw  m4, m3
377    pmaddubsw  m6, m3
378%else
379    punpcklbw  m4, m0
380    punpcklbw  m5, m0
381    punpcklbw  m6, m0
382    punpcklbw  m7, m0
383
384    pmullw     m4, m3
385    pmullw     m5, m2
386    pmullw     m6, m3
387    pmullw     m7, m2
388    paddw      m4, m5
389    paddw      m6, m7
390%endif
391
392%endif
393
394    ; bias and shift down
395%if cpuflag(ssse3)
396    pmulhrsw   m4, m1               ; m1 = pw_1024 here: (x + 16) >> 5
397    pmulhrsw   m6, m1
398%else
399    paddw      m4, m1               ; m1 = pw_16 here
400    paddw      m6, m1
401    psrlw      m4, 5
402    psrlw      m6, 5
403%endif
404
405    packuswb   m4, m6               ; low half = first group, high half = second group
406%if %0 == 5
407    ; Only called for 8x8 blocks and sse2
408    sub        r6, r5
409    movh       [%2 + r6], m4        ; first row
410    add        r6, r5
411    movhps     [%2 + r6], m4        ; second row
412%else
413    mova       [%2 + r6], m4
414%endif
415%endmacro
416
417
; MAIN_LOOP %1=block width (8 or 16), %2=weight mode handed to RV40_WCORE
; Body of one loop iteration of the weight functions.
;  - 8-byte registers: one row per iteration, done as one (width 8) or two
;    (width 16) RV40_WCORE calls.
;  - 16-byte registers, width 8: two rows per iteration; RV40_WCORE is given
;    the stride and advances r6 by one row itself.
;  - 16-byte registers, width 16: one row per iteration.
; Every path finishes by stepping r6 one more row. That add is the last
; instruction emitted, so its flags are what the caller's jnz tests.
%macro MAIN_LOOP   2
%if mmsize == 8
    RV40_WCORE %2, r0, r1, r2
%if %1 == 16
    RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
%endif
%elifidn %1, 8
    RV40_WCORE %2, r0, r1, r2, r5
%else
    RV40_WCORE %2, r0, r1, r2
%endif

    ; Advance to the next unprocessed row
    add        r6, r5

%endmacro
440
441; void ff_rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
442; %1=rnd|nornd  %2=block size (8 or 16)  %3=shift turning stride into the byte span of the block (3 or 4)
443; The weights are FP0.14 notation of fractions depending on pts.
444; For timebases without rounding error (e.g. PAL), the fractions
445; can be simplified, and several operations can be avoided.
446; The rnd variant takes the 14bits-weight path of RV40_WCORE, the nornd
447; variant the 5bits one. NOTE(review): no multiple-of-2^9 check is done in this macro; presumably the caller picks the variant - confirm.
448%macro RV40_WEIGHT  3
449cglobal rv40_weight_func_%1_%2, 6, 7, 8
450%if cpuflag(ssse3)
451    mova       m1, [pw_1024]        ; pmulhrsw multiplier used for the final (x + 16) >> 5
452%else
453    mova       m1, [pw_16]          ; added before the final >> 5
454%endif
455    pxor       m0, m0               ; m0 = 0, used to widen bytes to words
456    ; Set loop counter and increments
457    mov        r6, r5
458    shl        r6, %3               ; r6 = stride << %3, byte span of the whole block
459    add        r0, r6               ; move all three pointers to the end of the block ...
460    add        r1, r6
461    add        r2, r6
462    neg        r6                   ; ... and index them with a negative offset counting up to 0
463
464    movd       m2, r3d              ; w1
465    movd       m3, r4d              ; w2
; RND is the weight mode handed to RV40_WCORE: 0 = 14bits weights, 1 = 5bits.
466%ifidn %1,rnd
467%define  RND   0
468    SPLATW     m2, m2
469%else
470%define  RND   1
471%if cpuflag(ssse3)
472    punpcklbw  m3, m2               ; (w2, w1) byte pair for pmaddubsw
473%else
474    SPLATW     m2, m2
475%endif
476%endif
477    SPLATW     m3, m3
478
479.loop:
480    MAIN_LOOP  %2, RND
481    jnz        .loop                ; MAIN_LOOP ends with add r6, r5; loop until r6 reaches 0
482    REP_RET
483%endmacro
484
; Instantiate the weight functions for each instruction set. The arguments
; are variant, block size and the shift applied to stride (8 -> 3, 16 -> 4).
485INIT_MMX mmxext
486RV40_WEIGHT   rnd,    8, 3
487RV40_WEIGHT   rnd,   16, 4
488RV40_WEIGHT   nornd,  8, 3
489RV40_WEIGHT   nornd, 16, 4
490
491INIT_XMM sse2
492RV40_WEIGHT   rnd,    8, 3
493RV40_WEIGHT   rnd,   16, 4
494RV40_WEIGHT   nornd,  8, 3
495RV40_WEIGHT   nornd, 16, 4
496
497INIT_XMM ssse3
498RV40_WEIGHT   rnd,    8, 3
499RV40_WEIGHT   rnd,   16, 4
500RV40_WEIGHT   nornd,  8, 3
501RV40_WEIGHT   nornd, 16, 4
502