1;*****************************************************************************
2;* SSE2-optimized weighted prediction code
3;*****************************************************************************
4;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
5;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
26SECTION .text
27
28;-----------------------------------------------------------------------------
29; biweight pred:
30;
31; void ff_h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
32;                               int height, int log2_denom, int weightd,
33;                               int weights, int offset);
34; and
35; void ff_h264_weight_16_sse2(uint8_t *dst, int stride, int height,
36;                             int log2_denom, int weight, int offset);
37;-----------------------------------------------------------------------------
38
;-----------------------------------------------------------------------------
; WEIGHT_SETUP
;   Loads the loop-invariant constants for unidirectional weighting.
;   In : r3d = log2_denom, r4d = weight, r5d = offset
;   Out: m3 = low word of weight, replicated into every word lane
;        m5 = low word of (((2*offset + 1) << log2_denom) >> 1), replicated
;        m6 = log2_denom (used as the shift count)
;        m7 = 0 (used to widen bytes to words)
;   Clobbers r5.
;-----------------------------------------------------------------------------
%macro WEIGHT_SETUP 0
    pxor       m7, m7
    movd       m6, r3d
    movd       m3, r4d
    lea       r5d, [r5*2+1]         ; 2*offset + 1
    movd       m5, r5d
    pslld      m5, m6
    psrld      m5, 1
    ; replicate the low word of m3/m5 across the whole register
%if mmsize != 16
    pshufw     m3, m3, 0
    pshufw     m5, m5, 0
%else
    pshuflw    m3, m3, 0
    punpcklqdq m3, m3
    pshuflw    m5, m5, 0
    punpcklqdq m5, m5
%endif
%endmacro
58
;-----------------------------------------------------------------------------
; WEIGHT_OP off_a, off_b
;   Weights two half-register groups of pixels read from [r0+off_a] and
;   [r0+off_b] and leaves the packed, unsigned-saturated bytes in m0
;   (group A in the low half, group B in the high half).
;   Per pixel: (pixel * m3 + m5) >> m6, using 16-bit saturating addition.
;   Expects m3/m5/m6/m7 as prepared by WEIGHT_SETUP. Clobbers m1.
;-----------------------------------------------------------------------------
%macro WEIGHT_OP 2
    ; group A -> m0
    movh          m0, [r0+%1]
    punpcklbw     m0, m7
    pmullw        m0, m3
    paddsw        m0, m5
    psraw         m0, m6
    ; group B -> m1
    movh          m1, [r0+%2]
    punpcklbw     m1, m7
    pmullw        m1, m3
    paddsw        m1, m5
    psraw         m1, m6
    ; narrow both groups back to bytes
    packuswb      m0, m1
%endmacro
72
;-----------------------------------------------------------------------------
; h264_weight_16 (MMXEXT): 16-pixel-wide unidirectional weighting, in place.
;   r0 = dst, r1 = stride, r2d = height; r3d..r5d are consumed by WEIGHT_SETUP.
;   With 8-byte registers each row takes two stores, and every store is built
;   from two 4-pixel groups.
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal h264_weight_16, 6, 6, 0
    WEIGHT_SETUP
.nextrow:
%assign weight16_off 0
%rep 2
    WEIGHT_OP weight16_off, weight16_off+4
    mova     [r0+weight16_off], m0
%assign weight16_off weight16_off+8
%endrep
    add        r0, r1               ; advance to the next row
    sub       r2d, 1
    jnz .nextrow
    REP_RET
85
;-----------------------------------------------------------------------------
; WEIGHT_FUNC_MM width, num_xmm_regs
;   Emits h264_weight_<width> for the instruction set selected by the
;   preceding INIT_* line. One full register (mmsize bytes) is weighted and
;   stored per row, so <width> is expected to equal mmsize.
;   r0 = dst, r1 = stride, r2d = height.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP 0, mmsize/2           ; low and high halves of the row
    mova     [r0], m0
    add        r0, r1               ; next row
    sub       r2d, 1
    jnz .nextrow
    REP_RET
%endmacro

; 8 pixels wide with MMX registers
INIT_MMX mmxext
WEIGHT_FUNC_MM  8, 0
; 16 pixels wide with XMM registers
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8
102
;-----------------------------------------------------------------------------
; WEIGHT_FUNC_HALF_MM width, num_xmm_regs
;   Emits h264_weight_<width> for rows that are only half a register wide.
;   Two consecutive rows are weighted per iteration: the row at [r0] ends up
;   in the low half of m0 and the row at [r0+r1] in the high half.
;   r0 = dst, r1 = stride, r2d = height (halved to count row pairs).
;   r3 is reused as the two-row step once WEIGHT_SETUP has copied log2_denom
;   into m6.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
    lea        r3, [r1+r1]          ; step = 2 * stride
    sar       r2d, 1                ; iterations = height / 2
.nextrow:
    WEIGHT_OP 0, r1
    movh     [r0], m0               ; first row from the low half
%if mmsize != 16
    psrlq      m0, 32               ; bring the high half down
    movh     [r0+r1], m0
%else
    movhps   [r0+r1], m0            ; second row from the high half
%endif
    add        r0, r3
    sub       r2d, 1
    jnz .nextrow
    REP_RET
%endmacro

; 4 pixels wide with MMX registers
INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0
; 8 pixels wide with XMM registers
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8
127
;-----------------------------------------------------------------------------
; BIWEIGHT_SETUP
;   Loads the loop-invariant constants for bidirectional weighting.
;   In : r4d = log2_denom, r5d = weightd, r6d = weights, r7m = offset
;   Out (SSSE3):     m4 = {weightd, weights} byte pair in every word lane
;   Out (otherwise): m3 = weightd words, m4 = weights words, m7 = 0
;   Out (all):       m5 = low word of ((((offset + 1) | 1) << shift) >> 1),
;                         replicated, where shift = log2_denom + 1
;                    m6 = shift
;   Clobbers r4, r5, r6 and off_regd. On x86-32 off_regd is r3d, which is why
;   the callers reload r3d from r3m after invoking this macro.
;
;   All scalar arguments are C ints, so only their 32-bit views are used:
;   on x86-64 the upper halves of the argument registers are not defined.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
%define off_regd r7d
%else
%define off_regd r3d
%endif
    mov  off_regd, r7m
    add  off_regd, 1
    or   off_regd, 1                ; (offset + 1) | 1
    add       r4d, 1                ; shift = log2_denom + 1
    ; A weight of 128 cannot be represented as a signed byte, which is how
    ; the SSSE3 path hands both weights to pmaddubsw. If either weight is
    ; 128, halve both weights and the offset term and shift by one bit less.
    cmp       r6d, 128
    je .nonnormal
    cmp       r5d, 128
    jne .normal
.nonnormal:
    sar       r5d, 1
    sar       r6d, 1
    sar  off_regd, 1
    sub       r4d, 1
.normal:
%if cpuflag(ssse3)
    movd       m4, r5d
    movd       m0, r6d
%else
    movd       m3, r5d
    movd       m4, r6d
%endif
    movd       m5, off_regd
    movd       m6, r4d
    pslld      m5, m6
    psrld      m5, 1
%if cpuflag(ssse3)
    punpcklbw  m4, m0               ; low word = {weightd byte, weights byte}
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m4, m4
    punpcklqdq m5, m5

%else
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m4, m4
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m4, m4, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7               ; zero register for byte->word unpacking
%endif
%endmacro
179
;-----------------------------------------------------------------------------
; BIWEIGHT_STEPA acc, tmp, off
;   m<acc> = [r0+off] * m3 + [r1+off] * m4, per pixel, as 16-bit words with a
;   saturating add. m<tmp> is scratch. Expects m7 = 0.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_STEPA 3
    ; pixels from r0, scaled by m3
    movh       m%1, [r0+%3]
    punpcklbw  m%1, m7
    pmullw     m%1, m3
    ; pixels from r1, scaled by m4
    movh       m%2, [r1+%3]
    punpcklbw  m%2, m7
    pmullw     m%2, m4
    ; combine
    paddsw     m%1, m%2
%endmacro
189
;-----------------------------------------------------------------------------
; BIWEIGHT_STEPB
;   Finishes two BIWEIGHT_STEPA accumulators held in m0 and m1: adds the
;   offset term m5 (saturating), shifts right arithmetically by m6 and packs
;   both into m0 as unsigned-saturated bytes (m0 low half, m1 high half).
;-----------------------------------------------------------------------------
%macro BIWEIGHT_STEPB 0
    paddsw     m0, m5
    psraw      m0, m6
    paddsw     m1, m5
    psraw      m1, m6
    packuswb   m0, m1
%endmacro
197
;-----------------------------------------------------------------------------
; h264_biweight_16 (MMXEXT): 16-pixel-wide bidirectional weighting.
;   r0 = dst (read and written), r1 = src, r2 = stride, r3d = height.
;   With 8-byte registers each row takes two stores, and every store is built
;   from two 4-pixel groups.
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal h264_biweight_16, 7, 8, 0
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; (re)load height after the setup macro
.nextrow:
%assign biweight16_off 0
%rep 2
    BIWEIGHT_STEPA 0, 1, biweight16_off
    BIWEIGHT_STEPA 1, 2, biweight16_off+4
    BIWEIGHT_STEPB
    mova       [r0+biweight16_off], m0
%assign biweight16_off biweight16_off+8
%endrep
    add        r0, r2               ; next dst row
    add        r1, r2               ; next src row
    sub       r3d, 1
    jnz .nextrow
    REP_RET
216
;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_MM width, num_xmm_regs
;   Emits h264_biweight_<width> for the instruction set selected by the
;   preceding INIT_* line. One full register (mmsize bytes) is produced per
;   row, so <width> is expected to equal mmsize.
;   r0 = dst (read and written), r1 = src, r2 = stride, r3d = height.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; (re)load height after the setup macro
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0          ; low half of the row  -> m0
    BIWEIGHT_STEPA 1, 2, mmsize/2   ; high half of the row -> m1
    BIWEIGHT_STEPB
    mova       [r0], m0
    add        r0, r2               ; next dst row
    add        r1, r2               ; next src row
    sub       r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

; 8 pixels wide with MMX registers
INIT_MMX mmxext
BIWEIGHT_FUNC_MM  8, 0
; 16 pixels wide with XMM registers
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8
237
;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_HALF_MM width, num_xmm_regs
;   Emits h264_biweight_<width> for rows that are only half a register wide.
;   Two consecutive rows are processed per iteration: the row at offset 0
;   ends up in the low half of m0 and the row at offset r2 in the high half.
;   r0 = dst (read and written), r1 = src, r2 = stride, r3d = height
;   (halved to count row pairs). r4 is reused as the two-row step once
;   BIWEIGHT_SETUP has copied the shift count into m6.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; (re)load height after the setup macro
    ; height is a C int: shift the 32-bit view so that undefined upper bits
    ; of the 64-bit register can never reach the r3d loop counter
    sar       r3d, 1                ; iterations = height / 2
    lea        r4, [r2*2]           ; step = 2 * stride
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0          ; row 0 -> m0
    BIWEIGHT_STEPA 1, 2, r2         ; row 1 -> m1
    BIWEIGHT_STEPB
    movh       [r0], m0             ; first row from the low half
%if mmsize == 16
    movhps     [r0+r2], m0          ; second row from the high half
%else
    psrlq      m0, 32               ; bring the high half down
    movh       [r0+r2], m0
%endif
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

; 4 pixels wide with MMX registers
INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
; 8 pixels wide with XMM registers
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8
266
;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_OP
;   In : m0, m2 = interleaved pixel byte pairs (first byte of each pair taken
;        from the r0 buffer, second from the r1 buffer, as arranged by the
;        callers), m4 = matching weight byte pairs, m5 = offset term,
;        m6 = shift count.
;   Out: m0 = packed result bytes (m0 in the low half, m2 in the high half).
;   pmaddubsw multiplies each pixel byte by its weight byte and sums every
;   pair into one 16-bit word.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw  m0, m4
    paddsw     m0, m5
    psraw      m0, m6
    pmaddubsw  m2, m4
    paddsw     m2, m5
    psraw      m2, m6
    packuswb   m0, m2
%endmacro
276
;-----------------------------------------------------------------------------
; h264_biweight_16 (SSSE3): 16-pixel-wide bidirectional weighting, one full
; row per iteration.
;   r0 = dst (read and written), r1 = src, r2 = stride, r3d = height.
;   NOTE(review): punpcklbw takes [r1] as a full-width memory operand and the
;   result is stored with mova, so both row pointers are presumably expected
;   to be 16-byte aligned; confirm against the callers.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; (re)load height after the setup macro

.nextrow:
    ; pixels 0-7: interleave dst and src bytes into m0
    movh       m0, [r0]
    punpcklbw  m0, [r1]
    ; pixels 8-15: interleave dst and src bytes into m2
    movh       m2, [r0+8]
    movh       m3, [r1+8]
    punpcklbw  m2, m3
    BIWEIGHT_SSSE3_OP
    mova       [r0], m0
    add        r0, r2               ; next dst row
    add        r1, r2               ; next src row
    sub       r3d, 1
    jnz .nextrow
    REP_RET
295
;-----------------------------------------------------------------------------
; h264_biweight_8 (SSSE3): 8-pixel-wide bidirectional weighting, two rows per
; iteration.
;   r0 = dst (read and written), r1 = src, r2 = stride, r3d = height
;   (halved to count row pairs). r4 is reused as the two-row step once
;   BIWEIGHT_SETUP has copied the shift count into m6.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; (re)load height after the setup macro
    ; height is a C int: shift the 32-bit view so that undefined upper bits
    ; of the 64-bit register can never reach the r3d loop counter
    sar       r3d, 1                ; iterations = height / 2
    lea        r4, [r2*2]           ; step = 2 * stride

.nextrow:
    movh       m0, [r0]             ; dst row 0
    movh       m1, [r1]             ; src row 0
    movh       m2, [r0+r2]          ; dst row 1
    movh       m3, [r1+r2]          ; src row 1
    punpcklbw  m0, m1               ; interleave dst/src bytes, row 0
    punpcklbw  m2, m3               ; interleave dst/src bytes, row 1
    BIWEIGHT_SSSE3_OP
    movh       [r0], m0             ; row 0 from the low half
    movhps     [r0+r2], m0          ; row 1 from the high half
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
318