;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*                    Kieran Kunhya <kieran@kunhya.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

minshort:      times 8 dw 0x8000
yuv2yuvX_16_start:  times 4 dd 0x4000 - 0x40000000
yuv2yuvX_10_start:  times 4 dd 0x10000
yuv2yuvX_9_start:   times 4 dd 0x20000
yuv2yuvX_10_upper:  times 8 dw 0x3ff
yuv2yuvX_9_upper:   times 8 dw 0x1ff
pd_4:          times 4 dd 4
pd_4min0x40000:times 4 dd 4 - (0x40000)
pw_16:         times 8 dw 16
pw_32:         times 8 dw 32
pw_512:        times 8 dw 512
pw_1024:       times 8 dw 1024
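
; A note on the yuv2yuvX_*_start seeds above (a sketch of the fixed-point
; math, derived from the shifts used in yuv2planeX below): 0x20000 = 1 << 17
; and 0x10000 = 1 << 16 are half of the final right-shift step for 9- and
; 10-bit output (psrad by 18 and 17), i.e. plain rounding terms. For 16-bit
; output, 0x4000 is half of 1 << 15; the extra -0x40000000 bias brings the
; shifted sum into signed 16-bit range so packssdw can be used, after which
; 'paddw [minshort]' (+0x8000) restores the unsigned result.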

SECTION .text

;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
;                                     const int16_t **src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
;
; Scale one (yuv2plane1) or $filterSize (yuv2planeX) lines of source data to
; generate one line of output data. The input is 15-bit in int16_t if
; $output_size is [8,10] and 19-bit in int32_t if $output_size is 16.
; $filter is 12 bits. $filterSize is a multiple of 2. $offset is either 0
; or 3. $dither holds 8 values.
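;
; Roughly, for each output pixel (ignoring dithering, clipping and packing):
;   dst[i] = (sum over j of src[j][i] * filter[j] + rounding) >> S
; where S = input_bits + 12 - $output_size, matching the 'psrad 27 - %1'
; (15 + 12) and 'psrad 31 - %1' (19 + 12) shifts in the code below.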
;-----------------------------------------------------------------------------

%macro yuv2planeX_fn 3

%if ARCH_X86_32
%define cntr_reg fltsizeq
%define movsx mov
%else
%define cntr_reg r7
%define movsx movsxd
%endif

cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
    pxor            m6,  m6
%endif ; %1 == 8/9/10

%if %1 == 8
%if ARCH_X86_32
%assign pad 0x2c - (stack_offset & 15)
    SUB             rsp, pad
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32

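    ; If offset is non-zero (per the header comment it is either 0 or 3),
    ; the eight dither bytes are rotated by 3 below so consecutive lines use
    ; a shifted dither pattern; they are then widened to dwords and shifted
    ; left by 12 to live in the same fixed-point domain as the filter sums.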
    ; create registers holding dither
    movq        m_dith, [ditherq]        ; dither
    test        offsetd, offsetd
    jz              .no_rot
%if mmsize == 16
    punpcklqdq  m_dith,  m_dith
%endif ; mmsize == 16
    PALIGNR     m_dith,  m_dith,  3,  m0
.no_rot:
%if mmsize == 16
    punpcklbw   m_dith,  m6
%if ARCH_X86_64
    punpcklwd       m8,  m_dith,  m6
    pslld           m8,  12
%else ; x86-32
    punpcklwd       m5,  m_dith,  m6
    pslld           m5,  12
%endif ; x86-32/64
    punpckhwd   m_dith,  m6
    pslld       m_dith,  12
%if ARCH_X86_32
    mova      [rsp+ 0],  m5
    mova      [rsp+16],  m_dith
%endif
%else ; mmsize == 8
    punpcklbw       m5,  m_dith,  m6
    punpckhbw   m_dith,  m6
    punpcklwd       m4,  m5,  m6
    punpckhwd       m5,  m6
    punpcklwd       m3,  m_dith,  m6
    punpckhwd   m_dith,  m6
    pslld           m4,  12
    pslld           m5,  12
    pslld           m3,  12
    pslld       m_dith,  12
    mova      [rsp+ 0],  m4
    mova      [rsp+ 8],  m5
    mova      [rsp+16],  m3
    mova      [rsp+24],  m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8

    xor             r5,  r5

.pixelloop:
%assign %%i 0
    ; the %rep here is for the 8-bit output MMX case, where the dither
    ; covers 8 pixels but each register only holds 2 dword accumulators,
    ; i.e. 4 pixels per iteration. In order to not have to keep track of
    ; where we are w.r.t. dithering, we unroll the MMX/8-bit loop 2x.
%if %1 == 8
%assign %%repcnt 16/mmsize
%else
%assign %%repcnt 1
%endif

%rep %%repcnt

%if %1 == 8
%if ARCH_X86_32
    mova            m2, [rsp+mmsize*(0+%%i)]
    mova            m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
    mova            m2,  m8
    mova            m1,  m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
    mova            m1, [yuv2yuvX_%1_start]
    mova            m2,  m1
%endif ; %1 == 8/9/10/16
    movsx     cntr_reg,  fltsizem
.filterloop_ %+ %%i:
    ; input pixels
    mov             r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
    mova            m3, [r6+r5*4]
    mova            m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m3, [r6+r5*2]
%endif ; %1 == 8/9/10/16
    mov             r6, [srcq+gprsize*cntr_reg-gprsize]
%if %1 == 16
    mova            m4, [r6+r5*4]
    mova            m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16
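
    ; srcq is an array of line pointers (const int16_t **); the two loads
    ; of r6 above fetch pointers to two adjacent source lines per pass,
    ; which is why $filterSize must be a multiple of 2 (cntr_reg is
    ; decremented by 2 at the bottom of the loop)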

    ; coefficients
    movd            m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
    pshuflw         m7,  m0,  0          ; coeff[0]
    pshuflw         m0,  m0,  0x55       ; coeff[1]
    pmovsxwd        m7,  m7              ; word -> dword
    pmovsxwd        m0,  m0              ; word -> dword

    pmulld          m3,  m7
    pmulld          m5,  m7
    pmulld          m4,  m0
    pmulld          m6,  m0

    paddd           m2,  m3
    paddd           m1,  m5
    paddd           m2,  m4
    paddd           m1,  m6
%else ; %1 == 10/9/8
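    ; interleave the pixels of the two source lines so each pmaddwd lane
    ; computes src0[i]*coeff[0] + src1[i]*coeff[1] as one dword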
    punpcklwd       m5,  m3,  m4
    punpckhwd       m3,  m4
    SPLATD          m0

    pmaddwd         m5,  m0
    pmaddwd         m3,  m0

    paddd           m2,  m5
    paddd           m1,  m3
%endif ; %1 == 8/9/10/16

    sub       cntr_reg,  2
    jg .filterloop_ %+ %%i

%if %1 == 16
    psrad           m2,  31 - %1
    psrad           m1,  31 - %1
%else ; %1 == 10/9/8
    psrad           m2,  27 - %1
    psrad           m1,  27 - %1
%endif ; %1 == 8/9/10/16
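    ; the accumulators hold (input_bits + 12)-bit sums: 15 + 12 = 27 bits
    ; for 8/9/10-bit output and 19 + 12 = 31 bits for 16-bit output, hence
    ; the 27 - %1 and 31 - %1 shift amounts above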

%if %1 == 8
    packssdw        m2,  m1
    packuswb        m2,  m2
    movh   [dstq+r5*1],  m2
%else ; %1 == 9/10/16
%if %1 == 16
    packssdw        m2,  m1
    paddw           m2, [minshort]
%else ; %1 == 9/10
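    ; SSE4's packusdw packs dwords to words with unsigned saturation; the
    ; pre-SSE4 fallback packs with signed saturation and clamps the low end
    ; by hand (pmaxsw with zero in m6); both paths then apply the %1-bit
    ; upper bound with pminsw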
%if cpuflag(sse4)
    packusdw        m2,  m1
%else ; mmxext/sse2
    packssdw        m2,  m1
    pmaxsw          m2,  m6
%endif ; mmxext/sse2/sse4/avx
    pminsw          m2, [yuv2yuvX_%1_upper]
%endif ; %1 == 9/10/16
    mova   [dstq+r5*2],  m2
%endif ; %1 == 8/9/10/16

    add             r5,  mmsize/2
    sub             wd,  mmsize/2

%assign %%i %%i+2
%endrep
    jg .pixelloop

%if %1 == 8
%if ARCH_X86_32
    ADD             rsp, pad
    RET
%else ; x86-64
    REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
    REP_RET
%endif ; %1 == 8/9/10/16
%endmacro

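; yuv2planeX_fn/yuv2plane1_fn parameters: %1 = output bit depth,
; %2 = number of vector registers used, %3 = number of named arguments
; (%2 and %3 are forwarded to cglobal)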
%if ARCH_X86_32
INIT_MMX mmxext
yuv2planeX_fn  8,  0, 7
yuv2planeX_fn  9,  0, 5
yuv2planeX_fn 10,  0, 5
%endif

INIT_XMM sse2
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5

INIT_XMM sse4
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
yuv2planeX_fn 16,  8, 5

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
%endif

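; yuv2plane1: single-input variant of the scaler. The per-depth setup in
; yuv2plane1_fn below loads a rounding/dither term (and, where needed, clip
; bounds) into fixed registers; this main loop then only has to add the
; term, shift down to the output depth, clamp where needed and store.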
; %1 = output bpc, %2 = alignment (u/a)
%macro yuv2plane1_mainloop 2
.loop_%2:
%if %1 == 8
    paddsw          m0, m2, [srcq+wq*2+mmsize*0]
    paddsw          m1, m3, [srcq+wq*2+mmsize*1]
    psraw           m0, 7
    psraw           m1, 7
    packuswb        m0, m1
    mov%2    [dstq+wq], m0
%elif %1 == 16
    paddd           m0, m4, [srcq+wq*4+mmsize*0]
    paddd           m1, m4, [srcq+wq*4+mmsize*1]
    paddd           m2, m4, [srcq+wq*4+mmsize*2]
    paddd           m3, m4, [srcq+wq*4+mmsize*3]
    psrad           m0, 3
    psrad           m1, 3
    psrad           m2, 3
    psrad           m3, 3
%if cpuflag(sse4) ; avx/sse4
    packusdw        m0, m1
    packusdw        m2, m3
%else ; mmx/sse2
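    ; no unsigned dword pack before SSE4: m4 was loaded with pd_4min0x40000
    ; (4 - 0x40000), so after the psrad by 3 the values are biased by
    ; -0x8000 into signed 16-bit range; pack with signed saturation and add
    ; minshort (0x8000, in m5) to restore the unsigned result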
    packssdw        m0, m1
    packssdw        m2, m3
    paddw           m0, m5
    paddw           m2, m5
%endif ; mmx/sse2/sse4/avx
    mov%2    [dstq+wq*2+mmsize*0], m0
    mov%2    [dstq+wq*2+mmsize*1], m2
%else ; %1 == 9/10
    paddsw          m0, m2, [srcq+wq*2+mmsize*0]
    paddsw          m1, m2, [srcq+wq*2+mmsize*1]
    psraw           m0, 15 - %1
    psraw           m1, 15 - %1
    pmaxsw          m0, m4
    pmaxsw          m1, m4
    pminsw          m0, m3
    pminsw          m1, m3
    mov%2    [dstq+wq*2+mmsize*0], m0
    mov%2    [dstq+wq*2+mmsize*1], m1
%endif
    add             wq, mmsize
    jl .loop_%2
%endmacro

%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
    movsxdifnidn    wq, wd
    add             wq, mmsize - 1
    and             wq, ~(mmsize - 1)
%if %1 == 8
    add           dstq, wq
%else ; %1 != 8
    lea           dstq, [dstq+wq*2]
%endif ; %1 == 8
%if %1 == 16
    lea           srcq, [srcq+wq*4]
%else ; %1 != 16
    lea           srcq, [srcq+wq*2]
%endif ; %1 == 16
    neg             wq
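    ; wq was rounded up to a multiple of mmsize and the dst/src pointers
    ; advanced to the end of the line, so after the neg the main loop can
    ; index with a negative offset that counts up toward zero ('jl .loop')
    ; without a separate compare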

%if %1 == 8
    pxor            m4, m4               ; zero

    ; create registers holding dither
    movq            m3, [ditherq]        ; dither
    test       offsetd, offsetd
    jz              .no_rot
%if mmsize == 16
    punpcklqdq      m3, m3
%endif ; mmsize == 16
    PALIGNR         m3, m3, 3, m2
.no_rot:
%if mmsize == 8
    mova            m2, m3
    punpckhbw       m3, m4               ; byte->word
    punpcklbw       m2, m4               ; byte->word
%else
    punpcklbw       m3, m4
    mova            m2, m3
%endif
%elif %1 == 9
    pxor            m4, m4
    mova            m3, [pw_512]
    mova            m2, [pw_32]
%elif %1 == 10
    pxor            m4, m4
    mova            m3, [pw_1024]
    mova            m2, [pw_16]
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
    mova            m4, [pd_4]
%else ; mmx/sse2
    mova            m4, [pd_4min0x40000]
    mova            m5, [minshort]
%endif ; mmx/sse2/sse4/avx
%endif ; %1 == ..
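
    ; register roles set up above: for 8-bit output m2/m3 hold the dither
    ; words; for 9/10-bit output m2 holds the rounding term (half of
    ; 1 << (15 - %1)), m3 the upper clamp value and m4 zero for the lower
    ; clamp; for 16-bit output m4 folds the rounding for the >> 3 (and,
    ; pre-SSE4, the signed-pack bias) into one constant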

    ; actual pixel scaling
%if mmsize == 8
    yuv2plane1_mainloop %1, a
%else ; mmsize == 16
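    ; dispatch on dst alignment: use aligned stores when dstq is 16-byte
    ; aligned, otherwise the unaligned variant (mov%2 expands to mova/movu)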
    test          dstq, 15
    jnz .unaligned
    yuv2plane1_mainloop %1, a
    REP_RET
.unaligned:
    yuv2plane1_mainloop %1, u
%endif ; mmsize == 8/16
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
yuv2plane1_fn  8, 0, 5
yuv2plane1_fn 16, 0, 3

INIT_MMX mmxext
yuv2plane1_fn  9, 0, 3
yuv2plane1_fn 10, 0, 3
%endif

INIT_XMM sse2
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3

INIT_XMM sse4
yuv2plane1_fn 16, 5, 3

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif
