1;******************************************************************************
2;* x86-optimized horizontal line scaling functions
3;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "libavutil/x86/x86util.asm"
23
24SECTION_RODATA
25
26max_19bit_int: times 4 dd 0x7ffff      ; integer upper clamp (2^19 - 1) for 19-bit output; loaded into m2 on the mmx and sse4 paths
27max_19bit_flt: times 4 dd 524287.0     ; the same limit as a float; loaded into m2 on the sse2/ssse3 path, which clamps with minps
28minshort:      times 8 dw 0x8000       ; subtracted (psubw) from 16-bit input so that pmaddwd sees signed words
29unicoeff:      times 4 dd 0x20000000   ; added back (paddd) after the multiply for 16-bit input; equals 0x8000 << 14,
                                         ; i.e. 0x8000 * sum(coeffs) assuming the coefficients sum to 1 << 14 - TODO confirm
30
31SECTION .text
32
33;-----------------------------------------------------------------------------
34; horizontal line scaling
35;
36; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
37;                               (SwsContext *c, int{16,32}_t *dst,
38;                                int dstW, const uint{8,16}_t *src,
39;                                const int16_t *filter,
40;                                const int32_t *filterPos, int filterSize);
41;
42; Scale one horizontal line. Input samples are stored as either 8-bit or 16-bit
43; words ($source_width can be 8, 9, 10, 12, 14 or 16; the difference is how far
44; the result is shifted down after multiplying). Filter is 14-bits. Output is
45; either 15 bits (in int16_t) or 19 bits (in int32_t), as given in
46; $intermediate_nbits. Each output pixel is generated from $filterSize input
47; pixels; the position of the first pixel is given in filterPos[nOutputPixel].
48;-----------------------------------------------------------------------------
49
50; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
;
; Emits one hscale<source_width>to<intermediate_nbits>_<filtersuffix> function.
;   arg 1 source_width       - input bit depth. 8 selects byte loads that are widened
;                              to words; any other value selects word loads. 16
;                              additionally enables the unsigned->signed bias
;                              (minshort before the multiply, unicoeff after it).
;   arg 2 intermediate_nbits - 15 (result packed to words) or 19 (result kept as
;                              dwords and clamped from above to 0x7ffff).
;   arg 3 filtersize         - 4, 8, or X. With X the tap count is read at run time
;                              from the fltsize argument.
;   arg 4 filtersuffix       - pasted into the function name. For filtersize == X the
;                              value X4 additionally selects the variant that finishes
;                              each pixel pair with one shared 4-tap step (dlt = 4).
;   arg 5 n_args             - forwarded to cglobal as its first numeric argument.
;   arg 6 n_xmm              - forwarded to cglobal as its third numeric argument.
;
; Vector register roles set up once and kept live across the loop:
;   m2 = upper clamp value          (only when intermediate_nbits == 19)
;   m3 = zero, for byte->word       (only when source_width == 8)
;   m6 = minshort, m7 = unicoeff    (only when source_width == 16)
51%macro SCALE_FUNC 6
    ; NOTE(review): cglobal comes from the included x86util.asm/x86inc helpers;
    ; presumably the trailing names become the <name>q/<name>d/<name>m aliases
    ; used below. The first named slot (the context argument) is reused as pos0.
52%ifnidn %3, X
53cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
54%else
55cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
56%endif
    ; mov32 is the instruction used to load one 32-bit filterPos entry into a
    ; pointer-sized register: sign-extending on x86-64, plain mov on x86-32.
    ; On x86-64 the 32-bit width argument is sign-extended as well.
57%if ARCH_X86_64
58    movsxd        wq, wd
59%define mov32 movsxd
60%else ; x86-32
61%define mov32 mov
62%endif ; x86-64
    ; m2 = upper limit for 19-bit output, as integer or float depending on
    ; which min instruction the clip step below uses
63%if %2 == 19
64%if mmsize == 8 ; mmx
65    mova          m2, [max_19bit_int]
66%elif cpuflag(sse4)
67    mova          m2, [max_19bit_int]
68%else ; ssse3/sse2
69    mova          m2, [max_19bit_flt]
70%endif ; mmx/sse2/ssse3/sse4
71%endif ; %2 == 19
72%if %1 == 16
73    mova          m6, [minshort]
74    mova          m7, [unicoeff]
75%elif %1 == 8
76    pxor          m3, m3
77%endif ; %1 == 8/16
78
    ; movlh  = load of 4 source pixels (4 bytes at 8 bit, 8 bytes otherwise)
    ; movbh  = load used by the 8-tap and X paths for one block of source pixels
    ; srcmul = bytes per source pixel, used to scale pixel indices into offsets
79%if %1 == 8
80%define movlh movd
81%define movbh movh
82%define srcmul 1
83%else ; %1 == 9-16
84%define movlh movq
85%define movbh movu
86%define srcmul 2
87%endif ; %1 == 8/9-16
88
89%ifnidn %3, X
90
91    ; setup loop
    ; filter, dst and fltpos are advanced to the end of their arrays and wq is
    ; negated, so the loop indexes with a negative offset that counts up to zero.
    ; wshr compensates in the dst/fltpos scale factors for the doubled wq of the
    ; 8-tap case.
92%if %3 == 8
93    shl           wq, 1                         ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
94%define wshr 1
95%else ; %3 == 4
96%define wshr 0
97%endif ; %3 == 8
98    lea      filterq, [filterq+wq*8]
99%if %2 == 15
100    lea         dstq, [dstq+wq*(2>>wshr)]
101%else ; %2 == 19
102    lea         dstq, [dstq+wq*(4>>wshr)]
103%endif ; %2 == 15/19
104    lea      fltposq, [fltposq+wq*(4>>wshr)]
105    neg           wq
106
107.loop:
108%if %3 == 4 ; filterSize == 4 scaling
109    ; load 2x4 or 4x4 source pixels into m0/m1
110    mov32      pos0q, dword [fltposq+wq*4+ 0]   ; filterPos[0]
111    mov32      pos1q, dword [fltposq+wq*4+ 4]   ; filterPos[1]
112    movlh         m0, [srcq+pos0q*srcmul]       ; src[filterPos[0] + {0,1,2,3}]
113%if mmsize == 8
114    movlh         m1, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
115%else ; mmsize == 16
    ; xmm: two output pixels share one register. With word input the second
    ; pixel's 8 bytes go straight into the high half (movhps); with byte input
    ; they are loaded into a temporary and merged with punpckldq further down.
116%if %1 > 8
117    movhps        m0, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
118%else ; %1 == 8
119    movd          m4, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
120%endif
121    mov32      pos0q, dword [fltposq+wq*4+ 8]   ; filterPos[2]
122    mov32      pos1q, dword [fltposq+wq*4+12]   ; filterPos[3]
123    movlh         m1, [srcq+pos0q*srcmul]       ; src[filterPos[2] + {0,1,2,3}]
124%if %1 > 8
125    movhps        m1, [srcq+pos1q*srcmul]       ; src[filterPos[3] + {0,1,2,3}]
126%else ; %1 == 8
127    movd          m5, [srcq+pos1q*srcmul]       ; src[filterPos[3] + {0,1,2,3}]
128    punpckldq     m0, m4
129    punpckldq     m1, m5
130%endif ; %1 == 8
131%endif ; mmsize == 8/16
132%if %1 == 8
133    punpcklbw     m0, m3                        ; byte -> word
134    punpcklbw     m1, m3                        ; byte -> word
135%endif ; %1 == 8
136
137    ; multiply with filter coefficients
138%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
139             ; add back 0x8000 * sum(coeffs) after the horizontal add
140    psubw         m0, m6
141    psubw         m1, m6
142%endif ; %1 == 16
143    pmaddwd       m0, [filterq+wq*8+mmsize*0]   ; *= filter[{0,1,..,6,7}]
144    pmaddwd       m1, [filterq+wq*8+mmsize*1]   ; *= filter[{8,9,..,14,15}]
145
146    ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
    ; after pmaddwd every output pixel is spread over two adjacent dwords;
    ; each variant below adds those pairs so that m0 holds one dword per pixel
147%if mmsize == 8 ; mmx
148    movq          m4, m0
149    punpckldq     m0, m1
150    punpckhdq     m4, m1
151    paddd         m0, m4
152%elif notcpuflag(ssse3) ; sse2
    ; no phaddd before ssse3: gather the even dwords and the odd dwords of
    ; m0/m1 with two shuffles and add them vertically
153    mova          m4, m0
154    shufps        m0, m1, 10001000b
155    shufps        m4, m1, 11011101b
156    paddd         m0, m4
157%else ; ssse3/sse4
158    phaddd        m0, m1                        ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
159                                                ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
160                                                ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
161                                                ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
162%endif ; mmx/sse2/ssse3/sse4
163%else ; %3 == 8, i.e. filterSize == 8 scaling
164    ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
    ; (wq was doubled during setup, hence the *2 scale on the filterPos index)
165    mov32      pos0q, dword [fltposq+wq*2+0]    ; filterPos[0]
166    mov32      pos1q, dword [fltposq+wq*2+4]    ; filterPos[1]
167    movbh         m0, [srcq+ pos0q   *srcmul]   ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
168%if mmsize == 8
169    movbh         m1, [srcq+(pos0q+4)*srcmul]   ; src[filterPos[0] + {4,5,6,7}]
170    movbh         m4, [srcq+ pos1q   *srcmul]   ; src[filterPos[1] + {0,1,2,3}]
171    movbh         m5, [srcq+(pos1q+4)*srcmul]   ; src[filterPos[1] + {4,5,6,7}]
172%else ; mmsize == 16
173    movbh         m1, [srcq+ pos1q   *srcmul]   ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
174    mov32      pos0q, dword [fltposq+wq*2+8]    ; filterPos[2]
175    mov32      pos1q, dword [fltposq+wq*2+12]   ; filterPos[3]
176    movbh         m4, [srcq+ pos0q   *srcmul]   ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
177    movbh         m5, [srcq+ pos1q   *srcmul]   ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
178%endif ; mmsize == 8/16
179%if %1 == 8
180    punpcklbw     m0, m3                        ; byte -> word
181    punpcklbw     m1, m3                        ; byte -> word
182    punpcklbw     m4, m3                        ; byte -> word
183    punpcklbw     m5, m3                        ; byte -> word
184%endif ; %1 == 8
185
186    ; multiply
187%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
188             ; add back 0x8000 * sum(coeffs) after the horizontal add
189    psubw         m0, m6
190    psubw         m1, m6
191    psubw         m4, m6
192    psubw         m5, m6
193%endif ; %1 == 16
194    pmaddwd       m0, [filterq+wq*8+mmsize*0]   ; *= filter[{0,1,..,6,7}]
195    pmaddwd       m1, [filterq+wq*8+mmsize*1]   ; *= filter[{8,9,..,14,15}]
196    pmaddwd       m4, [filterq+wq*8+mmsize*2]   ; *= filter[{16,17,..,22,23}]
197    pmaddwd       m5, [filterq+wq*8+mmsize*3]   ; *= filter[{24,25,..,30,31}]
198
199    ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
200%if mmsize == 8
    ; mmx: m0/m1 are the two halves of pixel 0 and m4/m5 those of pixel 1, so
    ; add the halves first, then fold the two dwords of each pixel together
201    paddd         m0, m1
202    paddd         m4, m5
203    movq          m1, m0
204    punpckldq     m0, m4
205    punpckhdq     m1, m4
206    paddd         m0, m1
207%elif notcpuflag(ssse3) ; sse2
    ; mex is a scratch register for the transpose. m3 holds zero when
    ; %1 == 8 and m6 holds minshort when %1 == 16, so the one that is not
    ; live for this source width is chosen.
208%if %1 == 8
209%define mex m6
210%else
211%define mex m3
212%endif
213    ; emulate horizontal add as transpose + vertical add
214    mova         mex, m0
215    punpckldq     m0, m1
216    punpckhdq    mex, m1
217    paddd         m0, mex
218    mova          m1, m4
219    punpckldq     m4, m5
220    punpckhdq     m1, m5
221    paddd         m4, m1
222    mova          m1, m0
223    punpcklqdq    m0, m4
224    punpckhqdq    m1, m4
225    paddd         m0, m1
226%else ; ssse3/sse4
227    ; FIXME if we rearrange the filter in pairs of 4, we can
228    ; load pixels likewise and use 2 x paddd + phaddd instead
229    ; of 3 x phaddd here, faster on older cpus
230    phaddd        m0, m1
231    phaddd        m4, m5
232    phaddd        m0, m4                        ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
233                                                ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
234                                                ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
235                                                ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
236%endif ; mmx/sse2/ssse3/sse4
237%endif ; %3 == 4/8
238
239%else ; %3 == X, i.e. any filterSize scaling
240
    ; dlt = number of taps per pixel that are NOT handled by .innerloop but by
    ; the single shared 4-tap step after it (X4 variant only)
241%ifidn %4, X4
242%define dlt 4
243%else ; %4 == X || %4 == X8
244%define dlt 0
245%endif ; %4 ==/!= X4
    ; srcq    = running source pointer of the inner loop
    ; srcendq = value of srcq at which the inner loop stops
    ; x86-64 uses the extra registers r7-r9 for these. x86-32 aliases srcq to
    ; the source argument register and pos1q to the dst register, and keeps
    ; srcend in the r6m memory slot.
    ; NOTE(review): r6m is presumably the stack home of the 7th argument
    ; (fltsize), which is already in a register here - confirm against x86inc.
246%if ARCH_X86_64
247%define srcq    r8
248%define pos1q   r7
249%define srcendq r9
250    movsxd  fltsizeq, fltsized                  ; filterSize
251    lea      srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
252%else ; x86-32
253%define srcq    srcmemq
254%define pos1q   dstq
255%define srcendq r6m
256    lea        pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
257    mov      srcendq, pos0q
258%endif ; x86-32/64
    ; same end-pointer/negative-index scheme as the fixed-size paths; this path
    ; produces 2 output pixels per outer iteration
259    lea      fltposq, [fltposq+wq*4]
260%if %2 == 15
261    lea         dstq, [dstq+wq*2]
262%else ; %2 == 19
263    lea         dstq, [dstq+wq*4]
264%endif ; %2 == 15/19
    ; save the adjusted dst pointer: on x86-32 the dst register doubles as
    ; pos1q and is overwritten inside the loop
    ; NOTE(review): movifnidn presumably emits nothing when both operands name
    ; the same location, as on targets where dst lives in a register - confirm
265    movifnidn  dstmp, dstq
266    neg           wq
267
268.loop:
269    mov32      pos0q, dword [fltposq+wq*4+0]    ; filterPos[0]
270    mov32      pos1q, dword [fltposq+wq*4+4]    ; filterPos[1]
271    ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
    ; m4/m5 = accumulators for output pixels 0 and 1; srcq restarts at the
    ; beginning of the source line for every pixel pair
272    pxor          m4, m4
273    pxor          m5, m5
274    mov         srcq, srcmemmp
275
276.innerloop:
277    ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
    ; for X4 the second pixel is read dlt taps ahead: its first 4 taps are
    ; handled together with the first pixel's last 4 taps after the loop
278    movbh         m0, [srcq+ pos0q     *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
279    movbh         m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
280%if %1 == 8
281    punpcklbw     m0, m3
282    punpcklbw     m1, m3
283%endif ; %1 == 8
284
285    ; multiply
286%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
287             ; add back 0x8000 * sum(coeffs) after the horizontal add
288    psubw         m0, m6
289    psubw         m1, m6
290%endif ; %1 == 16
291    pmaddwd       m0, [filterq]                 ; filter[{0,1,2,3(,4,5,6,7)}]
292    pmaddwd       m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
293    paddd         m4, m0
294    paddd         m5, m1
    ; advance by one register's worth of coefficients (mmsize bytes) and the
    ; matching mmsize/2 source pixels
295    add      filterq, mmsize
296    add         srcq, srcmul*mmsize/2
297    cmp         srcq, srcendq                   ; while (src += 4) < &src[filterSize]
298    jl .innerloop
299
300%ifidn %4, X4
    ; here srcq is filterSize-4 pixels past the line start and filterq points at
    ; the last 4 coefficients of pixel 0, directly followed by the first 4
    ; coefficients of pixel 1. Subtracting filterSize from filterPos[1] and
    ; adding dlt back makes [srcq+(pos1q+dlt)*srcmul] address the first 4 source
    ; pixels of output pixel 1.
301    mov32      pos1q, dword [fltposq+wq*4+4]    ; filterPos[1]
302    movlh         m0, [srcq+ pos0q     *srcmul] ; split last 4 srcpx of dstpx[0]
303    sub        pos1q, fltsizeq                  ; and first 4 srcpx of dstpx[1]
304%if %1 > 8
305    movhps        m0, [srcq+(pos1q+dlt)*srcmul]
306%else ; %1 == 8
307    movd          m1, [srcq+(pos1q+dlt)*srcmul]
308    punpckldq     m0, m1
309%endif ; %1 == 8
310%if %1 == 8
311    punpcklbw     m0, m3
312%endif ; %1 == 8
313%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
314             ; add back 0x8000 * sum(coeffs) after the horizontal add
315    psubw         m0, m6
316%endif ; %1 == 16
317    pmaddwd       m0, [filterq]
318%endif ; %4 == X4
319
    ; skip the coefficients that were consumed through the second pixel's
    ; offset, so that filterq ends up exactly two filters past where this
    ; outer iteration started
320    lea      filterq, [filterq+(fltsizeq+dlt)*2]
321
    ; reduce the accumulators to one dword per output pixel in the low half of m0
322%if mmsize == 8 ; mmx
323    movq          m0, m4
324    punpckldq     m4, m5
325    punpckhdq     m0, m5
326    paddd         m0, m4
327%else ; mmsize == 16
    ; first fold 4 dwords per pixel down to 2 (pixel 0 in the low half of m4,
    ; pixel 1 in the high half) ...
328%if notcpuflag(ssse3) ; sse2
329    mova          m1, m4
330    punpcklqdq    m4, m5
331    punpckhqdq    m1, m5
332    paddd         m4, m1
333%else ; ssse3/sse4
334    phaddd        m4, m5
335%endif ; sse2/ssse3/sse4
    ; ... add the X4 tail products, which use the same low/high layout ...
336%ifidn %4, X4
337    paddd         m4, m0
338%endif ; %4 == X4
    ; ... then fold the remaining 2 dwords per pixel into 1
339%if notcpuflag(ssse3) ; sse2
340    pshufd        m4, m4, 11011000b
341    movhlps       m0, m4
342    paddd         m0, m4
343%else ; ssse3/sse4
344    phaddd        m4, m4
345    SWAP           0, 4
346%endif ; sse2/ssse3/sse4
347%endif ; mmsize == 8/16
348%endif ; %3 ==/!= X
349
350%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
351    paddd         m0, m7
352%endif ; %1 == 16
353
354    ; clip, store
    ; the sums carry 14 filter bits on top of the %1 source bits; shift down to
    ; the %2 bits of the intermediate format
355    psrad         m0, 14 + %1 - %2
    ; X path: restore the dst pointer saved before the loop (see pos1q aliasing)
356%ifidn %3, X
357    movifnidn   dstq, dstmp
358%endif ; %3 == X
359%if %2 == 15
    ; packssdw narrows to words with signed saturation, which is the clip here
360    packssdw      m0, m0
361%ifnidn %3, X
362    movh [dstq+wq*(2>>wshr)], m0
363%else ; %3 == X
364    movd [dstq+wq*2], m0
365%endif ; %3 ==/!= X
366%else ; %2 == 19
    ; clamp from above to the limit held in m2. Without a packed signed dword
    ; min (pre-sse4 xmm) this is done by converting to float and back.
367%if mmsize == 8
368    PMINSD_MMX    m0, m2, m4
369%elif cpuflag(sse4)
370    pminsd        m0, m2
371%else ; sse2/ssse3
372    cvtdq2ps      m0, m0
373    minps         m0, m2
374    cvtps2dq      m0, m0
375%endif ; mmx/sse2/ssse3/sse4
376%ifnidn %3, X
377    mova [dstq+wq*(4>>wshr)], m0
378%else ; %3 == X
379    movq [dstq+wq*4], m0
380%endif ; %3 ==/!= X
381%endif ; %2 == 15/19
    ; wq is negative and counts up; the loop ends when it reaches zero
382%ifnidn %3, X
383    add           wq, (mmsize<<wshr)/4          ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
384                                                ; per iteration. see "shl wq,1" above as for why we do this
385%else ; %3 == X
386    add           wq, 2
387%endif ; %3 ==/!= X
388    jl .loop
389    REP_RET
390%endmacro
391
392; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
;
; Instantiates every filter-size variant of SCALE_FUNC for one
; source_width/intermediate_nbits pair: the fixed 4-tap and 8-tap functions
; plus the run-time sized one(s). With mmx registers (mmsize == 8) a single
; "X" function is emitted; otherwise two are emitted, "X4" and "X8".
; The fixed-size variants are declared with 6 arguments, the X variants with
; 7 (the additional one being the filter size). n_xmm is passed through
; unchanged to every instantiation.
393%macro SCALE_FUNCS 3
394SCALE_FUNC %1, %2, 4, 4,  6, %3
395SCALE_FUNC %1, %2, 8, 8,  6, %3
396%if mmsize == 8
397SCALE_FUNC %1, %2, X, X,  7, %3
398%else
399SCALE_FUNC %1, %2, X, X4, 7, %3
400SCALE_FUNC %1, %2, X, X8, 7, %3
401%endif
402%endmacro
403
404; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
;
; Instantiates SCALE_FUNCS for every source width (8, 9, 10, 12, 14, 16) and
; both intermediate formats (15 and 19 bits) for the current instruction set.
; The three arguments are the xmm register counts to declare:
;   first  - for 8-bit input
;   second - for 9-, 10-, 12- and 14-bit input (despite the "9to10" name)
;   third  - for 16-bit input
; The 15-bit output functions are skipped for sse4. The only sse4-specific
; code in SCALE_FUNC is in the 19-bit clamp, so an sse4 15-bit build would
; presumably just duplicate the ssse3 one.
405%macro SCALE_FUNCS2 3
406%if notcpuflag(sse4)
407SCALE_FUNCS  8, 15, %1
408SCALE_FUNCS  9, 15, %2
409SCALE_FUNCS 10, 15, %2
410SCALE_FUNCS 12, 15, %2
411SCALE_FUNCS 14, 15, %2
412SCALE_FUNCS 16, 15, %3
413%endif ; !sse4
414SCALE_FUNCS  8, 19, %1
415SCALE_FUNCS  9, 19, %2
416SCALE_FUNCS 10, 19, %2
417SCALE_FUNCS 12, 19, %2
418SCALE_FUNCS 14, 19, %2
419SCALE_FUNCS 16, 19, %3
420%endmacro
421
422%if ARCH_X86_32
; mmx versions are only generated for 32-bit x86; no xmm registers are declared
423INIT_MMX mmx
424SCALE_FUNCS2 0, 0, 0
425%endif
; xmm register counts per source width (8-bit, 9..14-bit, 16-bit):
; 16-bit input needs 8 because m6/m7 hold minshort/unicoeff. sse2 needs 7 for
; 8-bit input because its 8-tap horizontal add uses m6 as scratch while m3
; holds zero.
426INIT_XMM sse2
427SCALE_FUNCS2 7, 6, 8
428INIT_XMM ssse3
429SCALE_FUNCS2 6, 6, 8
; for sse4 only the 19-bit output functions are emitted (see SCALE_FUNCS2)
430INIT_XMM sse4
431SCALE_FUNCS2 6, 6, 8
432